In [2]:
# 2. Regression:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from ngboost import NGBRegressor

# Load dataset
file_path = 'dataset.csv'
data = pd.read_csv(file_path)

# Remove categorical attributes and select a numerical target variable
numerical_data = data.select_dtypes(include=['float64', 'int64'])
target_variable = 'temp'

# Fill missing values with the most frequent value (mode)
for column in numerical_data.columns:
    mode_value = numerical_data[column].mode()[0]
    numerical_data[column] = numerical_data[column].fillna(mode_value)

X = numerical_data.drop(columns=[target_variable]).values
y = numerical_data[target_variable].values

# Train-test split
np.random.seed(42)
indices = np.random.permutation(len(X))
train_size = int(0.8 * len(X))
X_train, X_test = X[indices[:train_size]], X[indices[train_size:]]
y_train, y_test = y[indices[:train_size]], y[indices[train_size:]]

# Evaluation metrics
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

def mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

# Linear Regression Implementation
class LinearRegression:
    def __init__(self):
        self.weights = None

    def fit(self, X, y):
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        self.weights = np.linalg.pinv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)

    def predict(self, X):
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        return X_b.dot(self.weights)

# Decision Tree Implementation
class DecisionTree:
    def __init__(self, max_depth=5):
        self.tree = DecisionTreeRegressor(max_depth=max_depth)
        
    def fit(self, X, y):
        self.tree.fit(X, y)
        
    def predict(self, X):
        return self.tree.predict(X)

# Random Forest Implementation
class RandomForest:
    def __init__(self, n_estimators=100, max_depth=5):
        self.trees = [DecisionTree(max_depth) for _ in range(n_estimators)]
        
    def fit(self, X, y):
        for tree in self.trees:
            indices = np.random.choice(len(X), len(X), replace=True)
            tree.fit(X[indices], y[indices])
            
    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(predictions, axis=0)

# Extra Trees Implementation
class ExtraTrees(RandomForest):
    def __init__(self, n_estimators=100, max_depth=5):
        super().__init__(n_estimators, max_depth)
        
    def fit(self, X, y):
        for tree in self.trees:
            indices = np.random.choice(len(X), len(X), replace=True)
            # Add random feature selection and split point selection
            X_sample = X[indices] + np.random.normal(0, 0.01, X[indices].shape)
            tree.fit(X_sample, y[indices])

# Fixed AdaBoost Implementation
class CustomAdaBoost:
    def __init__(self, n_estimators=50):
        self.model = AdaBoostRegressor(
            DecisionTreeRegressor(max_depth=3),
            n_estimators=n_estimators,
            random_state=42
        )
        
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)

# NGBoost Implementation
class CustomNGBoost:
    def __init__(self):
        self.model = NGBRegressor(random_state=42)
        
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)

# XGBoost Implementation
class CustomXGBoost:
    def __init__(self):
        self.model = XGBRegressor(
            objective='reg:squarederror',
            random_state=42
        )
        
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)

# Instantiate models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForest(n_estimators=100),
    "Extra Trees": ExtraTrees(n_estimators=100),
    "NGBoost": CustomNGBoost(),
    "XGBoost": CustomXGBoost(),
    "AdaBoost": CustomAdaBoost(n_estimators=50)
}

# Train and evaluate models
results = {}
print("\nModel Evaluation Results:")
print("-" * 50)
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    model_rmse = rmse(y_test, predictions)
    model_mae = mae(y_test, predictions)
    
    results[name] = {
        "RMSE": model_rmse,
        "MAE": model_mae
    }
    
    print(f"{name}:")
    print(f"RMSE: {model_rmse:.4f}")
    print(f"MAE: {model_mae:.4f}")

# Sort models by RMSE performance
sorted_results = dict(sorted(results.items(), key=lambda x: x[1]["RMSE"]))

print("\nFinal Rankings (sorted by RMSE):")
print("-" * 50)
for model_name, metrics in sorted_results.items():
    print(f"{model_name:20} - RMSE: {metrics['RMSE']:.4f}, MAE: {metrics['MAE']:.4f}")


Model Evaluation Results:
--------------------------------------------------

Training Linear Regression...
Linear Regression:
RMSE: 0.2463
MAE: 0.1888

Training Random Forest...
Random Forest:
RMSE: 0.5041
MAE: 0.3440

Training Extra Trees...
Extra Trees:
RMSE: 0.4957
MAE: 0.3398

Training NGBoost...
[iter 0] loss=3.6240 val_loss=0.0000 scale=1.0000 norm=7.8965
[iter 100] loss=2.4592 val_loss=0.0000 scale=2.0000 norm=2.5561
[iter 200] loss=1.4763 val_loss=0.0000 scale=2.0000 norm=1.1548
[iter 300] loss=0.5540 val_loss=0.0000 scale=2.0000 norm=1.0202
[iter 400] loss=-0.1393 val_loss=0.0000 scale=2.0000 norm=0.9171
NGBoost:
RMSE: 0.2312
MAE: 0.1633

Training XGBoost...
XGBoost:
RMSE: 0.2619
MAE: 0.1833

Training AdaBoost...
AdaBoost:
RMSE: 0.5796
MAE: 0.4328

Final Rankings (sorted by RMSE):
--------------------------------------------------
NGBoost              - RMSE: 0.2312, MAE: 0.1633
Linear Regression    - RMSE: 0.2463, MAE: 0.1888
XGBoost              - RMSE: 0.2619, MAE: 0.1833