# Random Forest

### X1

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Step 1: Load the data
# Features are kept as DataFrames; targets are flattened so y is a 1-D array.
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv").values.ravel()  # Flatten to 1D
y_test = pd.read_csv("y_test.csv").values.ravel()

# Step 2: Define the RandomForestRegressor and grid of hyperparameters
rf = RandomForestRegressor(random_state=42)  # fixed seed so reruns build the same forest

# 2 * 3 * 2 * 2 = 24 candidates. This is the same grid the PCA-feature run uses,
# so the two Random Forest results are tuned over an identical search space.
# The recorded output of this cell ("24 candidates", best params containing
# min_samples_split / min_samples_leaf) was produced with these four keys.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Step 3: Set up GridSearchCV
# 5-fold cross-validation on the training set, candidates ranked by R²;
# n_jobs=-1 runs the fits in parallel, verbose=2 logs each fit.
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=5,
                           scoring='r2',
                           n_jobs=-1,
                           verbose=2)

# Step 4: Fit the model
grid_search.fit(X_train, y_train)

# Step 5: Make predictions
# best_estimator_ is the winning configuration refit on the full training set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Step 6: Evaluate
# Metrics below are computed on the held-out test set, not on the CV folds.
print("Best Parameters:", grid_search.best_params_)
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, y_pred)))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
R² Score: 0.49366471534241674
Mean Absolute Error: 4.5987917799727995
Root Mean Squared Error: 5.896726141111351


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# --- Load data: features from the *_pca.csv files, targets from y_*.csv ---
y_train = pd.read_csv("y_train.csv").to_numpy().ravel()  # 1-D target vector
y_test = pd.read_csv("y_test.csv").to_numpy().ravel()
X_train = pd.read_csv("X_train_pca.csv")
X_test = pd.read_csv("X_test_pca.csv")

# --- Hyperparameter search space (2 * 3 * 2 * 2 = 24 combinations) ---
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# --- Base estimator and 5-fold grid search scored by R² ---
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,   # run fits in parallel
    verbose=2,   # log every individual fit
)

# --- Run the search on the training split ---
grid_search.fit(X_train, y_train)

# --- Predict on the test split with the refit best configuration ---
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# --- Report the chosen parameters and test-set metrics ---
print("Best Parameters:", grid_search.best_params_)
print("R² Score:", r2_score(y_true=y_test, y_pred=y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print(
    "Root Mean Squared Error:",
    np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)),
)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
R² Score: 0.5130904407066471
Mean Absolute Error: 4.43789792785661
Root Mean Squared Error: 5.782504941428576


In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# --- Load data: features from X_*.csv, targets flattened to 1-D ---
y_train = pd.read_csv("y_train.csv").to_numpy().ravel()
y_test = pd.read_csv("y_test.csv").to_numpy().ravel()
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")

# --- Search space: 2 * 2 * 3 * 2 * 2 = 48 combinations ---
gbr_param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# --- Seeded gradient boosting model wrapped in a 5-fold, R²-scored grid search ---
gbr = GradientBoostingRegressor(random_state=42)
gbr_grid = GridSearchCV(
    gbr,
    gbr_param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# --- Fit the search, then keep the refit best configuration ---
gbr_grid.fit(X_train, y_train)
gbr_best = gbr_grid.best_estimator_

# --- Test-set predictions and metrics ---
y_pred_gbr = gbr_best.predict(X_test)
print("Gradient Boosting Regressor:")
print("Best Parameters:", gbr_grid.best_params_)
print("R² Score:", r2_score(y_true=y_test, y_pred=y_pred_gbr))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_gbr))
print("RMSE:", np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_gbr)))


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Gradient Boosting Regressor:
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
R² Score: 0.5397945457012445
MAE: 4.473165897497664
RMSE: 5.621700996141069


In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# --- Load data: features from the *_pca.csv files, targets flattened to 1-D ---
y_train = pd.read_csv("y_train.csv").to_numpy().ravel()
y_test = pd.read_csv("y_test.csv").to_numpy().ravel()
X_train = pd.read_csv("X_train_pca.csv")
X_test = pd.read_csv("X_test_pca.csv")

# --- Search space: 2 * 2 * 3 * 2 * 2 = 48 combinations ---
gbr_param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# --- Seeded gradient boosting model wrapped in a 5-fold, R²-scored grid search ---
gbr = GradientBoostingRegressor(random_state=42)
gbr_grid = GridSearchCV(
    gbr,
    gbr_param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# --- Fit the search, then keep the refit best configuration ---
gbr_grid.fit(X_train, y_train)
gbr_best = gbr_grid.best_estimator_

# --- Test-set predictions and metrics ---
y_pred_gbr = gbr_best.predict(X_test)
print("Gradient Boosting Regressor:")
print("Best Parameters:", gbr_grid.best_params_)
print("R² Score:", r2_score(y_true=y_test, y_pred=y_pred_gbr))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_gbr))
print("RMSE:", np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_gbr)))


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Gradient Boosting Regressor:
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
R² Score: 0.5165450015270603
MAE: 4.490186398062904
RMSE: 5.761955362250722


In [9]:
from sklearn.neighbors import KNeighborsRegressor
# --- Load data: features from X_*.csv, targets flattened to 1-D ---
y_train = pd.read_csv("y_train.csv").to_numpy().ravel()
y_test = pd.read_csv("y_test.csv").to_numpy().ravel()
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")

# --- Search space: 4 * 2 * 2 = 16 combinations ---
knn_param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# --- Default KNN regressor tuned with a 5-fold, R²-scored grid search ---
knn = KNeighborsRegressor()
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=knn_param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
)
knn_grid.fit(X_train, y_train)

# --- Evaluate the refit best configuration on the test split ---
knn_best = knn_grid.best_estimator_
knn_pred = knn_best.predict(X_test)

print("\n--- KNN Regressor ---")
print("Best Parameters:", knn_grid.best_params_)
print("R² Score:", r2_score(y_true=y_test, y_pred=knn_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=knn_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_true=y_test, y_pred=knn_pred)))


Fitting 5 folds for each of 16 candidates, totalling 80 fits

--- KNN Regressor ---
Best Parameters: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
R² Score: 0.6146933335309965
MAE: 4.04291041904129
RMSE: 5.143930872848712


In [7]:
# --- Load data: features from the *_pca.csv files, targets flattened to 1-D ---
y_train = pd.read_csv("y_train.csv").to_numpy().ravel()
y_test = pd.read_csv("y_test.csv").to_numpy().ravel()
X_train = pd.read_csv("X_train_pca.csv")
X_test = pd.read_csv("X_test_pca.csv")

# --- Search space: 4 * 2 * 2 = 16 combinations ---
knn_param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# --- Default KNN regressor tuned with a 5-fold, R²-scored grid search ---
knn = KNeighborsRegressor()
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=knn_param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
)
knn_grid.fit(X_train, y_train)

# --- Evaluate the refit best configuration on the test split ---
knn_best = knn_grid.best_estimator_
knn_pred = knn_best.predict(X_test)

print("\n--- KNN Regressor ---")
print("Best Parameters:", knn_grid.best_params_)
print("R² Score:", r2_score(y_true=y_test, y_pred=knn_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=knn_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_true=y_test, y_pred=knn_pred)))


Fitting 5 folds for each of 16 candidates, totalling 80 fits

--- KNN Regressor ---
Best Parameters: {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
R² Score: 0.4994432263504104
MAE: 4.605770177863834
RMSE: 5.862981630079132
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END learning_rate=0.05, max_depth=3, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   1.8s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   2.7s
[CV] END learning_rate=0.05, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.9s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=2, min_samples_split=5, n_estim