In [None]:
# Define which parameter to predict.
# The number entered here is used further down as a negative column index
# (data.iloc[:, -user_input]) into the last five columns of the dataset, so it
# must be an integer from 1 to 5. Without the check below, 0 would silently
# pick the FIRST column (-0 == 0) and values above 5 would pick a feature
# column as the target.
user_input = int(input("Enter a nember(RH:5, GSP:4, SH:3, MAT:2, ENTHAL:1):"))
if not 1 <= user_input <= 5:
    raise ValueError(f"Expected a number from 1 to 5, got {user_input}")

Enter a nember(RH:5, GSP:4, SH:3, MAT:2, ENTHAL:1):1


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from scipy import stats
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import FitFailedWarning
from sklearn.metrics import r2_score

# Import Data
data = pd.read_csv('data.csv')

# Everything except the last five columns is a feature; the target is one of
# those last five columns, picked by counting back `user_input` from the end.
features = data.iloc[:, :-5]
target = data.iloc[:, -user_input]
X, y = features.values, target.values

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Algorithm Selection
# Candidate regressors, as (display name, estimator) pairs.
# Random Forest and Decision Tree are randomized learners: they are seeded
# with the same value used for the train/test split so that the baseline
# scores are reproducible from one run to the next.
models = [
    ('Random Forest', RandomForestRegressor(random_state=0)),
    ('SVR', SVR()),
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=0))
]

# Cross-Validation and Model Evaluation
# Map each model name to its array of 5-fold R-squared scores, computed on the
# training split only.
results = {
    label: cross_val_score(estimator, X_train, y_train, cv=5, scoring='r2')
    for label, estimator in models
}
# Final Selection
# Fit every candidate on the training split and record its R-squared on the
# held-out test split.
final_scores = {}
for name, model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    final_scores[name] = r2_score(y_test, y_pred)

# Report the per-model scores, then the name and score of the top performer.
print("Perfomance Before HPO")
for name, r2 in final_scores.items():
    print(f"{name} R-squared: {r2}")
best_model = max(final_scores, key=final_scores.get)
print("Best Model:", best_model, final_scores[best_model])

# Hyperparameter Optimization
# Suppress FitFailedWarning for the rest of the session so that failing
# parameter combinations do not flood the cell output.
warnings.filterwarnings("ignore", category=FitFailedWarning)

# Search over the regularization strength C and the kernel type.
svr_param_grid = dict(
    C=[0.1, 0.8, 2, 10],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
)

svr_optimized = GridSearchCV(
    SVR(),
    param_grid=svr_param_grid,
    cv=5,
    scoring='r2',
).fit(X_train, y_train)

# Score the best configuration found by the search on the held-out test split.
best_svr_model = svr_optimized.best_estimator_
svr_pred = best_svr_model.predict(X_test)
svr_r2 = r2_score(y_test, svr_pred)

# Hyperparameter Optimization for Random Forest
# Grid over forest size, tree depth and the minimum sample counts required to
# split a node / form a leaf (4 * 4 * 3 * 3 = 144 combinations).
rf_param_grid = {
    'n_estimators': [50, 150, 100, 200],
    'max_depth': [None, 10, 18, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# random_state=0 (same seed as the train/test split) makes the search and the
# reported score reproducible; without it every run trains different forests.
# n_jobs=-1 spreads the 144 * 5 = 720 fits over all CPU cores; it does not
# change the results, only the wall-clock time.
rf_optimized = GridSearchCV(
    RandomForestRegressor(random_state=0),
    param_grid=rf_param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
rf_optimized.fit(X_train, y_train)

# Score the best configuration found by the search on the held-out test split.
best_rf_model = rf_optimized.best_estimator_
rf_pred = best_rf_model.predict(X_test)
rf_r2 = r2_score(y_test, rf_pred)

# Hyperparameter Optimization for Linear Regression
# The only setting searched is whether the model fits an intercept term.
lr_param_grid = dict(fit_intercept=[True, False])

lr_optimized = GridSearchCV(
    LinearRegression(),
    param_grid=lr_param_grid,
    cv=5,
    scoring='r2',
).fit(X_train, y_train)

# Score the best configuration found by the search on the held-out test split.
best_lr_model = lr_optimized.best_estimator_
lr_pred = best_lr_model.predict(X_test)
lr_r2 = r2_score(y_test, lr_pred)

# Hyperparameter Optimization for Decision Tree
# Grid over tree depth and the minimum sample counts required to split a node
# / form a leaf.
dt_param_grid = {
    'max_depth': [None, 10, 20, 30, 50, 100],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 7]
}

# random_state=0 (same seed as the train/test split) pins the tree's internal
# randomness, so the selected hyperparameters and the reported R-squared are
# reproducible instead of changing from run to run.
dt_optimized = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid=dt_param_grid,
    cv=5,
    scoring='r2',
)
dt_optimized.fit(X_train, y_train)

# Score the best configuration found by the search on the held-out test split.
best_dt_model = dt_optimized.best_estimator_
dt_pred = best_dt_model.predict(X_test)
dt_r2 = r2_score(y_test, dt_pred)

# Final Selection after HPO
# Collect the test-split R-squared of each tuned model under its display name.
final_scores = dict(
    zip(
        ('Random Forest', 'SVR', 'Linear Regression', 'Decision Tree'),
        (rf_r2, svr_r2, lr_r2, dt_r2),
    )
)

# Report the per-model scores, then the name and score of the top performer.
print("Perfomance After HPO")
for name, r2 in final_scores.items():
    print(f"{name} R-squared: {r2}")

best_model = max(final_scores, key=final_scores.get)
print("Best Model:", best_model, final_scores[best_model])


Perfomance Before HPO
Random Forest R-squared: 0.8197090565855836
SVR R-squared: 0.8769946021259977
Linear Regression R-squared: 0.8660118231585157
Decision Tree R-squared: 0.6988582200657647
Best Model: SVR 0.8769946021259977
Perfomance After HPO
Random Forest R-squared: 0.8215238890035359
SVR R-squared: 0.8982544985260306
Linear Regression R-squared: 0.8660118231585157
Decision Tree R-squared: 0.5498436434490469
Best Model: SVR 0.8982544985260306
