In [2]:
# Define which parameter to predict.
# The number typed here is used further down as a negative column offset
# (data.iloc[:, -user_input]). Only 1..5 are meaningful: 0 would select the
# first column, and negative values or values above 5 would either select one
# of the feature columns as the target or fall outside the table.
user_input = int(input("Enter a number(RH:5, GSP:4, SH:3, MAT:2, ENTHAL:1):"))
if user_input not in range(1, 6):
    raise ValueError(
        f"Invalid choice {user_input}: expected an integer from 1 to 5 "
        "(RH:5, GSP:4, SH:3, MAT:2, ENTHAL:1)."
    )

Enter a nember(RH:5, GSP:4, SH:3, MAT:2, ENTHAL:1):2


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.exceptions import FitFailedWarning
from sklearn.metrics import r2_score
import warnings


data = pd.read_csv('data.csv')

# Features: every column except the trailing five.
# Target: the single column sitting `user_input` positions from the right edge.
X, y = data.iloc[:, :-5].values, data.iloc[:, -user_input].values

# Keep 20% of the rows aside for the final evaluation; the fixed seed makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


# Candidate regressors, each paired with the display name that is used to look
# up its hyperparameter grid and to label its scores.
# The forest and the tree use random numbers while fitting, so they are given
# a fixed seed; without it their scores differ from run to run even though the
# data split and the CV folds are seeded.
models = [
    ('Random Forest', RandomForestRegressor(random_state=0)),
    ('SVR', SVR()),
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=0)),
]


# Hyperparameter search space per model, keyed by the display name in `models`.
param_grids = {
    'SVR': {
        'C': [0.1, 0.8, 2, 10],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    },
    'Random Forest': {
        'n_estimators': [50, 150, 100, 200],
        'max_depth': [None, 10, 18, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5],
    },
    'Linear Regression': {
        'fit_intercept': [True, False],
    },
    'Decision Tree': {
        'max_depth': [None, 10, 20, 30, 50, 100],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 7],
    },
}

# The splitters are the same for every model, so they are built once.
# inner_cv drives the hyperparameter search; outer_cv scores the whole
# "search + refit" procedure on folds the search never saw.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = {}       # model name -> per-fold R-squared from the outer CV loop
final_scores = {}  # model name -> R-squared of the tuned model on the test split
for name, model in models:
    # Fail with a message that names the model instead of handing
    # param_grid=None to GridSearchCV.
    if name not in param_grids:
        raise KeyError(f"No hyperparameter grid defined for model '{name}'")
    param_grid = param_grids[name]

    grid_search = GridSearchCV(model, param_grid=param_grid, cv=inner_cv, scoring='r2', n_jobs=-1)

    # Nested CV: the grid search itself is the estimator being cross-validated.
    cv_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv, scoring='r2')

    # Tune once more on the full training split, then score the refitted best
    # estimator on the held-out test split.
    best_model = grid_search.fit(X_train, y_train).best_estimator_
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    results[name] = cv_scores
    final_scores[name] = r2


# Per-model summary of the outer-loop scores: mean and two standard deviations.
print("Nested Cross-Validation Results:")
for name, scores in results.items():
    print(f"{name} CV R-squared: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Score of each tuned model on the held-out test split.
print("\nFinal Model Evaluation:")
for name, r2 in final_scores.items():
    print(f"{name} Test R-squared: {r2:.4f}")

# Choose the winner by mean nested-CV score rather than by test score.
# Picking the model with the highest test R-squared would use the test split
# for model selection, making the reported test score optimistically biased.
# The test score is still what gets printed for the chosen model.
# `best_model` holds the winning model's name (a string) from here on.
best_model = max(results, key=lambda model_name: results[model_name].mean())
print("\nBest Model:", best_model, final_scores[best_model])


Nested Cross-Validation Results:
Random Forest CV R-squared: 0.8908 (+/- 0.0302)
SVR CV R-squared: 0.9056 (+/- 0.0401)
Linear Regression CV R-squared: 0.8532 (+/- 0.0528)
Decision Tree CV R-squared: 0.7845 (+/- 0.0661)

Final Model Evaluation:
Random Forest Test R-squared: 0.9199
SVR Test R-squared: 0.9373
Linear Regression Test R-squared: 0.9311
Decision Tree Test R-squared: 0.7096

Best Model: SVR 0.937272606546334
