In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, mean_absolute_error, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor , GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
df = pd.read_csv('C:\\Users\\Arpit Kadam\\Desktop\\mlproject\\data\\stud.csv')

In [3]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
df['total_score'] = df['math_score'] + df['reading_score'] + df['writing_score']
df['average_score'] = df['total_score'] / 3
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [5]:
## Performing Label Encoding
categorical_columns =['gender','race_ethnicity','parental_level_of_education','lunch','test_preparation_course']
encoder = LabelEncoder()

for column in categorical_columns:
    df[column] = encoder.fit_transform(df[column])

df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average_score
0,0,1,1,1,1,72,72,74,218,72.666667
1,0,2,4,1,0,69,90,88,247,82.333333
2,0,1,3,1,1,90,95,93,278,92.666667
3,1,0,0,0,1,47,57,44,148,49.333333
4,1,2,4,1,1,76,78,75,229,76.333333


In [6]:
## Removing Outliers

def remove_outliers(df, columns):
    df_clean = df.copy()
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df_clean = df_clean[(df_clean[column] >= lower_bound) & (df_clean[column] <= upper_bound)]
    return df_clean

numerical_columns = ['math_score', 'reading_score', 'writing_score', 'total_score', 'average_score']
df_clean = remove_outliers(df, numerical_columns)
print("Before removing Outliers, Shape: ", df.shape)
print("After removing Outliers, Shape: ", df_clean.shape)


Before removing Outliers, Shape:  (1000, 10)
After removing Outliers, Shape:  (988, 10)


In [7]:
## Performing Standatdization

x = df_clean.drop(['total_score'], axis=1)
y = df_clean['total_score']

scaler = StandardScaler()
x = scaler.fit_transform(x)
print(x)
print(y)
print("X Shape: ", x.shape)
print("Y Shape: ", y.shape)

[[-0.96811926 -1.02167248 -0.80607157 ...  0.16840602  0.37424098
   0.32393971]
 [-0.96811926 -0.15670898  0.83485984 ...  1.45323338  1.33856684
   1.0374089 ]
 [-0.96811926 -1.02167248  0.2878827  ...  1.81012987  1.68296893
   1.80008287]
 ...
 [-0.96811926 -0.15670898 -0.25909443 ...  0.09702672 -0.24568279
  -0.24191516]
 [-0.96811926  0.70825453  0.83485984 ...  0.5966818   0.58088224
   0.44695164]
 [-0.96811926  0.70825453  0.83485984 ...  1.16771619  1.200806
   1.08661368]]
0      218
1      247
2      278
3      148
4      229
      ... 
995    282
996    172
997    195
998    223
999    249
Name: total_score, Length: 988, dtype: int64
X Shape:  (988, 9)
Y Shape:  (988,)


In [8]:
## Splitting the data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("X Train Shape: ", x_train.shape)
print("X Test Shape: ", x_test.shape)
print("Y Train Shape: ", y_train.shape)
print("Y Test Shape: ", y_test.shape)

X Train Shape:  (790, 9)
X Test Shape:  (198, 9)
Y Train Shape:  (790,)
Y Test Shape:  (198,)


In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

def perform_hyperparameter_tuning(X_train, y_train):
    models = {
        'LogisticRegression': {
            'model': LogisticRegression(),
            'params': {
                'C': [0.1, 1, 10],
                'solver': ['liblinear', 'lbfgs'],
                'max_iter': [100, 200]
            }
        },
        'Lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [0.1, 1.0, 10.0, 100.0]
            }
        },
        'Ridge': {
            'model': Ridge(),
            'params': {
                'alpha': [0.1, 1.0, 10.0, 100.0]
            }
        },
        'ElasticNet': {
            'model': ElasticNet(),
            'params': {
                'alpha'     : [0.1,1,10,0.01],
                'l1_ratio'  :  np.arange(0.40,1.00,0.10),
                'tol'       : [0.0001,0.001]
            }
        },
        'RandomForestRegressor': {
            'model': RandomForestRegressor(),
            'params': {
                'n_estimators': [50, 100],
                'max_depth': [None, 10],
                'min_samples_split': [2, 5]
            }
        },
        'AdaBoostRegressor': {
            'model': AdaBoostRegressor(),
            'params': {
                'n_estimators': [50, 100],
                'learning_rate': [0.01, 0.1, 1]
            }
        },
        'GradientBoostingRegressor': {
            'model': GradientBoostingRegressor(),
            'params': {
                'n_estimators': [50, 100],
                'learning_rate': [0.01, 0.1, 1],
                'max_depth': [3, 5, 7]
            }
        },
        'DecisionTreeRegressor': {
            'model': DecisionTreeRegressor(),
            'params': {
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }
        }
    }

    results = {}

    for model_name, config in models.items():
        print(f"Tuning {model_name}...")
        grid_search = GridSearchCV(config['model'], config['params'], cv=5, scoring='r2', n_jobs=1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(x_train)
        mae = mean_absolute_error(y_train, y_pred)
        mse = mean_squared_error(y_train, y_pred)
        results[model_name] = {
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'mse': mse,
            'mae': mae,
        }
        print(f"{model_name} Best Params: {grid_search.best_params_}")
        print(f"{model_name} Best Score: {grid_search.best_score_}\n")
        print(f"{model_name} Mean Absolute Error: {mae}")
        print(f"{model_name} Mean Squared Error: {mse}\n")

    return results


In [31]:
# Example: Call the function with your data
results = perform_hyperparameter_tuning(x_train, y_train)

# Print results
for model, info in results.items():
    print(f"{model}: Best Params: {info['best_params']}, Best Score: {info['best_score']}")


Tuning LogisticRegression...




LogisticRegression Best Params: {'C': 10, 'max_iter': 100, 'solver': 'lbfgs'}
LogisticRegression Best Score: 0.9257620143127696

LogisticRegression Mean Absolute Error: 5.262025316455696
LogisticRegression Mean Squared Error: 59.28227848101266

Tuning Lasso...
Lasso Best Params: {'alpha': 0.1}
Lasso Best Score: 0.9999934515877282

Lasso Mean Absolute Error: 0.08241568713622886
Lasso Mean Squared Error: 0.010397620146434866

Tuning Ridge...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Ridge Best Params: {'alpha': 0.1}
Ridge Best Score: 0.9999999974222027

Ridge Mean Absolute Error: 0.0012995054765310395
Ridge Mean Squared Error: 2.5910000610109936e-06

Tuning ElasticNet...
ElasticNet Best Params: {'alpha': 0.01, 'l1_ratio': 0.8999999999999999, 'tol': 0.0001}
ElasticNet Best Score: 0.9999997712402555

ElasticNet Mean Absolute Error: 0.015429741739596114
ElasticNet Mean Squared Error: 0.0003622816630901358

Tuning RandomForestRegressor...
RandomForestRegressor Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
RandomForestRegressor Best Score: 0.9998359245781924

RandomForestRegressor Mean Absolute Error: 0.05948101265822664
RandomForestRegressor Mean Squared Error: 0.03782367088607584

Tuning AdaBoostRegressor...
AdaBoostRegressor Best Params: {'learning_rate': 1, 'n_estimators': 100}
AdaBoostRegressor Best Score: 0.9966454404334651

AdaBoostRegressor Mean Absolute Error: 2.413149089827401
AdaBoostRegressor Mean Squared Error: 8.9915386187383

In [32]:
print(results)

{'LogisticRegression': {'best_params': {'C': 10, 'max_iter': 100, 'solver': 'lbfgs'}, 'best_score': 0.9257620143127696, 'mse': 59.28227848101266, 'mae': 5.262025316455696}, 'Lasso': {'best_params': {'alpha': 0.1}, 'best_score': 0.9999934515877282, 'mse': 0.010397620146434866, 'mae': 0.08241568713622886}, 'Ridge': {'best_params': {'alpha': 0.1}, 'best_score': 0.9999999974222027, 'mse': 2.5910000610109936e-06, 'mae': 0.0012995054765310395}, 'ElasticNet': {'best_params': {'alpha': 0.01, 'l1_ratio': 0.8999999999999999, 'tol': 0.0001}, 'best_score': 0.9999997712402555, 'mse': 0.0003622816630901358, 'mae': 0.015429741739596114}, 'RandomForestRegressor': {'best_params': {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}, 'best_score': 0.9998359245781924, 'mse': 0.03782367088607584, 'mae': 0.05948101265822664}, 'AdaBoostRegressor': {'best_params': {'learning_rate': 1, 'n_estimators': 100}, 'best_score': 0.9966454404334651, 'mse': 8.991538618738357, 'mae': 2.413149089827401}, 'Gradi

In [33]:
import pandas as pd

def create_results_dataframe(results):
    # Initialize an empty list to store rows
    rows = []
    
    for model_name, model_info in results.items():
        row = {
            "Model Name": model_name,
            "Best Params": model_info.get('best_params'),
            "R2 Score": model_info.get('best_score'),
            "MAE": model_info.get('mae'),
            "MSE": model_info.get('mse'),
        }
        rows.append(row)
    
    # Create a DataFrame from the rows
    df = pd.DataFrame(rows)
    return df

# Example usage:
# Assuming `results` is the dictionary containing your hyperparameter tuning results
# Add MAE and MSE calculations for each model during tuning before passing results to this function.

results_df = create_results_dataframe(results)

# Display the DataFrame
results_df



Unnamed: 0,Model Name,Best Params,R2 Score,MAE,MSE
0,LogisticRegression,"{'C': 10, 'max_iter': 100, 'solver': 'lbfgs'}",0.925762,5.262025,59.282278
1,Lasso,{'alpha': 0.1},0.999993,0.082416,0.010398
2,Ridge,{'alpha': 0.1},1.0,0.0013,3e-06
3,ElasticNet,"{'alpha': 0.01, 'l1_ratio': 0.8999999999999999...",1.0,0.01543,0.000362
4,RandomForestRegressor,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.999836,0.059481,0.037824
5,AdaBoostRegressor,"{'learning_rate': 1, 'n_estimators': 100}",0.996645,2.413149,8.991539
6,GradientBoostingRegressor,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.999919,0.16894,0.044403
7,DecisionTreeRegressor,"{'max_depth': None, 'min_samples_split': 5}",0.999803,0.056118,0.046097
