In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
#

In [5]:
# Load the dataset into `df`.
# NOTE(review): the path is a hard-coded absolute location; consider a
# configurable data directory so the notebook runs outside this environment.
df=pd.read_csv('/content/stud.csv')

In [6]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# **Exploring Types of Features**

In [7]:
# List every column together with the distinct values it holds, to see which
# columns are categorical and which are numeric.
for column_name, column_values in df.items():
    print(column_name)
    print(column_values.unique())

gender
['female' 'male']
race_ethnicity
['group B' 'group C' 'group A' 'group D' 'group E']
parental_level_of_education
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
lunch
['standard' 'free/reduced']
test_preparation_course
['none' 'completed']
math_score
[ 72  69  90  47  76  71  88  40  64  38  58  65  78  50  18  46  54  66
  44  74  73  67  70  62  63  56  97  81  75  57  55  53  59  82  77  33
  52   0  79  39  45  60  61  41  49  30  80  42  27  43  68  85  98  87
  51  99  84  91  83  89  22 100  96  94  48  35  34  86  92  37  28  24
  26  95  36  29  32  93  19  23   8]
reading_score
[ 72  90  95  57  78  83  43  64  60  54  52  81  53  75  89  32  42  58
  69  73  71  74  70  65  87  56  61  84  55  44  41  85  59  17  39  80
  37  63  51  49  26  68  45  47  86  34  79  66  67  91 100  76  77  82
  92  93  62  88  50  28  48  46  23  38  94  97  99  31  96  24  29  40]
writing_score
[ 74  88  93  44  75  78  92 

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [9]:
df.describe()

Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [10]:
df.isnull().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

# **Observations:**
1. **`lunch`, `test_preparation_course`, and `gender` each have only two categories, so they can be label-encoded as binary features.**
2. **`parental_level_of_education` deserves a closer look: some of its classes may be worth merging.**


# **Feature Extraction**

In [11]:
# Collapse the three subject scores into a total and a mean, then discard the
# per-subject columns so only the two aggregate scores remain in `df`.
score_columns = ['math_score', 'reading_score', 'writing_score']
total_score = df['math_score'] + df['reading_score'] + df['writing_score']
df = (
    df
    .assign(Total_Score=total_score, Average_Score=total_score / 3)
    .drop(columns=score_columns)
)

# **Train Test Split**

In [12]:
# Features: every column except the two derived score columns.
# Target: the average score.
derived_score_columns = ['Average_Score', 'Total_Score']
X = df.drop(columns=derived_score_columns)
y = df['Average_Score']

In [13]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course
0,female,group B,bachelor's degree,standard,none
1,female,group C,some college,standard,completed
2,female,group B,master's degree,standard,none
3,male,group A,associate's degree,free/reduced,none
4,male,group C,some college,standard,none


In [14]:
y.head()

0    72.666667
1    82.333333
2    92.666667
3    49.333333
4    76.333333
Name: Average_Score, dtype: float64

# Categorical Features and Numerical Features


In [15]:
# Partition the feature names by dtype: object columns are treated as
# categorical, everything else as numerical.
cat_features = X.select_dtypes(include=['object']).columns
num_features = X.select_dtypes(exclude=['object']).columns

In [16]:
# Show each categorical feature followed by its distinct values.
for feature in cat_features:
    print(feature, X[feature].unique(), sep='\n')

gender
['female' 'male']
race_ethnicity
['group B' 'group C' 'group A' 'group D' 'group E']
parental_level_of_education
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
lunch
['standard' 'free/reduced']
test_preparation_course
['none' 'completed']


In [17]:
# Same listing for the numeric features. In the recorded run this cell printed
# nothing, i.e. `num_features` was empty at this point.
for feature in num_features:
    print(feature, X[feature].unique(), sep='\n')

# **Label Encoding**

In [18]:
# Encoders shared by the cells below: `label` for the two-category columns,
# `onehot` for the multi-category ones (dense output so it can be wrapped in a
# DataFrame directly).
label = LabelEncoder()
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`;
    # the old `sparse` keyword was deprecated in 1.2 and removed in 1.4.
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the legacy keyword.
    onehot = OneHotEncoder(sparse=False)

In [19]:
# Integer-encode the three two-category columns. The single `label` encoder is
# refit for each column in turn, so afterwards it only reflects the last
# column processed ('lunch').
for binary_column in ('gender', 'test_preparation_course', 'lunch'):
    X[binary_column] = label.fit_transform(X[binary_column])

In [20]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course
0,0,group B,bachelor's degree,1,1
1,0,group C,some college,1,0
2,0,group B,master's degree,1,1
3,1,group A,associate's degree,0,1
4,1,group C,some college,1,1


In [21]:
X['race_ethnicity'].unique()

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [22]:
# One-hot encode race_ethnicity. The double brackets pass a one-column
# DataFrame (2-D input) rather than a Series to the encoder.
onehot_encoded=onehot.fit_transform(X[['race_ethnicity']])

In [23]:
# Wrap the encoded array in a DataFrame with one named column per category.
# Reusing X's index keeps the rows aligned when this frame is concatenated
# with X, even if X does not carry a default 0..n-1 index.
race_ethnicity_updated = pd.DataFrame(
    onehot_encoded,
    columns=onehot.get_feature_names_out(['race_ethnicity']),
    index=X.index,
)

In [24]:
# Swap the raw race_ethnicity column for its one-hot indicator columns.
X_without_race = X.drop(columns=['race_ethnicity'])
X = pd.concat([X_without_race, race_ethnicity_updated], axis='columns')

In [25]:
X.head()

Unnamed: 0,gender,parental_level_of_education,lunch,test_preparation_course,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E
0,0,bachelor's degree,1,1,0.0,1.0,0.0,0.0,0.0
1,0,some college,1,0,0.0,0.0,1.0,0.0,0.0
2,0,master's degree,1,1,0.0,1.0,0.0,0.0,0.0
3,1,associate's degree,0,1,1.0,0.0,0.0,0.0,0.0
4,1,some college,1,1,0.0,0.0,1.0,0.0,0.0


In [26]:
X['parental_level_of_education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

# Merging similar education levels
'some high school' ---> 'high school'

'some college' ---> "associate's degree"

In [27]:
# Merge the "some ..." education levels into their closest full category.
# A single vectorised replace does the whole column at once; the previous
# row-by-row loop re-ran a full-column replace for every matching row while
# reassigning the very column it was iterating over.
education_merges = {
    'some high school': 'high school',
    'some college': "associate's degree",
}
X['parental_level_of_education'] = X['parental_level_of_education'].replace(education_merges)

In [28]:
X.head()

Unnamed: 0,gender,parental_level_of_education,lunch,test_preparation_course,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E
0,0,bachelor's degree,1,1,0.0,1.0,0.0,0.0,0.0
1,0,associate's degree,1,0,0.0,0.0,1.0,0.0,0.0
2,0,master's degree,1,1,0.0,1.0,0.0,0.0,0.0
3,1,associate's degree,0,1,1.0,0.0,0.0,0.0,0.0
4,1,associate's degree,1,1,0.0,0.0,1.0,0.0,0.0


In [29]:
# One-hot encode the (already merged) parental education levels. This refits
# the shared `onehot` encoder and overwrites the earlier `onehot_encoded` array.
onehot_encoded=onehot.fit_transform(X[['parental_level_of_education']])

In [30]:
# Wrap the encoded array in a DataFrame with one named column per education
# level. Reusing X's index keeps the rows aligned when this frame is
# concatenated with X, even if X does not carry a default 0..n-1 index.
parental = pd.DataFrame(
    onehot_encoded,
    columns=onehot.get_feature_names_out(['parental_level_of_education']),
    index=X.index,
)

In [31]:
# Swap the raw parental_level_of_education column for its one-hot columns.
X_without_parental = X.drop(columns=['parental_level_of_education'])
X = pd.concat([X_without_parental, parental], axis='columns')

In [32]:
X.head()

Unnamed: 0,gender,lunch,test_preparation_course,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E,parental_level_of_education_associate's degree,parental_level_of_education_bachelor's degree,parental_level_of_education_high school,parental_level_of_education_master's degree
0,0,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,1,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,1,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
# Hold out 20% of the rows as a test set; the fixed random_state pins the
# shuffling seed.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


# **HyperParameter Tuning**

In [60]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error,r2_score




In [61]:
pip install xgboost



In [65]:
from xgboost import XGBRegressor as xgb

In [47]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

# **Standard Scaler**

In [48]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [53]:
# Fit the scaler on the training features.
# NOTE(review): in the cells visible here the fitted scaler is never used to
# transform X_train / X_test, so the models below appear to train on unscaled
# features — confirm whether scaling was intended.
scaler.fit(X_train,y_train)

In [78]:
## Create a function to evaluate a model
def evaluate_model(true, predicted):
    """Score regression predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions for the same rows as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.

    Notes
    -----
    ``mean_absolute_error``, ``mean_squared_error`` and ``r2_score`` are looked
    up when the function is called, so they must have been imported from
    ``sklearn.metrics`` before the first call.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it, instead of calling
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [67]:
# Baseline candidates with default hyper-parameters, keyed by display name.
models = dict(
    LinearRegression=LinearRegression(),
    KNeighborsRegressor=KNeighborsRegressor(),
    DecisionTreeRegressor=DecisionTreeRegressor(),
    RandomForestRegressor=RandomForestRegressor(),
    SVR=SVR(),
    Lasso=Lasso(),
    Ridge=Ridge(),
    GradientBoostingRegressor=GradientBoostingRegressor(),
    AdaBoostRegressor=AdaBoostRegressor(),
    BaggingRegressor=BaggingRegressor(),
    ExtraTreesRegressor=ExtraTreesRegressor(),
    HistGradientBoostingRegressor=HistGradientBoostingRegressor(),
    xgbRegressor=xgb(),
)

In [74]:
# Hyper-parameter search spaces, one per estimator. Each maps a constructor
# argument name to the list of candidate values to try.

# Boosting / ensemble models
ada_params = dict(n_estimators=[50, 100, 200], learning_rate=[0.01, 0.1, 1])
grad_params = dict(n_estimators=[100, 200, 300], max_depth=[3, 4, 5])
xgboost_params = dict(n_estimators=[100, 200, 300], learning_rate=[0.01, 0.1, 0.2])
rf_params = dict(n_estimators=[100, 200, 300], max_depth=[5, 10, 15])

# Instance- and kernel-based models
knn_params = dict(n_neighbors=[3, 5, 7], weights=['uniform', 'distance'])
svr_params = dict(C=[0.1, 1, 10], kernel=['linear', 'rbf'])

# Linear models (plain LinearRegression has nothing to tune in this context)
lr_params = dict()
lasso_params = dict(alpha=[0.01, 0.1, 1])
ridge_params = dict(alpha=[0.01, 0.1, 1])

# Tree-based models
dt_params = dict(max_depth=[3, 5, 7], min_samples_split=[2, 5, 10])
bag_params = dict(n_estimators=[10, 20, 30], max_samples=[0.5, 0.7, 1.0])
ext_params = dict(n_estimators=[100, 200, 300], max_depth=[None, 5, 10])
hist_params = dict(max_iter=[100, 200, 300], learning_rate=[0.01, 0.1, 0.2])

In [75]:
# Search plan: one (short name, fresh estimator, parameter grid) triple per
# model. The short name becomes the key under which the tuning loop stores
# that model's best parameters.
randomcv_models = [
    ("ADA", AdaBoostRegressor(), ada_params),
    ("GRAD", GradientBoostingRegressor(), grad_params),
    ("XGBoost", xgb(), xgboost_params),
    ("RF", RandomForestRegressor(), rf_params),
    ("KNN", KNeighborsRegressor(), knn_params),
    ("svr", SVR(), svr_params),
    ("lr", LinearRegression(), lr_params),
    ("lasso", Lasso(), lasso_params),
    ("ridge", Ridge(), ridge_params),
    ("dt", DecisionTreeRegressor(), dt_params),
    ("bag", BaggingRegressor(), bag_params),
    ("ext", ExtraTreesRegressor(), ext_params),
    ("hist", HistGradientBoostingRegressor(), hist_params)
]

In [76]:
# Tune every (name, estimator, grid) entry of `randomcv_models` with a
# randomized search and remember the winning parameters per model name.
# Imported here because none of the earlier cells shown brings
# RandomizedSearchCV into scope (only GridSearchCV is imported), so a fresh
# kernel would otherwise fail with NameError.
from sklearn.model_selection import RandomizedSearchCV

# Dictionary to store the best parameters for each model
model_param = {}
for name, model, params in randomcv_models:
    # n_iter=100 exceeds the size of every grid defined for these models, so
    # each grid is in effect searched exhaustively (see the "N candidates"
    # lines in the log). random_state pins the candidate sampling; the
    # estimators themselves remain unseeded.
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1,
                                random_state=42)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

# Print the best parameters for each model
for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)


Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
---------------- Best Params for ADA -------------------
{'n_estimators': 100, 'learning_rate': 0.1}
---------------- Best Params for GRAD -------------------
{'n_estimators': 100, 'max_depth': 3}
---------------- Best Para

In [80]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def _print_metrics(heading, rmse, mae, r2):
    """Print one block of RMSE / MAE / R2 lines under the given heading."""
    print(heading)
    print("- Root Mean Squared Error: {:.4f}".format(rmse))
    print("- Mean Absolute Error: {:.4f}".format(mae))
    print("- R2 Score: {:.4f}".format(r2))


## Retraining the models with best parameters
models = {
    "LinearRegression": LinearRegression(),
    "KNeighborsRegressor": KNeighborsRegressor(weights='uniform', n_neighbors=7),
    "DecisionTreeRegressor": DecisionTreeRegressor(min_samples_split=2, max_depth=3),
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, max_depth=5),
    "SVR": SVR(kernel='linear', C=10),
    "Lasso": Lasso(alpha=0.01),
    "Ridge": Ridge(alpha=1),
    "bag": BaggingRegressor(n_estimators=20, max_samples=0.7),
    "ext": ExtraTreesRegressor(n_estimators=300, max_depth=None),
    "hist": HistGradientBoostingRegressor(max_iter=100, learning_rate=0.2),
    "xgbRegressor": xgb(n_estimators=300, learning_rate=0.2),
}

# Fit each model on the training split, score it on both splits and print a
# per-model report.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)

    _print_metrics('Model performance for Training set',
                   model_train_rmse, model_train_mae, model_train_r2)

    print('----------------------------------')

    _print_metrics('Model performance for Test set',
                   model_test_rmse, model_test_mae, model_test_r2)

    print('=' * 35)
    print('\n')

LinearRegression
Model performance for Training set
- Root Mean Squared Error: 12.2110
- Mean Absolute Error: 9.9497
- R2 Score: 0.2524
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 13.2990
- Mean Absolute Error: 10.4198
- R2 Score: 0.1749


KNeighborsRegressor
Model performance for Training set
- Root Mean Squared Error: 12.1605
- Mean Absolute Error: 9.8786
- R2 Score: 0.2586
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 14.2034
- Mean Absolute Error: 11.2295
- R2 Score: 0.0589


DecisionTreeRegressor
Model performance for Training set
- Root Mean Squared Error: 12.4650
- Mean Absolute Error: 10.0332
- R2 Score: 0.2210
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 13.6165
- Mean Absolute Error: 10.7989
- R2 Score: 0.1351


RandomForestRegressor
Model performance for Training set
- Root Mean Squared Error: 11.7650
- Mean Absolute Error: 9.5348
- 