# Import necessary libraries for data manipulation

In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
!unzip archive.zip

Archive:  archive.zip
  inflating: StudentPerformanceFactors.csv  


In [3]:
df = pd.read_csv("StudentPerformanceFactors.csv")

In [4]:
df.head(10)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
5,19,88,Medium,Medium,Yes,8,89,Medium,Yes,3,Medium,Medium,Public,Positive,3,No,Postgraduate,Near,Male,71
6,29,84,Medium,Low,Yes,7,68,Low,Yes,1,Low,Medium,Private,Neutral,2,No,High School,Moderate,Male,67
7,25,78,Low,High,Yes,6,50,Medium,Yes,1,High,High,Public,Negative,2,No,High School,Far,Male,66
8,17,94,Medium,High,No,6,80,High,Yes,0,Medium,Low,Private,Neutral,1,No,College,Near,Male,69
9,23,98,Medium,Medium,Yes,8,71,Medium,Yes,0,High,High,Public,Positive,5,No,High School,Moderate,Male,72


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [6]:
df['Teacher_Quality'].isnull().sum()

78

In [7]:
df['Teacher_Quality'].value_counts()

Unnamed: 0_level_0,count
Teacher_Quality,Unnamed: 1_level_1
Medium,3925
High,1947
Low,657


In [8]:
df['Teacher_Quality'].fillna("Medium", inplace=True)

In [9]:
df.columns

Index(['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender',
       'Exam_Score'],
      dtype='object')

In [10]:
# List of columns with 'low', 'medium', 'high' values
columns_to_encode = ['Parental_Involvement',
       'Access_to_Resources','Motivation_Level','Teacher_Quality']
df[columns_to_encode] = df[columns_to_encode].apply(lambda x: x.str.strip())

df[columns_to_encode] = df[columns_to_encode].replace({'Low': 1, 'Medium': 2, 'High': 3})

In [11]:
df[['Extracurricular_Activities','Learning_Disabilities']] = df[['Extracurricular_Activities', 'Learning_Disabilities']].replace({'No': 0, 'Yes': 1})

In [12]:
df.head(10)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,1,3,0,7,73,1,Yes,0,Low,2,Public,Positive,3,0,High School,Near,Male,67
1,19,64,1,2,0,8,59,1,Yes,2,Medium,2,Public,Negative,4,0,College,Moderate,Female,61
2,24,98,2,2,1,7,91,2,Yes,2,Medium,2,Public,Neutral,4,0,Postgraduate,Near,Male,74
3,29,89,1,2,1,8,98,2,Yes,1,Medium,2,Public,Negative,4,0,High School,Moderate,Male,71
4,19,92,2,2,1,6,65,2,Yes,3,Medium,3,Public,Neutral,4,0,College,Near,Female,70
5,19,88,2,2,1,8,89,2,Yes,3,Medium,2,Public,Positive,3,0,Postgraduate,Near,Male,71
6,29,84,2,1,1,7,68,1,Yes,1,Low,2,Private,Neutral,2,0,High School,Moderate,Male,67
7,25,78,1,3,1,6,50,2,Yes,1,High,3,Public,Negative,2,0,High School,Far,Male,66
8,17,94,2,3,0,6,80,3,Yes,0,Medium,1,Private,Neutral,1,0,College,Near,Male,69
9,23,98,2,2,1,8,71,2,Yes,0,High,3,Public,Positive,5,0,High School,Moderate,Male,72


In [13]:
X = df[['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Tutoring_Sessions', 'Teacher_Quality', 'Physical_Activity', 'Learning_Disabilities']]

y = df['Exam_Score']

In [14]:
X.head(4)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Tutoring_Sessions,Teacher_Quality,Physical_Activity,Learning_Disabilities
0,23,84,1,3,0,7,73,1,0,2,3,0
1,19,64,1,2,0,8,59,1,2,2,4,0
2,24,98,2,2,1,7,91,2,2,2,4,0
3,29,89,1,2,1,8,98,2,1,2,4,0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5285, 12), (1322, 12), (5285,), (1322,))

In [17]:
from sklearn.linear_model import LinearRegression

model_1LR = LinearRegression()
model_1LR.fit(X_train, y_train)
y_pred_1LR = model_1LR.predict(X_test)

In [18]:
from sklearn.metrics import mean_squared_error, r2_score

mse_1LR = mean_squared_error(y_test, y_pred_1LR)
print(f"Mean Squared Error: {mse_1LR}")
print("R-squared:", r2_score(y_test, y_pred_1LR))

Mean Squared Error: 3.8133155427194376
R-squared: 0.7302231346893786


In [19]:
y_pred_1LRi = model_1LR.predict(X_train)

mse_i = mean_squared_error(y_train, y_pred_1LRi)
print(f"Mean Squared Error: {mse_i}")
print("R-squared:", r2_score(y_train, y_pred_1LRi))

Mean Squared Error: 4.972339981316689
R-squared: 0.676721151491871


In [20]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [21]:
pipeline = make_pipeline(StandardScaler(), Ridge())

# Define parameter grid for Ridge
param_grid_LR = {'ridge__alpha': [0.1, 1, 10, 100]}

# Perform Grid Search
grid_search_LR = GridSearchCV(pipeline, param_grid_LR, cv=5, scoring='neg_mean_squared_error')
grid_search_LR.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search_LR.best_params_)
print("Best score:", -grid_search_LR.best_score_)

Best parameters: {'ridge__alpha': 10}
Best score: 4.997835013656842


In [22]:
pipeline = make_pipeline(StandardScaler(), Lasso())

# Define parameter grid for Lasso
param_grid_LR2 = {'lasso__alpha': [0.1, 1, 10, 100]}

# Perform Grid Search
grid_search_LR2 = GridSearchCV(pipeline, param_grid_LR2, cv=5, scoring='neg_mean_squared_error')
grid_search_LR2.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search_LR2.best_params_)
print("Best score:", -grid_search_LR2.best_score_)

Best parameters: {'lasso__alpha': 0.1}
Best score: 5.11105684255895


In [23]:
pipeline = make_pipeline(StandardScaler(), LinearRegression())

# Perform cross-validation
# cv=5 means 5-fold cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive MSE for interpretation
mse_scores = -scores

print("Mean Squared Error for each fold:", mse_scores)
print("Average MSE across folds:", np.mean(mse_scores))
print("Standard deviation of MSE:", np.std(mse_scores))

Mean Squared Error for each fold: [6.51663995 4.72687653 4.85258235 4.67844087 3.04179933]
Average MSE across folds: 4.763267807265372
Standard deviation of MSE: 1.1003876247120186


In [24]:
model_o = LinearRegression()

model_o.fit(X_train, y_train)
train_preds = model_o.predict(X_train)
train_mse = mean_squared_error(y_train, train_preds)

print("Training MSE:", train_mse)

Training MSE: 4.972339981316689


In [25]:
from sklearn.ensemble import RandomForestRegressor

model_2RF = RandomForestRegressor()
model_2RF.fit(X_train, y_train)
y_pred_2RF = model_2RF.predict(X_test)

mse = mean_squared_error(y_test, y_pred_2RF)
print(f"Mean Squared Error: {mse}")
print("R-squared:", r2_score(y_test, y_pred_2RF))

Mean Squared Error: 4.916950832072616
R-squared: 0.6521453397960787


In [26]:
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [3, 5, 7],           # Depth of each tree
    'learning_rate': [0.01, 0.1, 0.3],# Step size shrinkage
    'subsample': [0.8, 1.0],          # Fraction of samples to use for training
    'colsample_bytree': [0.8, 1.0],   # Fraction of features to use for each tree
    'reg_alpha': [0, 0.1, 1],         # L1 regularization
    'reg_lambda': [1, 1.5, 2]         # L2 regularization
}


In [27]:
from xgboost import XGBRegressor
model_xgb = XGBRegressor()
grid_search = GridSearchCV(estimator=model_xgb, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, verbose=1)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best score:", -grid_search.best_score_)  # Negating the score to get positive MSE

Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 1, 'reg_lambda': 2, 'subsample': 1.0}
Best score: 5.2719634032924


In [28]:
import pickle
# Save the model
with open('student_performance_model.pkl', 'wb') as file:
    pickle.dump(model_1LR, file)
