In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the raw student-performance table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='Student_Performance.csv')

In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [5]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [6]:
# Encode the 'Yes'/'No' flag as 1/0 ('Yes' -> 1, anything else -> 0).
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# comparing it to 'Yes' again would turn every row into 0.
activities = df['Extracurricular Activities']
if not pd.api.types.is_numeric_dtype(activities):
    df['Extracurricular Activities'] = np.where(activities == 'Yes', 1, 0)

In [7]:
# Display the encoded column to confirm it now holds integers.
df.loc[:, 'Extracurricular Activities']

0       1
1       0
2       1
3       1
4       0
       ..
9995    1
9996    1
9997    1
9998    1
9999    0
Name: Extracurricular Activities, Length: 10000, dtype: int32

In [8]:
# Class balance of the encoded flag (missing values excluded, the default).
df['Extracurricular Activities'].value_counts(dropna=True)

Extracurricular Activities
0    5052
1    4948
Name: count, dtype: int64

In [9]:
# Preview the first five rows after encoding.
df.iloc[:5]

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


In [10]:
# Feature matrix: every column except the target. The target is dropped by
# name (matching how y is selected) so a reordered or appended column cannot
# silently leak the target into X or remove a real feature.
X = df.drop(columns='Performance Index')

In [11]:
# Regression target: the performance index column.
y = df.loc[:, 'Performance Index']

In [12]:
# Preview the first five feature rows.
X.head(5)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,1,9,1
1,4,82,0,4,2
2,8,51,1,7,2
3,5,52,1,5,2
4,7,75,0,8,5


In [13]:
# Preview the first five target values.
y.iloc[:5]

0    91.0
1    65.0
2    45.0
3    36.0
4    66.0
Name: Performance Index, dtype: float64

In [15]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RidgeCV, LassoCV, ElasticNet, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=16,
)

In [17]:
# Standardizer with its default behaviour spelled out: centre each feature
# and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [18]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test information reaches the scaler.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# Baseline: ordinary least squares on the standardized features.
LinearRegressionModel = LinearRegression().fit(X_train_scaled, y_train)

y_preds = LinearRegressionModel.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_preds)
# Printed in order: MAE, MSE, RMSE, R^2.
for score in (mean_absolute_error(y_test, y_preds), mse, mse**0.5, r2_score(y_test, y_preds)):
    print(score)

1.6360714008992343
4.289811935065571
2.0711861179202535
0.9888151568004038


In [26]:
# Ridge regression with scikit-learn's default settings.
RidgeModel = Ridge().fit(X_train_scaled, y_train)

y_preds2 = RidgeModel.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_preds2)
# Printed in order: MAE, MSE, RMSE, R^2.
for score in (mean_absolute_error(y_test, y_preds2), mse, mse**0.5, r2_score(y_test, y_preds2)):
    print(score)

1.6360156612227323
4.289532175162231
2.0711185806617234
0.9888158862194785


In [27]:
# Lasso regression with scikit-learn's default settings.
LassoModel = Lasso().fit(X_train_scaled, y_train)

y_preds3 = LassoModel.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_preds3)
# Printed in order: MAE, MSE, RMSE, R^2.
for score in (mean_absolute_error(y_test, y_preds3), mse, mse**0.5, r2_score(y_test, y_preds3)):
    print(score)

2.176920949545953
7.432195070394744
2.726205250966028
0.9806220091347883


In [28]:
# Elastic net with scikit-learn's default settings.
ElasticModel = ElasticNet().fit(X_train_scaled, y_train)

y_preds4 = ElasticModel.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_preds4)
# Printed in order: MAE, MSE, RMSE, R^2.
for score in (mean_absolute_error(y_test, y_preds4), mse, mse**0.5, r2_score(y_test, y_preds4)):
    print(score)

6.091098754087972
53.793886095424874
7.334431545486321
0.8597429933568711


In [39]:
# Ridge with the penalty strength chosen by 10-fold cross-validation.
RidgeModelCV = RidgeCV(cv=10).fit(X_train_scaled, y_train)

y_preds5 = RidgeModelCV.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_preds5)
# Printed in order: MAE, MSE, RMSE, R^2.
for score in (mean_absolute_error(y_test, y_preds5), mse, mse**0.5, r2_score(y_test, y_preds5)):
    print(score)

1.6360656646958671
4.289783165872611
2.0711791728077538
0.9888152318104315


In [40]:
# Lasso with the penalty strength chosen by 10-fold cross-validation.
LassoModelCV = LassoCV(cv=10).fit(X_train_scaled, y_train)

y_preds6 = LassoModelCV.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_preds6)
# Printed in order: MAE, MSE, RMSE, R^2, then the alpha the search selected.
for score in (
    mean_absolute_error(y_test, y_preds6),
    mse,
    mse**0.5,
    r2_score(y_test, y_preds6),
    LassoModelCV.alpha_,
):
    print(score)

1.63577096371224
4.286241902059377
2.07032410555917
0.9888244649612267
0.0173982196219884


In [41]:
# Elastic net with the penalty strength chosen by 10-fold cross-validation.
ElasticModelCV = ElasticNetCV(cv=10).fit(X_train_scaled, y_train)

y_preds7 = ElasticModelCV.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_preds7)
# Printed in order: MAE, MSE, RMSE, R^2.
for score in (mean_absolute_error(y_test, y_preds7), mse, mse**0.5, r2_score(y_test, y_preds7)):
    print(score)

1.653727650446129
4.385249670556185
2.0940987728749056
0.988566321624656


In [38]:
import pickle

In [42]:
# Persist the cross-validated Lasso model (lowest test MSE among the runs
# above) together with the fitted scaler, which must be applied to any new
# input before calling the model.
# The context managers guarantee each file is flushed and closed; passing a
# bare open(...) to pickle.dump leaves the handle open until it happens to be
# garbage-collected.
with open('Marks.pkl', 'wb') as model_file:
    pickle.dump(LassoModelCV, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [43]:
# Per-column dtype and non-null summary of the feature matrix.
X.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Hours Studied                     10000 non-null  int64
 1   Previous Scores                   10000 non-null  int64
 2   Extracurricular Activities        10000 non-null  int32
 3   Sleep Hours                       10000 non-null  int64
 4   Sample Question Papers Practiced  10000 non-null  int64
dtypes: int32(1), int64(4)
memory usage: 351.7 KB


In [45]:
# Show a single feature row as a reference for the expected input layout.
X.iloc[:1]

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,1,9,1
