## Import Library and Load Dataset

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Load the student-performance table from a CSV file resolved relative to the working directory.
student_data = pd.read_csv('Student_Performance.csv')

In [9]:
# Preview the first rows of the freshly loaded frame.
student_data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


## Data Cleaning

In [10]:
# Summarise each column's dtype and non-null count (spots missing values and non-numeric columns).
student_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [11]:
# Count missing values per column.
student_data.isnull().sum()

Unnamed: 0,0
Hours Studied,0
Previous Scores,0
Extracurricular Activities,0
Sleep Hours,0
Sample Question Papers Practiced,0
Performance Index,0


In [12]:
# Feature matrix: every column except the last one.
X = student_data.iloc[:, :-1].to_numpy()
# Target vector: the last column.
y = student_data.iloc[:, -1].to_numpy()

In [13]:
# Inspect the feature matrix; the categorical column has not been encoded yet at this point.
print(X)

[[7 99 'Yes' 9 1]
 [4 82 'No' 4 2]
 [8 51 'Yes' 7 2]
 ...
 [6 83 'Yes' 8 5]
 [9 97 'Yes' 7 0]
 [7 74 'No' 8 1]]


In [14]:
# Inspect the target vector (the last column of student_data).
print(y)

[91. 65. 45. ... 74. 95. 64.]


## Encoding Categorical Data

In [15]:
# List the distinct labels in the categorical column before encoding it.
student_data['Extracurricular Activities'].unique()

array(['Yes', 'No'], dtype=object)

In [16]:
# Category order handed to the encoder: the position in this list becomes the code
# ('Yes' -> 0, 'No' -> 1).
Ext = ['Yes', 'No']

In [17]:
from sklearn.preprocessing import OrdinalEncoder

In [18]:
# Preview the frame as it stands before the categorical column is encoded.
student_data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [19]:
# Ordinal encoder for one column, using the explicit category order in Ext.
enc = OrdinalEncoder(categories=[Ext])

In [20]:
# Preview the encoded column without storing it (this call also fits `enc`).
# With the category order ['Yes', 'No']: 'Yes' -> 0.0 and 'No' -> 1.0.
enc.fit_transform(student_data[['Extracurricular Activities']])

array([[0.],
       [1.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [21]:
# Replace the 'Yes'/'No' labels with their ordinal codes ('Yes' -> 0.0, 'No' -> 1.0).
# The guard makes this cell safe to re-run: once the column is numeric, refitting `enc`
# (whose categories are 'Yes'/'No') on the values 0.0/1.0 would raise instead of encoding.
if not pd.api.types.is_numeric_dtype(student_data['Extracurricular Activities']):
    # fit_transform returns an (n, 1) array; flatten it so a single column is assigned.
    student_data['Extracurricular Activities'] = np.ravel(
        enc.fit_transform(student_data[['Extracurricular Activities']])
    )
student_data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,0.0,9,1,91.0
1,4,82,1.0,4,2,65.0
2,8,51,0.0,7,2,45.0
3,5,52,0.0,5,2,36.0
4,7,75,1.0,8,5,66.0


In [22]:
# Rebuild the feature matrix from every column except the last one.
X = student_data.iloc[:, :-1].to_numpy()
# Rebuild the target vector from the last column.
y = student_data.iloc[:, -1].to_numpy()

In [23]:
# Inspect the rebuilt feature matrix.
print(X)

[[ 7. 99.  0.  9.  1.]
 [ 4. 82.  1.  4.  2.]
 [ 8. 51.  0.  7.  2.]
 ...
 [ 6. 83.  0.  8.  5.]
 [ 9. 97.  0.  7.  0.]
 [ 7. 74.  1.  8.  1.]]


In [24]:
# Inspect the target vector.
print(y)

[91. 65. 45. ... 74. 95. 64.]


## Splitting the dataset into the Training set and Test set

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Model

In [26]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split only; the test split stays unseen for evaluation.
model = LinearRegression()
model.fit(X_train, y_train)

In [27]:
# Predict the held-out rows and print the predictions beside the true targets.
y_pred = model.predict(X_test)
# NOTE: this changes NumPy's global print precision, so later array output is rounded too.
np.set_printoptions(precision=2)
# Column 0: predicted value, column 1: actual value.
print(np.column_stack((y_pred, y_test)))

[[50.45 53.  ]
 [53.09 50.  ]
 [78.25 80.  ]
 ...
 [64.57 66.  ]
 [25.9  27.  ]
 [18.83 21.  ]]


In [28]:
# Fitted coefficients, one per feature column of X, in column order.
model.coef_

array([ 2.85,  1.02, -0.67,  0.48,  0.19])

In [29]:
# Fitted intercept term of the regression.
model.intercept_

-33.40990475561878

In [30]:
# Evaluate on the held-out split; for scikit-learn regressors, score() returns R^2.
model.score(X_test, y_test)

0.9880686410711422

## Save ML Model

In [31]:
import joblib

In [32]:
# Persist the fitted regressor to disk. Only `model` is saved, not `enc`, so any consumer
# must encode 'Yes' -> 0.0 / 'No' -> 1.0 itself before calling predict.
joblib.dump(model, 'Student_Performance.pkl')

['Student_Performance.pkl']

In [33]:
# Reload the persisted regressor, rebinding `model`.
# joblib.load unpickles the file, so only load files that come from a trusted source.
model = joblib.load('Student_Performance.pkl')