In [35]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [36]:
df = pd.read_csv('Student_Performance.csv')

In [37]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [38]:
df.columns = df.columns.str.replace(' ', '_')
df

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


In [39]:
x = df.drop('Performance_Index', axis = 1)
y = df['Performance_Index']

In [40]:
cat_col = ['Extracurricular_Activities']
num_cols = ['Hours_Studied', 'Previous_Scores',
       'Sleep_Hours', 'Sample_Question_Papers_Practiced']

In [41]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=20, random_state=43)

In [42]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), cat_col),
    ],
    remainder='passthrough'
)

In [43]:
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [44]:
model_pipeline.fit(x_train, y_train)

predictions = model_pipeline.predict(x_test)

In [45]:
from sklearn.metrics import mean_squared_error       
from sklearn.metrics import root_mean_squared_error     
from sklearn.metrics import r2_score 

In [46]:
print(root_mean_squared_error(y_test, predictions))

1.477963770734239


In [47]:
print(mean_squared_error(y_test, predictions))

2.18437690760297


In [48]:
print(r2_score(y_test, predictions))

0.9938883161981954


In [49]:
import pickle

In [50]:
pickle.dump(model_pipeline, open('pipeline.pkl', 'wb'))