In [85]:
import numpy as np
import pandas as pd 
import pickle
from sklearn.model_selection import train_test_split 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score

In [65]:
# Load the student-performance dataset; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv('student_performance_new.csv')

In [66]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade
0,1,John,Male,85,15.0,78,1,High,80
1,2,Sarah,Female,90,20.0,85,2,Medium,87
2,3,Alex,Male,78,,65,0,Low,68
3,4,Michael,Male,92,25.0,90,3,High,92
4,5,Emma,Female,88,18.0,82,2,Medium,85


In [67]:
# Count missing values per column to decide which features need imputation.
df.isnull().sum()

StudentID                    0
Name                         0
Gender                       0
AttendanceRate               0
StudyHoursPerWeek            2
PreviousGrade                0
ExtracurricularActivities    0
ParentalSupport              0
FinalGrade                   0
dtype: int64

In [68]:
# Remove the columns that are not used as model inputs.
# Reassigning (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: the in-place version raised KeyError on a second
# execution because the columns had already been removed from `df`.
df = df.drop(
    columns=['Name', 'AttendanceRate', 'PreviousGrade', 'ExtracurricularActivities'],
    errors='ignore',
)

In [69]:
# Preview the frame again to confirm which columns remain after the drop.
df.head()

Unnamed: 0,StudentID,Gender,StudyHoursPerWeek,ParentalSupport,FinalGrade
0,1,Male,15.0,High,80
1,2,Female,20.0,Medium,87
2,3,Male,,Low,68
3,4,Male,25.0,High,92
4,5,Female,18.0,Medium,85


In [70]:
# Split into train and test sets: FinalGrade is the target, every other
# remaining column is a feature. 20% of the rows are held out, and the fixed
# seed keeps the split identical across runs.
features = df.drop(columns=['FinalGrade'])
target = df['FinalGrade']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [71]:
# Peek at the first two training feature rows.
x_train.head(2)

Unnamed: 0,StudentID,Gender,StudyHoursPerWeek,ParentalSupport
5,6,Female,,High
0,1,Male,15.0,High


In [72]:
# Peek at the first two training targets; their index should line up with x_train.
y_train.head(2)

5    90
0    80
Name: FinalGrade, dtype: int64

In [89]:
# Impute missing StudyHoursPerWeek values.
# SimpleImputer() uses its default strategy (mean). The fill value is learned
# from the training split only and then re-used for the test split. Calling
# fit_transform on x_test, as this cell did before, re-fitted the imputer on
# the test data: the two splits were filled with different means and `si`
# was left fitted on the test set rather than the training set.
si = SimpleImputer()
x_train_Studyhrs = si.fit_transform(x_train[['StudyHoursPerWeek']])
x_test_Studyhrs = si.transform(x_test[['StudyHoursPerWeek']])

In [74]:
# Inspect the imputed training values (a 2-D array with one column).
x_train_Studyhrs

array([[17.5],
       [15. ],
       [17. ],
       [17.5],
       [22. ],
       [18. ],
       [25. ],
       [ 8. ]])

In [75]:
# One-hot encode Gender.
# The encoder is fitted on the training split only; the test split is encoded
# with transform() so it gets the same category-to-column layout. Re-fitting on
# x_test (fit_transform) could produce a different number or order of columns
# when the test split lacks a category, and it left `ohe` fitted on test data.
# handle_unknown='ignore' makes a category unseen in training encode as all zeros.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 -- switch the keyword when the environment is upgraded.
ohe = OneHotEncoder(sparse = False, handle_unknown='ignore')
x_train_gender = ohe.fit_transform(x_train[['Gender']])
x_test_gender = ohe.transform(x_test[['Gender']])



In [76]:
# Inspect the one-hot encoded Gender columns for the training split.
x_train_gender

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [77]:
# Ordinal-encode ParentalSupport.
# The explicit category list fixes the order, so Low -> 0, Medium -> 1, High -> 2.
# The encoder is fitted once on the training split and only applied to the test
# split; the previous fit_transform on x_test re-fitted `oe` on test data.
oe = OrdinalEncoder(categories = [['Low', 'Medium', 'High']])
x_train_parentSupport = oe.fit_transform(x_train[['ParentalSupport']])
x_test_parentSupport = oe.transform(x_test[['ParentalSupport']])

In [78]:
# Inspect the ordinal codes produced for the training split.
x_train_parentSupport

array([[2.],
       [2.],
       [1.],
       [0.],
       [2.],
       [1.],
       [2.],
       [0.]])

In [79]:
# Columns that were encoded or imputed separately above. Whatever is left in
# the frame (StudentID, per the preview of x_train) passes through unchanged.
# NOTE(review): StudentID looks like a row identifier rather than a predictive
# feature -- confirm it is meant to be a model input.
separately_handled = ['Gender', 'StudyHoursPerWeek', 'ParentalSupport']
x_train_rem = x_train.drop(columns=separately_handled)
x_test_rem = x_test.drop(columns=separately_handled)

In [80]:
# Assemble the final feature matrices by joining the pieces column-wise.
# The order is: untouched columns, one-hot Gender, imputed StudyHoursPerWeek,
# ordinal ParentalSupport. It must be identical for train and test.
train_parts = (x_train_rem, x_train_gender, x_train_Studyhrs, x_train_parentSupport)
test_parts = (x_test_rem, x_test_gender, x_test_Studyhrs, x_test_parentSupport)
x_train_transformed = np.concatenate(train_parts, axis=1)
x_test_transformed = np.concatenate(test_parts, axis=1)

In [81]:
# Check the (rows, columns) shape of the assembled test matrix.
x_test_transformed.shape

(2, 5)

In [82]:
# Train a decision tree on the transformed training features.
# A fixed random_state makes the fitted tree (and the pickled model) reproducible:
# without it, ties between equally good splits are broken randomly, so repeated
# runs can produce different trees and predictions.
# NOTE(review): FinalGrade is a numeric grade, so a classifier treats every
# distinct grade as its own class; a regressor may be the better fit -- confirm intent.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train_transformed, y_train)

In [83]:
# Predict labels for the held-out rows and display them.
y_pred = clf.predict(x_test_transformed)
y_pred

array([62, 85], dtype=int64)

In [84]:
# Fraction of held-out rows whose predicted grade exactly equals the true grade.
# accuracy_score's signature is (y_true, y_pred); the arguments were previously
# passed the other way round. The score itself is symmetric, but the correct
# order matters as soon as another metric is swapped in.
accuracy_score(y_test, y_pred)

0.0

In [87]:
# Persist the fitted encoders and the trained model for later reuse.
# Each file is opened in a `with` block so it is flushed and closed
# deterministically; the original passed bare open(...) handles to pickle.dump
# and never closed them.
# NOTE(review): the fitted imputer `si` is not saved here, although new data
# with missing StudyHoursPerWeek would need it -- confirm whether that is intended.
for fitted_obj, pkl_path in ((ohe, 'ohe.pkl'), (oe, 'oe.pkl'), (clf, 'clf.pkl')):
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(fitted_obj, pkl_file)