In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns

In [13]:
raw_data = pd.read_csv("students.csv")
raw_data

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,High,Medium,Public,Positive,2,No,High School,Near,Female,68
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,High,Public,Positive,2,No,High School,Near,Female,69
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,Medium,Public,Negative,2,No,Postgraduate,Near,Female,68
6605,10,86,High,High,Yes,6,91,High,Yes,2,Low,Medium,Private,Positive,3,No,High School,Far,Female,68


In [334]:
raw_data['Hours_Studied'].mean()

19.975329196306948

In [66]:
data = raw_data.copy()

In [104]:
data.isnull().sum()

Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Distance_from_Home            0
Gender                        0
Exam_Score                    0
dtype: int64

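- NULL values are present in 3 attributes (the ones filled below: Teacher_Quality, Parental_Education_Level and Distance_from_Home). The all-zero counts shown above appear to come from re-running this cell after those fills.
- The way to resolve it is to replace each NaN with a typical ("average") category for that attribute, since these columns are categorical rather than numeric.
- We also need to convert the categorical values into numeric codes and drop any unimportant attributes.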

In [156]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6607 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

- Handle the NaN values first.

In [84]:
data.Teacher_Quality.unique()

array(['Medium', 'High', 'Low'], dtype=object)

In [82]:
data['Teacher_Quality'] = data['Teacher_Quality'].fillna('Medium')

In [92]:
data.Parental_Education_Level.unique()

array(['High School', 'College', 'Postgraduate'], dtype=object)

In [90]:
data['Parental_Education_Level'] = data['Parental_Education_Level'].fillna('College')

In [102]:
data.Distance_from_Home.unique()

array(['Near', 'Moderate', 'Far'], dtype=object)

In [100]:
data['Distance_from_Home'] = data['Distance_from_Home'].fillna('Moderate')

- NaN values handled.
- Apply encoding to the various features.

In [204]:
data_encode = data.copy()

In [206]:
# Ordinal-encode Parental_Involvement (Low=0 < Medium=1 < High=2).
# Map from the un-encoded `data` frame instead of from `data_encode` itself so
# the cell is idempotent: re-mapping an already-encoded column would turn every
# value into NaN.
data_encode['Parental_Involvement'] = data['Parental_Involvement'].map({'Low': 0, 'Medium': 1, 'High': 2})

In [208]:
data_encode.Parental_Involvement.unique()

array([0, 1, 2], dtype=int64)

In [210]:
# Ordinal-encode Access_to_Resources (Low=0 < Medium=1 < High=2).
# Map from the un-encoded `data` frame instead of from `data_encode` itself so
# the cell is idempotent: re-mapping an already-encoded column would turn every
# value into NaN.
data_encode['Access_to_Resources'] = data['Access_to_Resources'].map({'Low': 0, 'Medium': 1, 'High': 2})

In [212]:
data_encode.Access_to_Resources.unique()

array([2, 1, 0], dtype=int64)

In [214]:
# Binary-encode Extracurricular_Activities (No=0, Yes=1).
# Map from the un-encoded `data` frame instead of from `data_encode` itself so
# the cell is idempotent: re-mapping an already-encoded column would turn every
# value into NaN.
data_encode['Extracurricular_Activities'] = data['Extracurricular_Activities'].map({'No': 0, 'Yes': 1})

In [216]:
data_encode.Extracurricular_Activities.unique()

array([0, 1], dtype=int64)

In [218]:
# Ordinal-encode Motivation_Level (Low=0 < Medium=1 < High=2).
# Map from the un-encoded `data` frame instead of from `data_encode` itself so
# the cell is idempotent: re-mapping an already-encoded column would turn every
# value into NaN.
data_encode['Motivation_Level'] = data['Motivation_Level'].map({'Low': 0, 'Medium': 1, 'High': 2})

In [219]:
data_encode.Motivation_Level.unique()

array([0, 1, 2], dtype=int64)

In [222]:
# Binary-encode Internet_Access (No=0, Yes=1).
# Map from the un-encoded `data` frame instead of from `data_encode` itself so
# the cell is idempotent: re-mapping an already-encoded column would turn every
# value into NaN.
data_encode['Internet_Access'] = data['Internet_Access'].map({'No': 0, 'Yes': 1})

In [223]:
data_encode.Internet_Access.unique()

array([1, 0], dtype=int64)

In [226]:
# Ordinal-encode Family_Income (Low=0 < Medium=1 < High=2).
# Map from the un-encoded `data` frame instead of from `data_encode` itself so
# the cell is idempotent: re-mapping an already-encoded column would turn every
# value into NaN.
data_encode['Family_Income'] = data['Family_Income'].map({'Low': 0, 'Medium': 1, 'High': 2})

In [228]:
data_encode.Family_Income.unique()

array([0, 1, 2], dtype=int64)

In [230]:
# Ordinal-encode Teacher_Quality (Low=0 < Medium=1 < High=2).
# Map from the un-encoded (but NaN-filled) `data` frame instead of from
# `data_encode` itself so the cell is idempotent: re-mapping an already-encoded
# column would turn every value into NaN.
data_encode['Teacher_Quality'] = data['Teacher_Quality'].map({'Low': 0, 'Medium': 1, 'High': 2})

In [232]:
data_encode.Teacher_Quality.unique()

array([1, 2, 0], dtype=int64)

In [234]:
# Ordinal-encode Peer_Influence (Negative=0 < Neutral=1 < Positive=2).
# Map from the un-encoded `data` frame instead of from `data_encode` itself so
# the cell is idempotent: re-mapping an already-encoded column would turn every
# value into NaN.
data_encode['Peer_Influence'] = data['Peer_Influence'].map({'Negative': 0, 'Neutral': 1, 'Positive': 2})

In [236]:
data_encode.Peer_Influence.unique()

array([2, 0, 1], dtype=int64)

In [238]:
# Binary-encode Learning_Disabilities (No=0, Yes=1).
# Map from the un-encoded `data` frame instead of from `data_encode` itself so
# the cell is idempotent: re-mapping an already-encoded column would turn every
# value into NaN.
data_encode['Learning_Disabilities'] = data['Learning_Disabilities'].map({'No': 0, 'Yes': 1})

In [240]:
data_encode.Learning_Disabilities.unique()

array([0, 1], dtype=int64)

- Features to be dropped:
- Gender and School Type, as well as Parental Education Level and Distance From Home.

In [247]:
# Drop the features that were left un-encoded and judged unimportant.
# errors='ignore' keeps the cell safe to re-run: without it, a second execution
# raises KeyError because the columns are already gone.
data_encode = data_encode.drop(
    columns=['Parental_Education_Level', 'Distance_from_Home', 'Gender', 'School_Type'],
    errors='ignore',
)

In [249]:
data_encode

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,Peer_Influence,Physical_Activity,Learning_Disabilities,Exam_Score
0,23,84,0,2,0,7,73,0,1,0,0,1,2,3,0,67
1,19,64,0,1,0,8,59,0,1,2,1,1,0,4,0,61
2,24,98,1,1,1,7,91,1,1,2,1,1,1,4,0,74
3,29,89,0,1,1,8,98,1,1,1,1,1,0,4,0,71
4,19,92,1,1,1,6,65,1,1,3,1,2,1,4,0,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,2,1,0,7,76,1,1,1,2,1,2,2,0,68
6603,23,76,2,1,0,8,81,1,1,3,0,2,2,2,0,69
6604,20,90,1,0,1,6,65,0,1,3,0,1,0,2,0,68
6605,10,86,2,2,1,6,91,2,1,2,0,1,2,3,0,68


- Export to CSV first for later use.

In [341]:
data_encode.to_csv('students_encoded.csv', index = False)

In [255]:
data_encode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   Hours_Studied               6607 non-null   int64
 1   Attendance                  6607 non-null   int64
 2   Parental_Involvement        6607 non-null   int64
 3   Access_to_Resources         6607 non-null   int64
 4   Extracurricular_Activities  6607 non-null   int64
 5   Sleep_Hours                 6607 non-null   int64
 6   Previous_Scores             6607 non-null   int64
 7   Motivation_Level            6607 non-null   int64
 8   Internet_Access             6607 non-null   int64
 9   Tutoring_Sessions           6607 non-null   int64
 10  Family_Income               6607 non-null   int64
 11  Teacher_Quality             6607 non-null   int64
 12  Peer_Influence              6607 non-null   int64
 13  Physical_Activity           6607 non-null   int64
 14  Learning

- Normalize the data. Separate the features from the target.

In [276]:
data_normalized = data_encode.copy()

In [278]:
# Feature matrix: every column except the target.
# Select the target by name rather than by position so the split does not
# silently break if the column order changes (the later concat cell also refers
# to 'Exam_Score' by name).
data_normalized_x = data_normalized.drop(columns='Exam_Score')

In [280]:
data_normalized_x

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,Peer_Influence,Physical_Activity,Learning_Disabilities
0,23,84,0,2,0,7,73,0,1,0,0,1,2,3,0
1,19,64,0,1,0,8,59,0,1,2,1,1,0,4,0
2,24,98,1,1,1,7,91,1,1,2,1,1,1,4,0
3,29,89,0,1,1,8,98,1,1,1,1,1,0,4,0
4,19,92,1,1,1,6,65,1,1,3,1,2,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,2,1,0,7,76,1,1,1,2,1,2,2,0
6603,23,76,2,1,0,8,81,1,1,3,0,2,2,2,0
6604,20,90,1,0,1,6,65,0,1,3,0,1,0,2,0
6605,10,86,2,2,1,6,91,2,1,2,0,1,2,3,0


In [283]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns, leaving the rest untouched.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. The transformed frame keeps the input's
    column order and row index.

    Parameters
    ----------
    columns : list-like of column labels
        Labels of the columns in ``X`` that should be scaled.
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the columns listed in ``self.columns``.
        y : ignored
            Accepted only for scikit-learn API compatibility, so that
            ``fit_transform(X, y)`` and pipelines can call ``fit(X, y)``.

        Returns
        -------
        CustomScaler
            ``self``, to allow call chaining.
        """
        self.scaler.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns scaled.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the columns listed in ``self.columns``.

        Returns
        -------
        pandas.DataFrame
            Same columns (in the same order) and same index as ``X``; the
            columns in ``self.columns`` hold the scaler's output, all other
            columns are passed through unchanged.
        """
        init_col_order = X.columns
        # Re-attach X's index to the scaled values. With the default RangeIndex,
        # the concat below would align on mismatched labels and yield NaN rows
        # for any frame whose index is not 0..n-1 (e.g. a train/test subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Restore the caller's original column order after the side-by-side join.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [287]:
# Columns listed in col_omit are excluded from scaling; the list is empty here,
# so every feature column is passed to the scaler.
col_omit = []
col_scale = [
    name
    for name in data_normalized_x.columns.values
    if name not in col_omit
]
abs_scaler = CustomScaler(columns=col_scale)
# Last expression of the cell: fit returns the scaler, which the notebook displays.
abs_scaler.fit(X=data_normalized_x)

In [291]:
data_normalized_x_scaled = abs_scaler.transform(data_normalized_x)

In [293]:
data_normalized_x_scaled

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,Peer_Influence,Physical_Activity,Learning_Disabilities
0,0.504942,0.348375,-1.562146,1.288574,-1.214685,-0.019796,-0.143800,-1.302866,0.285825,-1.213934,-1.060721,-0.327233,1.070550,0.031411,-0.342867
1,-0.162822,-1.383736,-1.562146,-0.143488,-1.214685,0.661399,-1.116110,-1.302866,0.285825,0.411451,0.285971,-0.327233,-1.575587,1.001199,-0.342867
2,0.671882,1.560853,-0.124267,-0.143488,0.823259,-0.019796,1.106313,0.134442,0.285825,0.411451,0.285971,-0.327233,-0.252518,1.001199,-0.342867
3,1.506587,0.781403,-1.562146,-0.143488,0.823259,0.661399,1.592469,0.134442,0.285825,-0.401242,0.285971,-0.327233,-1.575587,1.001199,-0.342867
4,-0.162822,1.041220,-0.124267,-0.143488,0.823259,-0.700990,-0.699406,0.134442,0.285825,1.224144,0.285971,1.348757,-0.252518,1.001199,-0.342867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,0.838823,-0.950708,1.313613,-0.143488,-1.214685,-0.019796,0.064552,0.134442,0.285825,-0.401242,1.632663,-0.327233,1.070550,-0.938377,-0.342867
6603,0.504942,-0.344469,1.313613,-0.143488,-1.214685,0.661399,0.411806,0.134442,0.285825,1.224144,-1.060721,1.348757,1.070550,-0.938377,-0.342867
6604,0.004119,0.868009,-0.124267,-1.575549,0.823259,-0.700990,-0.699406,-1.302866,0.285825,1.224144,-1.060721,-0.327233,-1.575587,-0.938377,-0.342867
6605,-1.665291,0.521587,1.313613,1.288574,0.823259,-0.700990,1.106313,1.571749,0.285825,0.411451,-1.060721,-0.327233,1.070550,0.031411,-0.342867


- The data is normalized; combine it with the target values.

In [305]:
# Put the scaled features and the unscaled target column side by side;
# pandas aligns the two on their row index.
data_preprocessed = pd.concat(
    objs=[data_normalized_x_scaled, data_normalized['Exam_Score']],
    axis='columns',
)

In [311]:
data_preprocessed

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,Peer_Influence,Physical_Activity,Learning_Disabilities,Exam_Score
0,0.504942,0.348375,-1.562146,1.288574,-1.214685,-0.019796,-0.143800,-1.302866,0.285825,-1.213934,-1.060721,-0.327233,1.070550,0.031411,-0.342867,67
1,-0.162822,-1.383736,-1.562146,-0.143488,-1.214685,0.661399,-1.116110,-1.302866,0.285825,0.411451,0.285971,-0.327233,-1.575587,1.001199,-0.342867,61
2,0.671882,1.560853,-0.124267,-0.143488,0.823259,-0.019796,1.106313,0.134442,0.285825,0.411451,0.285971,-0.327233,-0.252518,1.001199,-0.342867,74
3,1.506587,0.781403,-1.562146,-0.143488,0.823259,0.661399,1.592469,0.134442,0.285825,-0.401242,0.285971,-0.327233,-1.575587,1.001199,-0.342867,71
4,-0.162822,1.041220,-0.124267,-0.143488,0.823259,-0.700990,-0.699406,0.134442,0.285825,1.224144,0.285971,1.348757,-0.252518,1.001199,-0.342867,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,0.838823,-0.950708,1.313613,-0.143488,-1.214685,-0.019796,0.064552,0.134442,0.285825,-0.401242,1.632663,-0.327233,1.070550,-0.938377,-0.342867,68
6603,0.504942,-0.344469,1.313613,-0.143488,-1.214685,0.661399,0.411806,0.134442,0.285825,1.224144,-1.060721,1.348757,1.070550,-0.938377,-0.342867,69
6604,0.004119,0.868009,-0.124267,-1.575549,0.823259,-0.700990,-0.699406,-1.302866,0.285825,1.224144,-1.060721,-0.327233,-1.575587,-0.938377,-0.342867,68
6605,-1.665291,0.521587,1.313613,1.288574,0.823259,-0.700990,1.106313,1.571749,0.285825,0.411451,-1.060721,-0.327233,1.070550,0.031411,-0.342867,68


- Save to CSV.

In [308]:
data_preprocessed.to_csv('students_preprocessed.csv', index = False)