In [20]:
# Yippee
#Removing teacher quality, previous scores and distance from home from the dataset.
#Feature engineering: combining features, e.g. motivation level and hours studied.
#Categorical variables with a natural ordering (low/medium/high) are encoded as ordinal values 1, 2, 3.
#Boolean (yes/no) values are converted to 1/0.
#Other categorical features, such as school type, are one-hot encoded.


#Import libraries
import pandas as pd
import numpy as np
import sklearn as sk
# Load the raw data; the CSV is expected to sit in the working directory.
dataf = pd.read_csv("StudentPerformanceFactors.csv")

# Teacher_Quality, Previous_Scores and Distance_from_Home are discarded up
# front because they were judged either subjective or not useful.
unused_columns = ["Teacher_Quality", "Previous_Scores", "Distance_from_Home"]
dataf = dataf.drop(columns=unused_columns)

# Ordinal encoding for the three-level columns: Low -> 1, Medium -> 2, High -> 3.
mapper = {"Low": 1, "Medium": 2, "High": 3}
for ordinal_col in ("Parental_Involvement", "Access_to_Resources", "Family_Income"):
    dataf[ordinal_col] = dataf[ordinal_col].map(mapper)

# Motivation_Level gets fractional weights instead of 1-2-3 because it is later
# multiplied into an interaction term. To give it the plain 1-2-3 encoding,
# add it to the loop above instead.
mapper = {"Low": 0.33, "Medium": 0.66, "High": 1}
dataf["Motivation_Level"] = dataf["Motivation_Level"].map(mapper)

# Yes/No columns become binary flags: Yes -> 1, No -> 0 (swap the values here
# if the opposite convention is wanted).
mapper = {"Yes": 1, "No": 0}
for flag_col in ("Extracurricular_Activities", "Internet_Access", "Learning_Disabilities"):
    dataf[flag_col] = dataf[flag_col].map(mapper)

# One-hot encoding for the nominal columns Parental_Education_Level,
# School_Type and Peer_Influence. Every category becomes its own 0/1 indicator
# column named after the category value:
#   Parental_Education_Level -> College, High School, Postgraduate
#   School_Type              -> Private, Public
#   Peer_Influence           -> Negative, Neutral, Positive
nominal_columns = ["Parental_Education_Level", "School_Type", "Peer_Influence"]

# dtype="int64" makes get_dummies emit 1/0 integers directly, so no
# element-by-element bool -> int conversion pass is needed afterwards.
indicator_frames = [
    pd.get_dummies(dataf[column], dtype="int64") for column in nominal_columns
]

# Each indicator frame carries the same index as dataf, so a column-wise concat
# lines rows up by label and keeps every column name. The new columns are
# appended in the order listed in nominal_columns, after the remaining original
# columns.
dataf = pd.concat([dataf.drop(columns=nominal_columns), *indicator_frames], axis=1)

# Unknown whether or not we want to use Gender. Below includes code to delete the column, or change it to 0 = male 1 = female.

#mapper = {"Male":0, "Female":1} # Encode the values
#dataf["Gender"] = dataf["Gender"].map(mapper) 

#dataf = dataf.drop(labels="Gender", axis=1) # Drop column


# Feature engineering: fold Hours_Studied into Motivation_Level to obtain an
# "effective study hours" interaction feature.
#
# Motivation_Level already holds the weights assigned during encoding
# (Low = 0.33, Medium = 0.66, High = 1). Multiplying by Hours_Studied and then
# dividing by the largest Hours_Studied value scales the result into 0-1.

# Hours_Studied runs from 1 to 44 (median 20, mean about 19.75), so 44 is the
# normalising divisor.
MAX_HOURS_STUDIED = 44

effective_hours = dataf["Motivation_Level"] * dataf["Hours_Studied"]
dataf["Motivation_Level"] = effective_hours / MAX_HOURS_STUDIED
# Append .round(4) (or any other precision) to the line above if rounded
# values are preferred.



# Split the data 60/20/20 into train / validate / test.
#
# The rows are shuffled once with random_state=42 for reproducibility and then
# cut by position. Plain .iloc slicing is used rather than np.split because
# handing a DataFrame to np.split goes through DataFrame.swapaxes, which recent
# pandas releases deprecate and report with a FutureWarning.
shuffled = dataf.sample(frac=1, random_state=42)
train_end = int(.6 * len(dataf))     # first 60% of the shuffled rows
validate_end = int(.8 * len(dataf))  # next 20%; the remainder is the test set

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]
# On the dataset this was written against: 3964 rows for train, 1321 for
# validate and 1322 for test.


dataf # The dataframe at the end.


  return bound(*args, **kwds)


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,...,Gender,Exam_Score,College,High School,Postgraduate,Private,Public,Negative,Neutral,Positive
1009,17,61,1,2,1,6,0.386364,1,0,2,...,Male,63,0,0,1,0,1,0,1,0
3366,17,92,2,2,1,8,0.255000,1,1,1,...,Male,70,0,0,1,0,1,0,0,1
6449,22,74,2,3,1,9,0.500000,1,3,2,...,Female,68,0,1,0,0,1,0,1,0
637,20,65,2,1,1,7,0.454545,1,0,2,...,Male,80,0,0,1,0,1,0,1,0
3331,11,95,2,2,1,9,0.250000,1,4,1,...,Male,68,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1920,8,99,1,3,1,6,0.181818,1,2,2,...,Male,70,0,0,1,0,1,0,0,1
513,15,90,2,1,1,6,0.112500,1,2,3,...,Male,69,0,1,0,0,1,0,0,1
4177,22,62,3,2,1,9,0.165000,1,1,3,...,Male,64,1,0,0,1,0,1,0,0
1308,27,81,2,1,1,9,0.202500,1,1,2,...,Female,68,1,0,0,0,1,0,1,0
