# Project 2 - Analyzing Student Performance: Data Cleaning


We will be using the [UCI ML Student Performance dataset](https://archive.ics.uci.edu/ml/datasets/Student+Performance). This dataset records various attributes of students at two schools in Portugal, along with their academic performance.

In [1]:
# Import the analysis libraries, configure the pandas display, and load the data.
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Keep printed DataFrames compact: at most 15 rows, but never hide columns.
pd.set_option("display.max_rows", 15)
pd.set_option("display.max_columns", None)

# Both source files are semicolon-delimited; load them in the same way,
# math first and then Portuguese.
df_mat, df_por = (
    pd.read_csv(csv_name, sep=';')
    for csv_name in ('student-mat.csv', 'student-por.csv')
)

In [2]:
# Show the DataFrame loaded from student-mat.csv.
df_mat

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,course,other,1,2,2,no,yes,yes,no,yes,yes,no,no,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,course,other,1,1,3,no,no,no,no,no,yes,no,no,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,0,no,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,11,12,10


In [3]:
# Show the DataFrame loaded from student-por.csv.
df_por

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,course,mother,1,3,1,no,no,no,yes,no,yes,yes,no,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,0,no,no,no,yes,yes,yes,no,no,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,6,10,10,10


As we can see, there are two datasets here: `df_mat` holds the students' performance in Math class, while `df_por` holds their performance in Portuguese class.

We can start cleaning our data by renaming the columns so that it is clearer what each one contains.

In [4]:
# Replace the terse original column names of the Portuguese dataset with
# descriptive ones. Columns missing from the mapping keep their current name.
# NOTE(review): "inRelationsip" is misspelled (missing "h"); it is left as-is
# here because other cells may already refer to the column by this name.
df_por = df_por.rename(
    columns={
        "school": "School",
        "sex": "Sex",
        "age": "Age",
        "address": "addressType",
        "famsize": "famSize",
        "Pstatus": "parentCohabitation",
        "Medu": "momEducation",
        "Fedu": "dadEducation",
        "Mjob": "momJob",
        "Fjob": "dadJob",
        "reason": "schoolReason",
        "guardian": "Guardian",
        "traveltime": "schoolCommuteTime",
        "studytime": "wklyStudyTime",
        "failures": "pastFailures",
        "schoolsup": "schoolSupport",
        "famsup": "familySupport",
        "paid": "extraPaidClasses",
        "activities": "extraActivities",
        "nursery": "nurserySchool",
        "higher": "pursueHigherEdu",
        "internet": "internetAccess",
        "romantic": "inRelationsip",
        "famrel": "familyRelQuality",
        "freetime": "freeTime",
        "goout": "outWithFriends",
        "Dalc": "workdayAlcoholConsumption",
        "Walc": "weekendAlcoholConsumption",
        "health": "healthStatus",
        "absences": "schoolAbsences",
        "G1": "firstPeriodGrade",
        "G2": "secondPeriodGrade",
        "G3": "finalGrade",
    }
)
# Disabled column drop, kept for reference:
#df_por.drop(['famSize','parentCohabitation','momJob','dadJob','nurserySchool'], axis=1, inplace=True)
df_por

Unnamed: 0,School,Sex,Age,addressType,famSize,parentCohabitation,momEducation,dadEducation,momJob,dadJob,schoolReason,Guardian,schoolCommuteTime,wklyStudyTime,pastFailures,schoolSupport,familySupport,extraPaidClasses,extraActivities,nurserySchool,pursueHigherEdu,internetAccess,inRelationsip,familyRelQuality,freeTime,outWithFriends,workdayAlcoholConsumption,weekendAlcoholConsumption,healthStatus,schoolAbsences,firstPeriodGrade,secondPeriodGrade,finalGrade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,course,mother,1,3,1,no,no,no,yes,no,yes,yes,no,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,0,no,no,no,yes,yes,yes,no,no,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,6,10,10,10


In [5]:
# Replace the terse original column names of the math dataset with
# descriptive ones. Columns missing from the mapping keep their current name.
# NOTE(review): "inRelationsip" is misspelled (missing "h"); it is left as-is
# here because other cells may already refer to the column by this name.
df_mat = df_mat.rename(
    columns={
        "school": "School",
        "sex": "Sex",
        "age": "Age",
        "address": "addressType",
        "famsize": "famSize",
        "Pstatus": "parentCohabitation",
        "Medu": "momEducation",
        "Fedu": "dadEducation",
        "Mjob": "momJob",
        "Fjob": "dadJob",
        "reason": "schoolReason",
        "guardian": "Guardian",
        "traveltime": "schoolCommuteTime",
        "studytime": "wklyStudyTime",
        "failures": "pastFailures",
        "schoolsup": "schoolSupport",
        "famsup": "familySupport",
        "paid": "extraPaidClasses",
        "activities": "extraActivities",
        "nursery": "nurserySchool",
        "higher": "pursueHigherEdu",
        "internet": "internetAccess",
        "romantic": "inRelationsip",
        "famrel": "familyRelQuality",
        "freetime": "freeTime",
        "goout": "outWithFriends",
        "Dalc": "workdayAlcoholConsumption",
        "Walc": "weekendAlcoholConsumption",
        "health": "healthStatus",
        "absences": "schoolAbsences",
        "G1": "firstPeriodGrade",
        "G2": "secondPeriodGrade",
        "G3": "finalGrade",
    }
)
# Disabled column drop, kept for reference:
#df_mat.drop(['famSize','parentCohabitation','momJob','dadJob','nurserySchool'], axis=1, inplace=True)
df_mat

Unnamed: 0,School,Sex,Age,addressType,famSize,parentCohabitation,momEducation,dadEducation,momJob,dadJob,schoolReason,Guardian,schoolCommuteTime,wklyStudyTime,pastFailures,schoolSupport,familySupport,extraPaidClasses,extraActivities,nurserySchool,pursueHigherEdu,internetAccess,inRelationsip,familyRelQuality,freeTime,outWithFriends,workdayAlcoholConsumption,weekendAlcoholConsumption,healthStatus,schoolAbsences,firstPeriodGrade,secondPeriodGrade,finalGrade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,course,other,1,2,2,no,yes,yes,no,yes,yes,no,no,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,course,other,1,1,3,no,no,no,no,no,yes,no,no,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,0,no,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,11,12,10
