In [1]:
%matplotlib inline
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary
import time 
import scipy.stats as stats

### μ1 = morning class performance
### μ2 = not morning class performance

### H0 => μ1 = μ2
### HA => μ1 != μ2

In [2]:
# Load the class records, index them by StudentID, and discard every row
# that has a missing value in any of its columns.
student_class_df = (
    pd.read_csv('StudentClass.csv')
    .set_index('StudentID')
    .dropna()
)

In [3]:
# Recode letter grades on the 4-point scale (A = 4, B = 3, C = 2, D = 1, F = 0).
# The points are written as digit strings here; the GRADE column is cast to
# an integer dtype in a later cell.
grade_points = {'A': '4', 'B': '3', 'C': '2', 'D': '1', 'F': '0'}
for letter, points in grade_points.items():
    has_letter = student_class_df['GRADE'] == letter
    student_class_df.loc[has_letter, 'GRADE'] = points

In [4]:
# After the recoding step GRADE holds digit strings, so cast the column to an
# integer dtype before any arithmetic is done on it.
student_class_df = student_class_df.astype({'GRADE': 'int'})

In [5]:
# Label every class as a morning class (meeting time between 00:00:00 and
# 11:59:59) or not. The label is stored as the strings "True" / "False".
# Three classes had a NaN ClassMeetTime, so they do not appear here.
meet_time = pd.to_timedelta(student_class_df['ClassMeetTime'])
is_morning = meet_time.between("00:00:00", "11:59:59")
student_class_df['MorningClass'] = np.where(is_morning, "True", "False")
student_class_df

Unnamed: 0_level_0,GRADE,ClassWeekDay,ClassMeetTime,ClassDate,ClassTitle,MorningClass
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,4,MW,15:30:00,9/1/2004,Math,False
1.0,4,M,19:00:00,9/1/2004,English,False
1.0,2,W,19:00:00,9/1/2004,Physics,False
1.0,3,MW,11:00:00,9/1/2004,Art,True
1.0,3,MW,12:30:00,9/1/2005,SocialStudy,False
...,...,...,...,...,...,...
1000.0,2,MW,15:30:00,9/1/2008,Physics,False
1000.0,2,MW,14:00:00,9/1/2008,Math,False
1000.0,2,TTH,12:30:00,9/1/2008,ForeignLanguage,False
1000.0,3,TU,19:00:00,9/1/2009,Music,False


In [6]:
# Keep only the morning-class rows, count them, and compute each student's
# average grade (GPA) over the morning classes they took.
# The displayed output is the per-student morning GPA table.
student_morning_class_df = student_class_df[student_class_df['MorningClass'] == 'True']
total_morning_classes = len(student_morning_class_df)
# Select GRADE explicitly: the first positional argument of GroupBy.mean() is
# `numeric_only`, not a column name, so mean('GRADE') only worked because the
# string is truthy. [['GRADE']] keeps the result a one-column DataFrame.
morning_class_gpas = student_morning_class_df.groupby('StudentID')[['GRADE']].mean()
morning_class_gpas

Unnamed: 0_level_0,GRADE
StudentID,Unnamed: 1_level_1
1.0,3.000000
3.0,3.333333
4.0,3.166667
5.0,3.000000
6.0,2.666667
...,...
996.0,3.200000
997.0,4.000000
998.0,1.428571
999.0,3.000000


In [7]:
# Keep only the non-morning-class rows, count them, and compute each student's
# average grade (GPA) over the non-morning classes they took.
# The displayed output is the per-student non-morning GPA table.
student_not_morning_class_df = student_class_df[student_class_df['MorningClass'] == 'False']
total_not_morning_classes = len(student_not_morning_class_df)
# Select GRADE explicitly: the first positional argument of GroupBy.mean() is
# `numeric_only`, not a column name, so mean('GRADE') only worked because the
# string is truthy. [['GRADE']] keeps the result a one-column DataFrame.
not_morning_class_gpas = student_not_morning_class_df.groupby('StudentID')[['GRADE']].mean()
not_morning_class_gpas

Unnamed: 0_level_0,GRADE
StudentID,Unnamed: 1_level_1
1.0,3.000000
2.0,3.200000
3.0,3.571429
4.0,3.250000
5.0,3.600000
...,...
996.0,3.000000
997.0,3.375000
998.0,2.000000
999.0,2.888889


In [8]:
# Summary figures for both groups. The two class counts should add up to the
# length of the dataset (9997 rows).
summary_lines = [
    ('Total number of morning classes taken: ', total_morning_classes),
    ('Morning class class averages: ', round(morning_class_gpas['GRADE'].mean(), 2)),
    ('Total number of not morning classes taken: ', total_not_morning_classes),
    ('Not morning class  averages: ', round(not_morning_class_gpas['GRADE'].mean(), 2)),
    ('Total number of classes taken: ', total_morning_classes + total_not_morning_classes),
]
for label, value in summary_lines:
    print(label, value)

Total number of morning classes taken:  3112
Morning class class averages:  2.76
Total number of not morning classes taken:  6885
Not morning class  averages:  2.78
Total number of classes taken:  9997


In [9]:
# Compare the spread of the two per-student GPA samples to see whether their
# variances are similar. np.var defaults to ddof=0 (population variance).
var_student_morning_class = round(np.var(morning_class_gpas['GRADE']), 6)
print('Morning student class variance: ', var_student_morning_class)

var_student_not_morning_class = round(np.var(not_morning_class_gpas['GRADE']), 6)
# Label corrected: this value belongs to the NOT-morning group (it was
# previously printed under the "Morning" label).
print('Not morning student class variance: ', var_student_not_morning_class)

Morning student class variance:  0.464021
Morning student class variance:  0.316714


In [15]:
# Two-sample t-test on the per-student GPAs of the two groups.
# equal_var=False selects Welch's t-test, which does not assume equal
# population variances; the sample variances printed above differ
# (about 0.46 vs 0.32). Note that 3112 and 6885 are class counts, not the
# sizes of the two GPA samples being compared.
# NOTE(review): many students appear in both groups, so the samples may not be
# independent -- consider whether a paired test is more appropriate.
morning_gpa = morning_class_gpas['GRADE']
not_morning_gpa = not_morning_class_gpas['GRADE']
stats.ttest_ind(morning_gpa, not_morning_gpa, equal_var=False)

Ttest_indResult(statistic=-0.6837973690386011, pvalue=0.4941919621986579)

# Conclusion
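The p-value is 0.49, which is far above the conventional 0.05 significance level, so we fail to reject the null hypothesis μ1 = μ2. A p-value of 0.49 does not mean there is a 49% chance of rejecting the null hypothesis or that the null hypothesis is true; it means that, if the two means were in fact equal, a difference at least as large as the one observed would occur about 49% of the time by random chance alone. Comparing the two groups directly tells the same story: the average is 2.76 for morning classes and 2.78 for non-morning classes. The data therefore provide no evidence of a difference in performance between the two groups. Failing to reject the null hypothesis does not prove that the means are equal; it only shows that any difference is too small to distinguish from random variation in this sample.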