In [1]:
import pandas as pd
import csv

## read input training file
df = pd.read_csv('training.psv', sep='|')
df.columns = ['student_id', 'level', 'course', 'grade','major']

## check the data type, it turns out all the columns are strings
var = list(df.select_dtypes(include=['object']).columns)
print var

## split course, only keep the course description part, which has better prediction power
df['course'] = df['course'].str.split(':',n=1,expand=True)[0]

## data transformation >> to generate model development data
    ## Per student id has only one major, 
    ## the ideal MDD would be one row of record per student. 
    ## The dependent variable is "major, while the independent varaible are level, course, and grade. 
    ## student id has no prediction power, it is considered as index. 

df = pd.get_dummies(df, columns=['level','course','grade'])
df[df['student_id']=='01W7KB8TDNWNx4YW']

['student_id', 'level', 'course', 'grade', 'major']


Unnamed: 0,student_id,major,level_Freshman,level_Junior,level_Senior,level_Sophomore,course_006,course_ABRD,course_ACB,course_ACCT,...,grade_D,grade_D+,grade_D-,grade_I,grade_N,grade_P,grade_R,grade_S,grade_U,grade_WX
46480,01W7KB8TDNWNx4YW,Psychology,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46481,01W7KB8TDNWNx4YW,Psychology,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46482,01W7KB8TDNWNx4YW,Psychology,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46483,01W7KB8TDNWNx4YW,Psychology,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46484,01W7KB8TDNWNx4YW,Psychology,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
46485,01W7KB8TDNWNx4YW,Psychology,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46486,01W7KB8TDNWNx4YW,Psychology,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46487,01W7KB8TDNWNx4YW,Psychology,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46488,01W7KB8TDNWNx4YW,Psychology,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46489,01W7KB8TDNWNx4YW,Psychology,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
## compress the data to one student one record. 
df1 = df.groupby(['student_id','major']).agg('sum')

for i in df1.columns:
    if i.split('_')[0] in ['level','course']:
        df1.loc[df1[i]>0,i]=1

## make student id as the index. 
df2 = df1.reset_index(drop=False)
df2 = df2.set_index('student_id')

## if tehre is missing valure, replace NA with 0. 
df2.fillna(0)
df2.head()

Unnamed: 0_level_0,major,level_Freshman,level_Junior,level_Senior,level_Sophomore,course_006,course_ABRD,course_ACB,course_ACCT,course_ACTS,...,grade_D,grade_D+,grade_D-,grade_I,grade_N,grade_P,grade_R,grade_S,grade_U,grade_WX
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,Business Analytics,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01MhxeQl5FhRsf3f,Business,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01W7KB8TDNWNx4YW,Psychology,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
042Rmpv5B2kXdfBR,Communication Studies,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04DuzbneGqk0o0jY,Environmental Sciences,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
## split the data into train and test data set, 75% and 25% respectively 
from sklearn.cross_validation import train_test_split
X = df2.drop(['major'],axis=1)
y = df2['major']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)

print X_train.shape,X_test.shape
print y_train.shape, y_test.shape

(7500, 207) (2500, 207)
(7500L,) (2500L,)




In [28]:
## deploy the algorithm and predict
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

## prediction performance measurement. 
from sklearn import metrics
from sklearn.metrics import accuracy_score
confusion = metrics.confusion_matrix(y_test, y_pred)
print 'The prediction accuracy rate on testing set is: {}'.format(accuracy_score(y_test, y_pred))

The prediction accuracy rate on testing set is: 0.788


In [None]:
## prediction performance measurement. 
from sklearn import metrics
from sklearn.metrics import accuracy_score
confusion = metrics.confusion_matrix(y_train, y_pred)
print 'The prediction accuracy rate on testing set is: {}'.format(accuracy_score(y_test, y_pred))

In [21]:
## print out the importance score
importance ={'Variable':X_train.columns.values,'Importance_Score':rf.feature_importances_}
importance = pd.DataFrame.from_dict(importance)
print importance.sort_values(['Importance_Score'],ascending=False).head(10)

     Importance_Score     Variable
166          0.074194  course_SPAN
142          0.053152  course_POLI
85           0.042989   course_HHP
176          0.037438  course_THTR
73           0.031955   course_ESL
168          0.030226   course_SRM
186          0.028839      grade_A
204          0.025728      grade_S
185          0.024998  course_WRIT
150          0.023749  course_RHET


array(['Electrical Engineering', 'Economics', 'Communication Studies', ...,
       'Interdepartmental Studies', 'Interdepartmental Studies', 'Finance'], dtype=object)