In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier

# Import the data

In [20]:
df = pd.read_spss("1ResearchProjectData.sav")

Unnamed: 0,Student,Score
Student,1.0,0.365915
Score,0.365915,1.0


# Preprocess the data

## Replace the name of teacher with 1, 2 or 3
 Ruger = 1
 Wesson = 2
 Smith = 3
## Replace lunch status with 0 and 1
 Free lunch = 0
 Paid lunch = 1
## Replace Gender with 1's and 0's
 Female = 0
 Male = 1
## Replace ethinicity with 1 through 4
 African American = 1
 Hispanic = 2
 Caucasian = 3
 Asian = 4


In [3]:
df = df.drop(columns=['wesson'])
df = df.dropna()
# Teacher
df.replace('Ruger',1,inplace=True)
df.replace('Wesson',2,inplace=True)
df.replace('Smith',3,inplace=True)

# Lunch status
df.replace('Free lunch',0,inplace=True)
df.replace('Paid lunch',1,inplace=True)

# Gender:
df.replace('Female',0,inplace=True)
df.replace('Male',1,inplace=True)

#Ethnicity
df.replace('African-American',1,inplace=True)
df.replace('Hispanic',2,inplace=True)
df.replace('Caucasian',3,inplace=True)
df.replace('Asian',4,inplace=True)

In [18]:
df

Unnamed: 0,Student,Teacher,Gender,Ethnic,Freeredu,Score
0,1.0,1,0,4,0,76.0
1,2.0,1,0,2,1,56.0
2,3.0,1,0,1,0,34.0
3,4.0,1,0,4,1,59.0
4,5.0,1,1,2,0,73.0
...,...,...,...,...,...,...
211,212.0,2,1,1,1,56.0
212,213.0,2,1,2,0,94.0
213,214.0,2,1,2,1,91.0
214,215.0,2,0,1,1,53.0


Unnamed: 0,Student,Score
Student,1.0,0.365915
Score,0.365915,1.0


# Training test split

In [5]:
X = pd.DataFrame(df, columns=['Gender', 'Ethnic', 'Freeredu','Score'])
y = pd.DataFrame(df, columns=['Teacher'])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 10)

Unnamed: 0,Score
Score,1.0


# KNN Classifier:

In [6]:
knn = KNeighborsClassifier(n_neighbors=6)

knn.fit(X_train,y_train.values.ravel())
yPred = knn.predict(X_test)

MSE = mean_squared_error(y_true=y_test,y_pred=yPred)

score = knn.score(X_test,y_test.values.ravel())

print("The MSE for this model is {} and the accuracy is {}".format(MSE,score))

The MSE for this model is 0.8769230769230769 and the accuracy is 0.5846153846153846


### Take the KNN classifer and apply grid search to find the best params:

In [8]:
n = np.arange(1, 20, 1)
p = np.arange(1,3,1)

params = { 'n_neighbors' : n, 'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'), 'p': p  }# The param we want to figure out using the grid search
kNN = KNeighborsClassifier()
kNN_gscv = GridSearchCV(kNN, params, cv=5)# Grid search with five fold cross validation
#fit model to data
kNN_gscv.fit(X_train, y_train.values.ravel())
print(kNN_gscv.best_params_)

{'algorithm': 'brute', 'n_neighbors': 11, 'p': 2}


### Re Train the KNN clssifier with these new params:


In [15]:
knn = KNeighborsClassifier(n_neighbors=11,algorithm='brute',p=2)

knn.fit(X_train,y_train.values.ravel())
yPred = knn.predict(X_test)

MSE = mean_squared_error(y_true=y_test,y_pred=yPred)

score = knn.score(X_test,y_test.values.ravel())

print("The MSE for this model is {} and the accuracy is {}".format(MSE,score))

The MSE for this model is 0.8769230769230769 and the accuracy is 0.49230769230769234


# Random Forest Classifier

In [11]:
rfc = RandomForestClassifier(max_depth=9,n_estimators=100,ccp_alpha=0.002)
rfc.fit(X_train,y_train.values.ravel())

score = rfc.score(X_test,y_test.values.ravel())

score


0.49230769230769234

## Accuracy is extremely low. Use grid search again to find the best values:

In [13]:
depth = np.arange(1,50,2)
est = np.arange(10,200,10)
ccp_alpha = 0.001*np.arange(0,10,2)
rfc = RandomForestClassifier()

params = {'max_depth' : depth, 'n_estimators' : est, 'ccp_alpha' : ccp_alpha}
rfc_gscv = GridSearchCV(rfc, params, cv=5)# Grid search with five fold cross validation
#fit model to data
rfc_gscv.fit(X_train, y_train.values.ravel())
print(rfc_gscv.best_params_)

{'ccp_alpha': 0.0, 'max_depth': 1, 'n_estimators': 100}


## Retrain using the new best params

In [14]:
rfc = RandomForestClassifier(max_depth=1,ccp_alpha=0,n_estimators=100)

rfc.fit(X_train,y_train.values.ravel())

score = rfc.score(X_test,y_test.values.ravel())

score

0.49230769230769234