In [1]:
# Load libraries
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
# Single-column imputers: the column mean for numeric features and the most
# frequent value for categorical ones. Both treat NaN as the missing marker.
impNumeric = SimpleImputer(strategy='mean', missing_values=np.nan)
impCategorical = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
# Iterative imputer used for the training predictors; random_state is fixed
# so the imputed values are repeatable from run to run.
impIterative = IterativeImputer(
    missing_values=np.nan,
    sample_posterior=True,
    imputation_order='random',
    random_state=1,
)

from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

In [None]:
import pandas as pd

In [None]:
# Relative path of the source CSV (resolved against the working directory).
url = '124 variable K-8 education dataset.csv'

In [None]:
from pandas import read_csv

In [None]:
dataset=read_csv(url)

In [None]:
dataset

In [None]:
dataset = dataset[["P4CMPEDU","P5CMPEDU","P6CMPEDU","C2R4MPF","C4R4MPF","W1INCCAT","W5INCCAT","W8INCCAT","WKMOMED","WKDADED","P1HMEMP","P1HDEMP","P7HMEMP","P7HDEMP","C7R4MPF"]]

In [None]:
dataset.head()

In [None]:
import numpy as np

In [None]:
#Drop NA's for Y-variable
cleandf=dataset.dropna(subset=["C7R4MPF"])

In [None]:
print(cleandf.groupby('C7R4MPF').size())

In [None]:
dataset= cleandf[cleandf["C7R4MPF"]>-9]
print(dataset.groupby('C7R4MPF').size())

In [None]:
dataset.shape

In [None]:
# Group some categories together: bin the target into three labelled bands.
# Series.between is inclusive on both ends, i.e. (x >= lo) & (x <= hi).
target = dataset['C7R4MPF']
conditions = [
    target.between(3.0, 5.0),
    target.between(6.0, 7.0),
    target.between(8.0, 9.0),
]
choices = ['C - Low', 'B - Middle', 'A - High']
# An explicit *string* default is required: with the implicit integer default
# (0), recent NumPy releases refuse to promote int and str to a common dtype
# and np.select raises TypeError. '0' is the label older NumPy produced for
# rows that match none of the bands, so the resulting categories are unchanged.
# assign() returns a new frame instead of writing a column into what may be a
# slice of another frame (avoids SettingWithCopyWarning).
dataset = dataset.assign(Score=np.select(conditions, choices, default='0'))
dataset

In [None]:
#remove the C7R4MPF column, now that we have the 'Score' category 
dataset=dataset.drop('C7R4MPF',1)
dataset.head()

In [None]:
print(dataset.groupby('Score').size())

In [None]:
dataset2=dataset[dataset['Score']=='C - Low'].copy()

In [None]:
print(dataset2.groupby('Score').size())

In [None]:
dataset2=dataset2.append(dataset[dataset['Score']=='A - High'].sample(n=756))

In [None]:
dataset2

In [None]:
print(dataset2.groupby('Score').size())

In [None]:
dataset2=dataset2.append(dataset[dataset['Score']=='B - Middle'].sample(n=756))

In [None]:
print(dataset2.groupby('Score').size())

In [None]:
len(dataset2)

In [None]:
# Use 75% of the data for training
0.75*2268

In [None]:
# Creating our sample of random lines
traindf=dataset2.sample(n=1701)
#Listing what rows our sample is using
traindf.index

In [None]:
# Printing training dataset which imputation will be performed on
traindf

In [None]:
# Defining our validation data as rows in dataset2 that are not in our training data
validationdf=dataset2.drop(traindf.index)
len(validationdf)

In [None]:
# taking out all NA's from validation dataset
validationdf=validationdf.dropna()
len(validationdf)

In [None]:
# Do imputation on ALL columns of TRAIN 
X = traindf[["P4CMPEDU","P5CMPEDU","P6CMPEDU","C2R4MPF","C4R4MPF","W1INCCAT","W5INCCAT","W8INCCAT","WKMOMED","WKDADED","P1HMEMP","P1HDEMP","P7HMEMP","P7HDEMP"]]

In [None]:
X

In [None]:
data_imputed=pd.DataFrame(impIterative.fit_transform(X),columns=["P4CMPEDU","P5CMPEDU","P6CMPEDU","C2R4MPF","C4R4MPF","W1INCCAT","W5INCCAT","W8INCCAT","WKMOMED","WKDADED","P1HMEMP","P1HDEMP","P7HMEMP","P7HDEMP"])

In [None]:
data_imputed

In [None]:
len(data_imputed)

In [None]:
#Create a new column in data_imputed that doesn't exist yet to tack on the y-variable
data_imputed["Score"]=traindf['Score'].values

In [None]:
data_imputed

In [None]:
data_imputed.shape

In [None]:
print(data_imputed.groupby('Score').size())

In [None]:
# Split the imputed training frame into predictors (first 14 columns) and the
# target (column 14).
array = data_imputed.to_numpy()
X_train, Y_train = array[:, :14], array[:, 14]

In [None]:
# Spot Check Algorithms: (short name, unfitted estimator) pairs to compare
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

In [None]:
# Evaluate each model in turn with 7-fold stratified cross-validation.
# The splitter is seeded with an integer, so one instance yields the same
# folds for every model.
results = []
names = []
kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=1)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, scoring='accuracy', cv=kfold)
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy and its standard deviation across the folds
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [None]:
#Set validation set
X_validation=validationdf.iloc[:,0:14]
Y_validation=validationdf.iloc[:,14]

In [None]:
validationdf

In [None]:
# Fit the SVM classifier on the training data, then make predictions on the
# validation dataset. fit() returns the estimator itself, so it can be chained.
model = SVC(gamma='auto').fit(X_train, Y_train)
predictions = model.predict(X_validation)

In [None]:
# Evaluate predictions
print(accuracy_score(Y_validation, predictions))
# Pass the already-fitted model: calling model.fit(...) again inside this call
# only repeated the training work done when the predictions were made.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions the equivalent is ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(model, X_validation, Y_validation, cmap=plt.cm.Blues)
print(classification_report(Y_validation, predictions))