# ATDA 5340 - Final - Part 7 - Evaluate and Compare models - Denis Shilkin
Dataset: adult_salary.csv. I removed the leading space character from the cells using Excel.

In [None]:
# import libraries 
import pandas as pd
import numpy as np

from pandas import Series

from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.metrics import classification_report

from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

print("Diagnostic: Libraries loaded")

In [None]:
# Load the dataset.
# skipinitialspace=True strips blanks that follow a delimiter, so the file
# loads with clean string values even when the leading spaces were not
# removed by hand beforehand; it has no effect on an already-cleaned file.
filename = "adult_salary.csv"
df = pd.read_csv(filename, index_col=False, skipinitialspace=True)

# Print shape (rows/records, columns/variables)
print(df.shape)

In [None]:
# Preview the first five rows of the initial data set (head() defaults to 5 rows)
print(df.head())

In [None]:
# Remove column Fnlwgt by keeping every other column explicitly, then print
# the shape to verify the column is gone.
keep_columns = [
    "Age", "Emp_type", "Education", "Education_num", "Marital", "Occupation",
    "Relationship", "Race", "Sex", "Capital_gain", "Capital_loss",
    "weekly_hours", "Country", "Income",
]
# .copy() makes df2 an independent frame. Later cells assign into df2, and
# writing into a bare selection of df is ambiguous in pandas
# (SettingWithCopyWarning: the write may or may not reach the data you expect).
df2 = df[keep_columns].copy()
print(df2.shape)

In [None]:
# Preview the first five rows of the reduced dataset (head() defaults to 5 rows)
print(df2.head())

In [None]:
# Count missing values per column (isna is the same check as isnull)
print(df2.isna().sum())

In [None]:
# Income needs recoding: show how many rows carry each distinct Income value
income_counts = df2.groupby("Income").size()
print(income_counts)

In [None]:
# Recode Income into a binary target: 0 - at most $50k, 1 - more than $50k.
# Both spellings handled by this notebook (with and without a trailing
# period) are mapped in a single pass.
income_codes = {"<=50K.": 0, ">50K.": 1, "<=50K": 0, ">50K": 1}

# The explicit cast guarantees an integer column instead of relying on
# replace() to downcast an object column implicitly (deprecated behaviour in
# recent pandas). It also fails here, rather than later at model fitting, if
# an unexpected label is left unmapped.
df2["Income"] = df2["Income"].replace(income_codes).astype(int)

# Print Income split after recoding
print(df2.groupby('Income').size())

In [None]:
# Verify the dtype of every column before encoding
column_types = df2.dtypes
print(column_types)

In [None]:
# Convert string columns to numeric values.

# Sex gets an explicit mapping; any value other than these two becomes NaN.
sex_codes = {"Male": 0, "Female": 1}
df2["Sex"] = df2["Sex"].map(sex_codes)

# The remaining text columns are replaced by their pandas category codes.
# One loop replaces seven copies of the same two statements and avoids
# attribute-style column assignment (df2.Col = ...).
categorical_columns = [
    "Emp_type", "Education", "Marital", "Occupation",
    "Relationship", "Race", "Country",
]
for column in categorical_columns:
    df2[column] = df2[column].astype("category").cat.codes

# Check types after encoding categorical values
print(df2.dtypes)

In [None]:
# Summary statistics for every numeric column of the encoded dataset
summary = df2.describe()
print(summary)

In [None]:
# Split the data into train and test arrays.
array = df2.values

# Columns 0-12 are the predictors; column 13 (Income, the last selected
# column) is the target.
X = array[:, 0:13]
Y = array[:, 13]

test_size = 0.33
seed = 7

# test_size has to be passed explicitly: without it train_test_split falls
# back to its default hold-out fraction and the value defined above is ignored.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

print("Diagnostic: dataset split into train and test arrays")

In [None]:
# Supervised ML: decision tree classifier trained on the training split.
# random_state reuses the notebook seed so the fitted tree, and therefore the
# metrics reported below, are reproducible from run to run.
model1 = DecisionTreeClassifier(random_state=seed)
model1.fit(X_train, Y_train)

In [None]:
# Predict the held-out test rows and print the classification report
predict = model1.predict(X_test)
report = classification_report(y_true=Y_test, y_pred=predict)
print(report)

In [None]:
# Accuracy on the test split, printed as a percentage with three decimals
score = model1.score(X_test, Y_test)
print("{:.3f}%".format(score * 100.0))

In [None]:
# Evaluate the decision tree with 10-fold cross-validation on the full data
num_folds = 10
seed = 7

# shuffle=True is required for random_state to have any effect; current
# scikit-learn versions raise ValueError when random_state is set while
# shuffle is left at its default of False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results = cross_val_score(model1, X, Y, cv=kfold, scoring='accuracy')

# Mean accuracy across the folds, with the standard deviation in parentheses
print("%.3f (%.3f)" % (results.mean(), results.std()))

In [None]:
# Unsupervised ML: k-means with two clusters fitted on all feature rows.
# random_state reuses the notebook seed so cluster assignments (and which
# cluster gets index 0 or 1) are reproducible; n_init is set explicitly
# because its default differs between scikit-learn versions.
# NOTE(review): features are not scaled, so columns with large numeric ranges
# will dominate the distance computation - confirm this is intended.
model2 = KMeans(n_clusters=2, n_init=10, random_state=seed)
model2.fit(X)

In [None]:
# Centers of the clusters found by model2: one row per cluster, one value per
# feature column of X. Kept in `centroids` for the plot further down.
centroids = model2.cluster_centers_
print(centroids)

In [None]:
# Preview of the cluster assignments: the label of every 10th row of X
cluster_labels = model2.labels_[::10]
print(cluster_labels)

In [None]:
# Full set of cluster assignments: one label per row of X used in fit()
cluster_labels = model2.labels_
print(cluster_labels)

In [None]:
# Scatter the first two feature columns (Age and the encoded Emp_type),
# coloured by the cluster each row was assigned to.
pyplot.scatter(X[:, 0], X[:, 1], c=model2.labels_, cmap='rainbow')

# Mark every centroid with a large black cross in a single call. The 'kx'
# format string already means "black, x marker", so no separate color
# argument is passed (giving both makes matplotlib warn about a redundant
# colour). Plotting the whole centroid array also works for any cluster count.
lines = pyplot.plot(centroids[:, 0], centroids[:, 1], 'kx')
pyplot.setp(lines, ms=15.0, mew=2.0)

pyplot.xlabel("Age")
pyplot.ylabel("Emp_type (encoded)")

pyplot.show()

In [None]:
#prediction: set 1
# The 13 values follow the column order of X: Age, Emp_type, Education,
# Education_num, Marital, Occupation, Relationship, Race, Sex, Capital_gain,
# Capital_loss, weekly_hours, Country. Text fields are given as the integer
# codes produced by the encoding cell (Sex: 0 = Male, 1 = Female).
# The decision tree returns the predicted Income class for this record.
model1.predict([[39,1,0,0,1,1,1,0,0,10000,500,60,0]])

In [None]:
# Same record as prediction set 1, assigned to its nearest k-means cluster.
# The result is a cluster index, not an Income class: cluster numbering is
# arbitrary and need not line up with the 0/1 Income coding.
model2.predict([[39,1,0,0,1,1,1,0,0,10000,500,60,0]])

In [None]:
#prediction: set 2
# Same feature order as X: Age, Emp_type, Education, Education_num, Marital,
# Occupation, Relationship, Race, Sex, Capital_gain, Capital_loss,
# weekly_hours, Country. Compared with set 1 this record has a lower capital
# gain and loss and more weekly hours.
# The decision tree returns the predicted Income class for this record.
model1.predict([[40,1,0,0,1,1,1,0,0,1000,50,80,0]])

In [None]:
# Same record as prediction set 2, assigned to its nearest k-means cluster.
# The result is a cluster index, not an Income class: cluster numbering is
# arbitrary and need not line up with the 0/1 Income coding.
model2.predict([[40,1,0,0,1,1,1,0,0,1000,50,80,0]])