In [29]:
import pandas as pd
import numpy as np
import seaborn as sb

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score

In [30]:
# Core dataset
glass = pd.read_csv('datatset/glass.csv', header=None)
iris = pd.read_csv('datatset/iris.csv', header=None)
wine = pd.read_csv('datatset/wine.csv', header=None)
pima = pd.read_csv('datatset/pima.csv', header=None)
tictactoe = pd.read_csv('datatset/tic-tac-toe.csv', header=None)

# Other dataset
caesarian = pd.read_csv('datatset/caesarian.csv', header=None)
car = pd.read_csv('datatset/car.csv', header=None)

### Data Preprocessing

In [31]:
iris.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [32]:
columns_iris = ['sepal length ',
                'sepal width ',
                'petal length ',
                'petal width ',
                'class']
iris.columns = columns_iris
iris.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [33]:
wine.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [34]:
columns_wine = ['Class',
                'Alcohol',
                'Malic acid',
                'Ash',
                'Alcalinity of ash',
                'Magnesium',
                'Total phenols',
                'Flavanoids',
                'Nonflavanoid phenols',
                'Proanthocyanins',
                'Color intensity',
                'Hue',
                'OD280/OD315 of diluted wines',
                'Proline']

wine.columns = columns_wine
wine.head()

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [35]:
caesarian.head()

Unnamed: 0,0,1,2,3,4,5
0,22,1,0,2,0,0
1,26,2,0,1,0,1
2,26,2,1,1,0,0
3,28,1,0,2,0,0
4,22,2,0,1,0,1


In [36]:
columns_caesarian =['Age',
                    'Delivery number',
                    'Delivery time',
                    'Blood of Pressure',
                    'Heart Problem',
                    'Caesarian?']

caesarian.columns = columns_caesarian
caesarian.head()

Unnamed: 0,Age,Delivery number,Delivery time,Blood of Pressure,Heart Problem,Caesarian?
0,22,1,0,2,0,0
1,26,2,0,1,0,1
2,26,2,1,1,0,0
3,28,1,0,2,0,0
4,22,2,0,1,0,1


In [37]:
pima.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [38]:
columns_pima = ['Number of times pregnant',
               'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
               'Diastolic blood pressure',
               'Triceps skin fold thickness',
               '2-Hour serum insulin',
               'Body mass index',
               'Diabetes pedigree function',
               'Age',
               'class']

pima.columns = columns_pima
pima.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Diabetes pedigree function,Age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [39]:
car

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [40]:
# Discretization
car = car.replace({'med':1, 'low':2, 'vhigh':3, 'high':4,
                  '3':1, '2':2, '4':3, '5more':4,
                  'more':1,
                  'small':2, 'big':3})

In [41]:
car

Unnamed: 0,0,1,2,3,4,5,6
0,3,3,2,2,2,2,unacc
1,3,3,2,2,2,1,unacc
2,3,3,2,2,2,4,unacc
3,3,3,2,2,1,2,unacc
4,3,3,2,2,1,1,unacc
...,...,...,...,...,...,...,...
1723,2,2,4,1,1,1,good
1724,2,2,4,1,1,4,vgood
1725,2,2,4,1,3,2,unacc
1726,2,2,4,1,3,1,good


In [42]:
columns_car = ['buying','maint','doors','persons','lug_boot','safety','class']

car.columns = columns_car
car

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,2,2,2,2,unacc
1,3,3,2,2,2,1,unacc
2,3,3,2,2,2,4,unacc
3,3,3,2,2,1,2,unacc
4,3,3,2,2,1,1,unacc
...,...,...,...,...,...,...,...
1723,2,2,4,1,1,1,good
1724,2,2,4,1,1,4,vgood
1725,2,2,4,1,3,2,unacc
1726,2,2,4,1,3,1,good


In [43]:
car.dtypes

buying       int64
maint        int64
doors        int64
persons      int64
lug_boot     int64
safety       int64
class       object
dtype: object

In [44]:
tictactoe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [45]:
# to predict religion .iloc[:,7]
columns_tictactoe = ['top left','top middle','top right',
                     'middle left','middle middle','middle right',
                    'bottom left', 'bottom middle', 'bottom right', 
                    'class']

tictactoe.columns = columns_tictactoe
tictactoe['class'].replace({"positive":1, "negative":0}, inplace=True)
tictactoe.replace({"b":2,"x":1, "o":0}, inplace=True)

tictactoe.head()

Unnamed: 0,top left,top middle,top right,middle left,middle middle,middle right,bottom left,bottom middle,bottom right,class
0,1,1,1,1,0,0,1,0,0,1
1,1,1,1,1,0,0,0,1,0,1
2,1,1,1,1,0,0,0,0,1,1
3,1,1,1,1,0,0,0,2,2,1
4,1,1,1,1,0,0,2,0,2,1


In [46]:
glass.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [47]:
columns_glass = ['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type of glass']

glass.columns = columns_glass
glass

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


### Preparing the input data to feed into classifier

In [48]:
# 5 core datatsets

x_pima = pima.iloc[:, 0:8]  
y_pima = pima.iloc[:, 8]

x_iris = iris.iloc[:, 0:4]
y_iris = iris.iloc[:, 4]

x_wine = wine.iloc[:, 1:13]
y_wine = wine.iloc[:, 0]

x_glass = glass.iloc[:, 0:9]
y_glass = glass.iloc[:, 9]

x_tictactoe = tictactoe.iloc[:, 0:9]
y_tictactoe = tictactoe.iloc[:, 9]

# other datasets

x_caesarian = caesarian.iloc[:, 0:5]
y_caesarian = caesarian.iloc[:, 5]

x_car = car.iloc[:, 0:6]
y_car = car.iloc[:, 6]

In [49]:
dt_accuracy = []
rf_accuracy = []
svm_accuracy =[]

dt_fscore = []
rf_fscore = []
svm_fscore = []

In [50]:
X = [x_glass, x_wine, x_iris, x_pima, x_tictactoe, x_caesarian, x_car]
Y = [y_glass, y_wine, y_iris, y_pima, y_tictactoe, y_caesarian, y_car]

### Decision Tree

In [51]:
# Create the Decisio Tree
dectree = DecisionTreeClassifier()

for i in range(len(X)):
    
    dectree.fit(X[i], Y[i])

    dt_accuracy.append(np.mean(cross_val_score(dectree, X[i], Y[i], cv=10, scoring='accuracy')))
    dt_fscore.append(np.mean(cross_val_score(dectree, X[i], Y[i], cv=10, scoring='f1_weighted')))



### Random Forest

In [52]:
# Create the Random Forest object
rforest = RandomForestClassifier()

for i in range(len(X)):
    
    rforest.fit(X[i], Y[i])

    rf_accuracy.append(np.mean(cross_val_score(rforest, X[i], Y[i], cv=10, scoring='accuracy')))
    rf_fscore.append(np.mean(cross_val_score(rforest, X[i], Y[i], cv=10, scoring='f1_weighted')))  



### Support Vector Machine

In [53]:
SVM = svm.SVC()

for i in range(len(X)):
    
    SVM.fit(X[i], Y[i])

    svm_accuracy.append(np.mean(cross_val_score(SVM, X[i], Y[i], cv=10, scoring='accuracy')))
    svm_fscore.append(np.mean(cross_val_score(SVM, X[i], Y[i], cv=10, scoring='f1_weighted')))  



In [54]:
dataset = ['glass', 'wine', 'iris', 'pima', 'tic-tac-toe', 'caesarian', 'car']

final = pd.DataFrame(list(zip(dataset, dt_accuracy, dt_fscore, rf_accuracy, rf_fscore, svm_accuracy, svm_fscore)), 
                     columns=['Dataset', 'DT accuracy', 'DT f_score', 'RF accuracy', 'RF f_score', 'SVM accuracy', 'SVM f_score'])
final

Unnamed: 0,Dataset,DT accuracy,DT f_score,RF accuracy,RF f_score,SVM accuracy,SVM f_score
0,glass,0.630519,0.569297,0.72381,0.717128,0.354978,0.186279
1,wine,0.911111,0.908121,0.972222,0.972006,0.551634,0.465381
2,iris,0.96,0.959731,0.96,0.959731,0.973333,0.973064
3,pima,0.706972,0.709474,0.760441,0.768451,0.757861,0.740124
4,tic-tac-toe,0.855022,0.849748,0.908158,0.914558,0.873739,0.87152
5,caesarian,0.5625,0.551479,0.5625,0.512633,0.575,0.421795
6,car,0.897611,0.902672,0.869236,0.867818,0.710626,0.679711
