In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# scikit-learn pieces used below. The splitting / cross-validation helpers live in
# sklearn.model_selection; the former sklearn.cross_validation module was removed
# in scikit-learn 0.20.
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Location of the raw CSV, kept in one named constant so it is easy to repoint.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot load the data on another machine until it is changed.
DATA_PATH = 'c://Users/amras/Desktop/Dataset/TurkeyStudent/student.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

## Preprocessing

In [None]:
# Bar chart of row counts per 'class' value, split into one bar per 'instr' value.
fig, ax = plt.subplots(figsize=(9, 7), dpi=90)
ax.grid()
ax.set_title("Class")
sns.countplot(x='class', hue='instr', data=data, palette='viridis', ax=ax)

In [None]:
# Same five-row preview as the earlier cell; nothing in between reassigns `data`.
data.head()

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for a quick sanity check.
data.describe()

## Create Model

In [None]:
# Show the column labels; the feature/target selection in the next cell is written against these names.
data.columns

In [None]:
# Feature matrix: 'class', 'nb.repeat', 'attendance', 'difficulty' plus the
# 28 questionnaire columns Q1..Q28 (generated instead of typed out by hand).
question_cols = ['Q%d' % i for i in range(1, 29)]
feature_cols = ['class', 'nb.repeat', 'attendance', 'difficulty'] + question_cols
x = data[feature_cols].values

# Target: the 'instr' column as a 1-D array. Selecting with a list
# (data[['instr']]) would give an (n, 1) column vector, which scikit-learn
# estimators accept only with a DataConversionWarning.
y = data['instr'].values

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
# This split uses the unscaled features and is recomputed after scaling further down.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

In [None]:
# Containers filled in by later cells: (label, estimator) pairs, the labels in
# evaluation order, and the per-fold accuracy arrays.
models, names, accuracy = [], [], []

In [None]:
# Candidate classifiers as (label, estimator) pairs.
# The list is built with a single assignment instead of repeated append() calls,
# so re-running this cell replaces the registry rather than duplicating every
# entry. The tree-based estimators are seeded so their scores are reproducible
# from run to run.
models = [
    ('knn', KNeighborsClassifier(n_neighbors=4)),
    ('NV', GaussianNB()),
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('Tree', DecisionTreeClassifier(random_state=42)),
    ('SVM', SVC()),
    ('EXTRA', ExtraTreesClassifier(random_state=42)),
    ('LogisticRegression', LogisticRegression()),
]

In [None]:
# Display the registered (label, estimator) pairs to confirm what will be evaluated.
models

In [None]:
# KFold comes from sklearn.model_selection: the old sklearn.cross_validation
# module was removed in scikit-learn 0.20, and its KFold took the sample count
# as first argument rather than the number of splits.
from sklearn.model_selection import KFold

In [None]:
# Standardiser with its default behaviour spelled out: centre each feature
# to zero mean and scale it to unit variance.
sc = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Learn per-feature mean/std from x, then standardise x with them.
# NOTE(review): the scaler is fitted on every row before the train/test split
# below, so test-row statistics leak into the scaling — consider fitting on the
# training split only.
x = sc.fit(x).transform(x)

In [None]:
# Re-create the 67/33 split, now on the standardised features; the same seed
# as before keeps the row assignment identical.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

In [None]:
# 10-fold cross-validated accuracy of every registered model on the training split.

# Start from empty result lists so re-running this cell cannot stack a second
# set of scores/labels on top of the first (which would break the boxplot).
accuracy = []
names = []

# One splitter shared by all models so each is scored on identical folds.
# n_splits is passed by keyword and shuffle=True is required for random_state
# to be accepted; KFold(10, random_state=10) raises ValueError on current
# scikit-learn and meant "10 samples, 3 folds" under the removed API.
kfold = KFold(n_splits=10, shuffle=True, random_state=10)

# Estimators expect a 1-D target; flatten in case y_train is an (n, 1) column.
y_train_1d = np.ravel(y_train)

for name, model in models:
    cross_val = cross_val_score(model, x_train, y_train_1d, cv=kfold, scoring='accuracy')
    accuracy.append(cross_val)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cross_val.mean(), cross_val.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies, one box per model, labelled in evaluation order.
fig, ax = plt.subplots(figsize=(13, 7))
fig.suptitle('Algorithm Comparison')
ax.boxplot(accuracy)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Reduced feature set: only the course-level columns, without the Q1..Q28 answers.
# 'instr' is the prediction target and must not appear among the features —
# including it lets every model read the answer straight from its input.
x_valid = data[['class', 'nb.repeat', 'attendance', 'difficulty']].values
# Target as a 1-D array (a single-column frame would yield an (n, 1) column vector).
y_valid = data['instr'].values

In [None]:
# 67/33 train/test split of the reduced feature set, seeded for repeatability.
x_tr, x_te, y_tr, y_te = train_test_split(
    x_valid, y_valid, test_size=0.33, random_state=42
)

In [None]:
# 10-fold cross-validated accuracy of every registered model on the reduced feature set.
names_val = []
result = []

# One splitter shared by all models so each is scored on identical folds.
# n_splits is passed by keyword and shuffle=True is required for random_state
# to be accepted; KFold(10, random_state=7) raises ValueError on current
# scikit-learn and meant "10 samples, 3 folds" under the removed API.
k = KFold(n_splits=10, shuffle=True, random_state=7)

# Estimators expect a 1-D target; flatten in case y_tr is an (n, 1) column.
y_tr_1d = np.ravel(y_tr)

for name, model in models:
    score = cross_val_score(model, x_tr, y_tr_1d, scoring='accuracy', cv=k)
    result.append(score)
    names_val.append(name)
    # label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, score.mean(), score.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies from the reduced-feature experiment.
fig, ax = plt.subplots(figsize=(13, 7))
fig.suptitle('Algorithm Comparison')
ax.boxplot(result)
# Label with names_val, the list filled alongside `result`; `names` belongs to
# the earlier experiment and only matches when that cell ran exactly once.
ax.set_xticklabels(names_val)
plt.show()

In [None]:
# Decision tree for the final evaluation. Seeded because the tree permutes
# features randomly at each split, so an unseeded fit can differ between runs.
tree = DecisionTreeClassifier(random_state=42)

In [None]:
# Train the tree on the training part of the reduced feature set.
tree.fit(X=x_tr, y=y_tr)

In [None]:
# Predict a label for every held-out row.
y_pred = tree.predict(X=x_te)

In [None]:
# Evaluate the fitted tree on the held-out rows: confusion matrix,
# per-class precision/recall/F1, and overall accuracy.
test_accuracy = accuracy_score(y_te, y_pred)

print("Confusion Matrix :\n")
print(confusion_matrix(y_te, y_pred))
print()
# Heading spelling corrected (was "Classifiction").
print("Classification Report:\n")
print(classification_report(y_te, y_pred))
print()
print("Accuracy :{0:.2f}".format(test_accuracy))