In [1]:
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, plot_roc_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sns
import csv
import numpy as np

In [2]:
with open('train.csv', 'r') as f:
    train = pd.read_csv(f)
    
with open('test.csv', 'r') as f:
    test = pd.read_csv(f)

In [3]:
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train.shape

(42000, 785)

In [5]:
train['Sum'] = train.sum(axis=1)
train.Sum = pd.qcut(train.Sum, 5, labels=range(5))
train.Sum = train.Sum.astype('int')


train['Mean'] = train.mean(axis=1)
train.Mean = pd.qcut(train.Mean, 10, labels=range(10))
train.Mean = train.Mean.astype('int')
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,Sum,Mean
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,9
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,9


In [6]:
test['Sum'] = test.sum(axis=1)
test.Sum = pd.qcut(test.Sum, 5, labels=range(5))
test.Sum = test.Sum.astype('int')


test['Mean'] = test.mean(axis=1)
test.Mean = pd.qcut(test.Mean, 10, labels=range(10))
test.Mean = test.Mean.astype('int')
test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,Sum,Mean
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,9
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,9
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,4


In [7]:
train.label.unique()

array([1, 0, 4, 7, 3, 5, 8, 9, 2, 6], dtype=int64)

In [8]:
X = train.iloc[:,1:].copy()
y = train.label.copy()
X

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,Sum,Mean
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,9
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,6
41996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,7
41998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,5


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=7)

# XGBClassifier

In [11]:
xgb = XGBClassifier(objective='mutli:softmax', verbose=3, n_estimators=150)
xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='merror', early_stopping_rounds=5)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-merror:0.16829
[1]	validation_0-merror:0.11686
[2]	validation_0-merror:0.10057
[3]	validation_0-merror:0.08990
[4]	validation_0-merror:0.08352
[5]	validation_0-merror:0.07876
[6]	validation_0-merror:0.07467
[7]	validation_0-merror:0.06990
[8]	validation_0-merror:0.06733
[9]	validation_0-merror:0.06448
[10]	validation_0-merror:0.06171
[11]	validation_0-merror:0.05962
[12]	validation_0-merror:0.05752
[13]	validation_0-merror:0.05533


KeyboardInterrupt: 

In [None]:
xgb.score(X_test, y_test)

In [None]:
xgb_pred = xgb.predict(X_test)
xgb_pred[:10]

In [None]:
print(classification_report(y_test, xgb_pred))

In [None]:
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot()
plot_confusion_matrix(xgb, X_test, y_test, cmap='plasma', ax=ax)
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
one_xgb = OneVsRestClassifier(XGBClassifier(n_estimators=150, objective='binary:logistic', verbose=3), n_jobs=-1)
one_xgb.fit(X_train, y_train)

In [None]:
one_xgb.score(X_test, y_test)

In [None]:
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot()
for x in range(len(one_xgb.estimators_)):
    plot_roc_curve(one_xgb.estimators_[x], X_test, y_test, name=x, ax=ax)
plt.legend(loc='best')
plt.tight_layout()

# RandomForestClassifier 

In [None]:
one_rf = OneVsRestClassifier(ExtraTreesClassifier(n_estimators=600, verbose=1, n_jobs=-1), n_jobs=-1)
one_rf.fit(X_train, y_train)

In [None]:
one_rf.score(X_test, y_test)

In [None]:
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot()
for x in range(len(one_rf.estimators_)):
    plot_roc_curve(one_rf.estimators_[x], X_test, y_test, name=x, ax=ax)
plt.legend(loc='best')
plt.tight_layout()

# LogisticRegression 

In [None]:
one_logreg = OneVsRestClassifier(LogisticRegression(n_jobs=-1, verbose=2, warm_start=True, max_iter=300), n_jobs=-1)
one_logreg.fit(X_train, y_train)

In [None]:
one_logreg.score(X_test, y_test)

In [None]:
print(classification_report(y_test, one_logreg.predict(X_test)))

In [None]:
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot()
for x in range(len(one_logreg.estimators_)):
    plot_roc_curve(one_logreg.estimators_[x], X_test, y_test, name=x, ax=ax)
plt.legend(loc='best')
plt.tight_layout()

# Result 

In [None]:
clf1 = XGBClassifier(objective='multi:softmax', n_estimators=150, num_class=10, n_jobs=-1, tree_method='gpu_hist', 
                    predictor='gpu_predictor', eval_metric='merror').fit(X_train, y_train)
print(clf1.score(X_test, y_test))

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [None]:
clf1 = XGBClassifier(objective='multi:softmax', verbose=3, n_estimators=150, num_class=10, n_jobs=-1, tree_method='gpu_exact', 
                    predictor='gpu_predictor')
clf2 =ExtraTreesClassifier(n_estimators=600, verbose=1, n_jobs=-1)
clf3 = LogisticRegression(n_jobs=-1, verbose=2, warm_start=True, max_iter=300)

vote = VotingClassifier(estimators=[('clf1', clf1), ('clf2', clf2), ('clf3', clf3)], voting='hard', n_jobs=-1, verbose=True)
vote.fit(X_train, y_train)

In [None]:
vote.score(X_test, y_test)

In [None]:
print(classification_report(y_test, vote.predict(X_test)))

In [None]:
rows = list(zip(range(1, len(test) +1), vote.predict(test)))
rows[:5]

In [None]:
with open('submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ImageId', 'Label'])
    writer.writerows(rows)