In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [2]:
X_train = pd.read_csv('./Engineered_csv/Engineered_X_train.csv')
X_test = pd.read_csv('./Engineered_csv/Engineered_X_test.csv')
y_train = pd.read_csv('./Engineered_csv/X_train_label.csv')
y_test = pd.read_csv('./Engineered_csv/X_test_label.csv')

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [4]:
def model_performance(trained_model, testing_features, testing_labels, metrics = []):
    '''
    A functions that applies multiple scoring metrics given by the user on the testing set
    
    input:
            trained_model (sklearn machine learning algorithm api): An already trained machine learning algorithm
            testing_features (DataFrame): (X_test) 
            testing_label (DataFrame): (y_test)
            metrics (list of sklearn metric api): a list which contains the desired scoring metrics
    
    output:
            scores (dictionary): a dictionary which contains the scoring method as a key and the score as value
            
    '''
    scores = {}
    num_metrics = 1
    predictions = trained_model.predict(testing_features.values)
    for metric in metrics:
        
        if str(metric) == str(f1_score):
            score = f1_score(testing_labels.values.ravel(), predictions, average='micro')
            scores[num_metrics] = score
            num_metrics += 1
        else:
            score = metric(testing_labels.values.ravel(), predictions)
            scores[num_metrics] = score
            num_metrics += 1
        
    return scores

In [5]:
rf = RandomForestClassifier(verbose=1)

In [6]:
rf.fit(X_train.values, y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   11.1s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=1, warm_start=False)

In [17]:
rf.score(X_train.values, y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.9s finished


1.0

In [7]:
rf_scores = model_performance(rf, X_test, y_test, [confusion_matrix, accuracy_score, f1_score])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [9]:
print(rf_scores[1])
print(rf_scores[2])
print(rf_scores[3])

[[408   0   0   0   0   0   0   0   0   0]
 [  0 471   0   0   0   0   0   0   0   0]
 [  0   0 420   0   0   0   0   0   0   0]
 [  0   0   0 506   0   0   0   0   0   0]
 [  0   0   0   0 397   0   0   0   0   0]
 [  0   0   0   0   0 339   0   0   0   0]
 [  0   0   0   0   0   0 402   0   0   0]
 [  0   0   0   0   0   0   0 438   0   0]
 [  0   0   0   0   0   0   0   0 403   0]
 [  0   0   0   0   0   0   0   0   0 416]]
1.0
1.0


In [18]:
rf.feature_importances_

array([0.40203811, 0.36979641, 0.05140329, 0.05207677, 0.01209002,
       0.04036192, 0.03328578, 0.02623787, 0.01270983])

In [11]:
svm = SVC(kernel='linear')

In [13]:
svm.fit(X_train.values, y_train.values.ravel())

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
svm.score(X_train.values, y_train.values.ravel())

0.9407142857142857

In [15]:
svm_scores = model_performance(svm, X_test, y_test, [confusion_matrix, accuracy_score, f1_score])

In [16]:
print(svm_scores[1])
print(svm_scores[2])
print(svm_scores[3])

[[396   0   1   0   0   0   0   0  11   0]
 [  0 467   0   0   3   0   0   1   0   0]
 [  6   0 398   0   0   0   0   0  16   0]
 [  0   0   0 455   0  28   5   0   0  18]
 [  0   2   0   0 382   0   0  13   0   0]
 [  0   0   0  17   0 299  14   0   0   9]
 [  0   0   0  12   0  13 376   0   0   1]
 [  0  14   0   0  19   0   0 405   0   0]
 [  4   0  16   0   0   0   0   0 383   0]
 [  0   0   0  10   0  18   0   0   0 388]]
0.9402380952380952
0.9402380952380952


In [20]:
svm.class_weight

In [89]:
data = pd.read_csv('test.csv')

In [90]:
data.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
data['avg_pixel_val'] = np.nan

In [92]:
for index in range(len(data)):
    img = data.iloc[[index], :-1].values.reshape(-1, 28, 28, 1)
    avg = np.average(img[0][:,:,0])/255.0
    data.loc[[index], 'avg_pixel_val'] = avg

In [93]:
data['avg_pixel_val'].head()

0    0.197099
1    0.199745
2    0.074380
3    0.081072
4    0.121519
Name: avg_pixel_val, dtype: float64

In [94]:
data['avg_pixel_used'] = np.nan

In [95]:
for i in range(len(data)):
    
    val = data.iloc[[i], :-2]
    counter = 0
        
    for col in val.columns:
    
        if val[col].unique() > 0:
                
            counter += 1
                
        else: 
            continue
        
    data.loc[[i], 'avg_pixel_used'] = (counter/784)

In [96]:
data.shape

(28000, 786)

In [97]:
data.iloc[:, :-2] = data.iloc[:, :-2]/255.0

In [98]:
pca = PCA(n_components=50)
components = pca.fit_transform(data.iloc[:, :-2])

In [99]:
for x in range(7):
    data['PCA'+str(x)] = components[:, [x]]

data.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel783,avg_pixel_val,avg_pixel_used,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.197099,0.274235,3.305606,0.567347,2.914199,2.637555,1.514525,1.919663,0.806149
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.199745,0.271684,4.481195,0.285946,0.990474,-0.248639,-3.668353,1.751423,2.882699
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.07438,0.15051,-1.953681,-0.527386,0.681696,-0.30423,-0.663703,0.531303,-1.291072
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.081072,0.139031,-0.917955,-2.397433,1.865791,1.67097,-0.790602,0.624107,-0.306629
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.121519,0.191327,-0.481039,2.779916,0.008171,0.600696,1.144494,0.181592,0.270334


In [100]:
data.drop(data.columns[:784], axis=1, inplace=True)

In [101]:
data

Unnamed: 0,avg_pixel_val,avg_pixel_used,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6
0,0.197099,0.274235,3.305606,0.567347,2.914199,2.637555,1.514525,1.919663,0.806149
1,0.199745,0.271684,4.481195,0.285946,0.990474,-0.248639,-3.668353,1.751423,2.882699
2,0.074380,0.150510,-1.953681,-0.527386,0.681696,-0.304230,-0.663703,0.531303,-1.291072
3,0.081072,0.139031,-0.917955,-2.397433,1.865791,1.670970,-0.790602,0.624107,-0.306629
4,0.121519,0.191327,-0.481039,2.779916,0.008171,0.600696,1.144494,0.181592,0.270334
...,...,...,...,...,...,...,...,...,...
27995,0.131052,0.196429,0.698582,-2.859273,-3.435590,1.176831,1.476658,0.108946,-0.863134
27996,0.076266,0.119898,-0.970196,-2.298255,-2.042659,0.720215,-2.047013,-0.018801,-0.658255
27997,0.124820,0.188776,-0.570393,2.568809,-3.420184,-2.113449,0.293541,0.222361,-0.411227
27998,0.134029,0.191327,-0.070832,-2.220301,-1.393994,-1.520254,1.584550,-2.323370,-1.464280


In [102]:
labels = rf.predict(data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished


In [103]:
data['Label'] = labels

In [104]:
submission = data.drop(data.columns[:-1], axis=1, inplace=False)

In [105]:
submission.head()

Unnamed: 0,Label
0,2
1,2
2,4
3,4
4,9


In [106]:
submission.to_csv('results.csv', index=True)

In [107]:
df = pd.read_csv('results.csv')

In [108]:
df.head()

Unnamed: 0.1,Unnamed: 0,Label
0,0,2
1,1,2
2,2,4
3,3,4
4,4,9


In [109]:
df.rename(columns={'Unnamed: 0':'ImageId'}, inplace=True)

In [110]:
df['ImageId'] += 1

In [111]:
df.to_csv('sub.csv', index=False)

In [112]:
df1 = pd.read_csv('sub.csv')

In [113]:
df1

Unnamed: 0,ImageId,Label
0,1,2
1,2,2
2,3,4
3,4,4
4,5,9
...,...,...
27995,27996,9
27996,27997,1
27997,27998,9
27998,27999,9
