In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data Set 4: Optical Recognition of Handwritten Digits

## Content
We used preprocessing programs made available by NIST to extract normalized bitmaps of handwritten digits from a preprinted form. From a total of 43 people, 30 contributed to the training set and different 13 to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of 4x4 and the number of on pixels are counted in each block. This generates an input matrix of 8x8 where each element is an integer in the range 0..16. This reduces dimensionality and gives invariance to small distortions.

## Task
All input attributes are integers in the range 0..16.
The last attribute is the class code 0..9

In [3]:
ds4_test = pd.read_csv('optdigits.tes',header=None)
ds4_train = pd.read_csv('optdigits.tra', header=None)

In [4]:
ds4_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0,0,5,13,9,1,0,0,0,0,...,0,0,0,6,13,10,0,0,0,0
1,0,0,0,12,13,5,0,0,0,0,...,0,0,0,0,11,16,10,0,0,1
2,0,0,0,4,15,12,0,0,0,0,...,0,0,0,0,3,11,16,9,0,2
3,0,0,7,15,13,1,0,0,0,8,...,0,0,0,7,13,13,9,0,0,3
4,0,0,0,1,11,0,0,0,0,0,...,0,0,0,0,2,16,4,0,0,4


In [5]:
ds4_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0,1,6,15,12,1,0,0,0,7,...,0,0,0,6,14,7,1,0,0,0
1,0,0,10,16,6,0,0,0,0,7,...,0,0,0,10,16,15,3,0,0,0
2,0,0,8,15,16,13,0,0,0,1,...,0,0,0,9,14,0,0,0,0,7
3,0,0,0,3,11,16,0,0,0,0,...,0,0,0,0,1,15,2,0,0,4
4,0,0,5,14,4,0,0,0,0,0,...,0,0,0,4,12,14,7,0,0,6


## Train and Test Data

In [6]:
X_train = ds4_train.loc[:: ,:63]
y_train = ds4_train.loc[:: ,64]

X_test = ds4_test.loc[:: ,:63]
y_test = ds4_test.loc[:: ,64]

## Selected Parameters

In [7]:
param_grid_sgb = dict(max_depth = [2,3,5,7,10],
                  learning_rate = [0.1, 0.2, 0.5, 0.7],
                  n_estimators = [10,30,50,100,200])
param_grid_tree = dict(ccp_alpha = [0.001, 0.002, 0.003, 0.004], 
                       min_samples_leaf = [2,3,5,7,10])
param_grid_rf = dict(n_estimators = [500],
                     min_samples_leaf = [5],
                     max_features = [2,8,10,30,40,50])

In [8]:
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)

# Classifiers and Regressors

#### Lasso

In [47]:
#Lasso
lasso_reg = LassoCV(cv = kfold)
lasso_reg = lasso_reg.fit(np.array(X_train), np.array(y_train))
lasso_reg.alpha_

0.005605014160132827

In [48]:
p_lasso_val = lasso_reg.predict(np.array(X_test))
#p_lasso_val = np.where(p_lasso_val > 0.5, 1, 0).tolist()
print(r2_score(p_lasso_val, y_test) / len(y_test))

7.620738322546095e-05


In [49]:
def lasso_correction(prediction_val):
    if prediction_val >= 8.5 : 
        prediction_val = 9
    elif prediction_val >= 7.5 : 
        prediction_val = 8
    elif prediction_val >= 6.5 : 
        prediction_val = 7
    elif prediction_val >= 5.5 : 
        prediction_val = 6
    elif prediction_val >= 4.5 : 
        prediction_val = 5
    elif prediction_val >= 3.5 : 
        prediction_val = 4
    elif prediction_val >= 2.5 : 
        prediction_val = 3
    elif prediction_val >= 1.5 : 
        prediction_val = 2
    elif prediction_val >= 0.5 : 
        prediction_val = 1
    else :
        prediction_val=0
    return prediction_val     

In [50]:
lasso_df = pd.DataFrame(p_lasso_val)
lasso_df

Unnamed: 0,0
0,2.884449
1,1.055901
2,1.737564
3,3.067995
4,4.114187
...,...
1792,7.067875
1793,1.908901
1794,5.321360
1795,4.832469


In [51]:
p_lasso_val = lasso_df.loc[::,0].apply(lasso_correction)

In [52]:
p_lasso_val

0       3
1       1
2       2
3       3
4       4
       ..
1792    7
1793    2
1794    5
1795    5
1796    7
Name: 0, Length: 1797, dtype: int64

#### Decision Tree

In [11]:
tree_clf = DecisionTreeClassifier()
grid_search_tree_clf = GridSearchCV(tree_clf, param_grid_tree, cv = kfold )
results_tree_clf = grid_search_tree_clf.fit(np.array(X_train), np.array(y_train))

#tree_reg = DecisionTreeRegressor()
#grid_search_tree_reg = GridSearchCV(tree_reg, param_grid_tree, cv = kfold)
#results_tree_reg = grid_search_tree_reg.fit(np.array(X_train), np.array(y_train))


#### Random Forest

In [12]:
rf_clf = RandomForestClassifier()
grid_search_rf_clf = GridSearchCV(rf_clf, param_grid_rf, cv = kfold )
results_rf_clf = grid_search_rf_clf.fit(np.array(X_train), np.array((y_train)))

#rf_reg = RandomForestRegressor()
#grid_search_rf_reg = GridSearchCV(rf_reg, param_grid_rf, cv = kfold)
#results_rf_reg = grid_search_rf_reg.fit(np.array(X_train), np.array(y_train))


#### Stochastic Gradient Boosting

In [13]:
sgb_classifier = xgboost.XGBClassifier(min_child_weight=10, verbosity = 0)
grid_search_clf = GridSearchCV(sgb_classifier, param_grid_sgb, cv = kfold)
results_sgb_clf = grid_search_clf.fit(np.array(X_train), np.array(y_train))

#sgb_regressor = xgboost.XGBRegressor(min_child_weight=10)
#grid_search = GridSearchCV(sgb_regressor, param_grid_sgb, cv = kfold )
#results_sgb_reg = grid_search.fit(np.array(X_train), np.array(y_train))

### Best Parameters

In [53]:
#Classification
print("Best parameters of \n")
print("Alpha Value of Lasso: \n{}\n".format(lasso_reg.alpha_))
print("Best parameters of Decision Tree: \n{}\n".format(results_tree_clf.best_params_))
print("Best parameters of Random Tree: \n{}\n".format(results_rf_clf.best_params_))
print("Best parameters of Stochastic Gradient Boosting: \n{}\n".format(results_sgb_clf.best_params_))

#Regression
#print("Alpha Value of Lasso: \n{}\n".format(lasso_reg.alpha_))
#print("Best parameters of Decision Tree: \n{}\n".format(results_tree_reg.best_params_))
#print("Best parameters of Random Tree: \n{}\n".format(results_rf_reg.best_params_))
#print("Best parameters of Stochastic Gradient Boosting: \n{}\n".format(results_sgb_reg.best_params_))

Best parameters of 

Alpha Value of Lasso: 
0.005605014160132827

Best parameters of Decision Tree: 
{'ccp_alpha': 0.001, 'min_samples_leaf': 2}

Best parameters of Random Tree: 
{'max_features': 8, 'min_samples_leaf': 5, 'n_estimators': 500}

Best parameters of Stochastic Gradient Boosting: 
{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}



### Accuracy Score and  Best Score

In [54]:
d = ["LassoCV", "Decision Tree","Random Forest","Stochastic Gradient Boosting"]
score_table = pd.DataFrame(d, columns=['Models'])

accuracy_list = [accuracy_score(p_lasso_val,(y_test)),
                 accuracy_score(results_tree_clf.best_estimator_.predict(np.array(X_test)),(y_test)),
                 accuracy_score(results_rf_clf.best_estimator_.predict(np.array(X_test)), (y_test)),
                 accuracy_score(results_sgb_clf.best_estimator_.predict(np.array(X_test)), (y_test))]

score_table['Accuracy Score'] = accuracy_list

bestscore_list = [np.nan, results_tree_clf.best_score_, results_rf_clf.best_score_, results_sgb_clf.best_score_]
score_table['Best Score'] = bestscore_list
score_table

Unnamed: 0,Models,Accuracy Score,Best Score
0,LassoCV,0.213689,
1,Decision Tree,0.849193,0.899027
2,Random Forest,0.967724,0.975935
3,Stochastic Gradient Boosting,0.957151,0.976717


### Classification Report

In [55]:
from sklearn.metrics import classification_report
print('Decision Tree')
print(classification_report(y_test,results_tree_clf.best_estimator_.predict(np.array(X_test))))
print('Random Forest')
print(classification_report(y_test,results_rf_clf.best_estimator_.predict(np.array(X_test))))
print('Stochastic Gradient Boost')
print(classification_report(y_test,results_sgb_clf.best_estimator_.predict(np.array(X_test))))

Decision Tree
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       178
           1       0.82      0.88      0.85       182
           2       0.81      0.81      0.81       177
           3       0.85      0.79      0.82       183
           4       0.79      0.79      0.79       181
           5       0.93      0.86      0.89       182
           6       0.92      0.96      0.94       181
           7       0.88      0.79      0.83       179
           8       0.77      0.84      0.81       174
           9       0.79      0.83      0.81       180

    accuracy                           0.85      1797
   macro avg       0.85      0.85      0.85      1797
weighted avg       0.85      0.85      0.85      1797

Random Forest
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       178
           1       0.95      0.98      0.96       182
           2       0.99      0.99      0.99       

In [56]:
print(r2_score(p_lasso_val, y_test) / len(y_test))

7.107925077575349e-05


# Comment Section

In this particular data set, it is obvious that tree-based models work better than lasso model considering. Lasso regression gives the worst result with very low accuracy score. In tree-based models, accuracy scores are quite well however there may be overfitting since the accuracy scores are higher than %95.

The best models for this particular data set are random forest and stochastic gradient boosting according to corresponding accuracy scores. Even though it took quite long time to fit the model for both of them, decision tree worked very fast. Unfortunately, there is an obvious difference between accuracy score of the random forest and other tree-based models. Thus, I would choose the random forest because it works faster than sgb and gives better results.