In [1]:
import cv2
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split,RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


## Disclaimer
<font size='3'>before loading the data I extracted the zip files that contain the data.</font>

<font size='3'>the train data should be in the same directory as this notebook and in the `train` folder</font>

# Processing colorful images
<font size="4">In this section I will try to process the images with colors but a resized version of each images.</font>

In [2]:
data = []
label = []
path = "./train/"
IMG_SIZE = 32

for file in os.listdir(path):
    img=cv2.imread(path+file)
    img=cv2.resize(img,(IMG_SIZE,IMG_SIZE),interpolation=cv2.INTER_AREA)
    img=img.astype('float32')
    img = img.ravel()
#     img = img.reshape(-1,1024)
    file_label = file.split('.')[0]
    
    if file_label == 'cat':
        label.append(1)
    else:
        label.append(0)
    data.append(img)
data = np.array(data)

In [3]:
len(data[1])

3072

In [4]:
x_train, x_test, y_train, y_test = train_test_split(data,label , test_size=0.1 ,random_state=21)


In [5]:
from sklearn.neighbors import KNeighborsClassifier


In [6]:
scaling_pipeline = Pipeline(
    [
        ('std_scaler', StandardScaler()),
        ('pca',PCA(n_components=0.90, random_state=0,svd_solver='auto')),       
    ]
)

x_train_scaled = scaling_pipeline.fit_transform(x_train)
x_test_scaled = scaling_pipeline.transform(x_test)


In [7]:
knn_cls = KNeighborsClassifier()
knn_cls.fit(x_train_scaled,y_train)
knn_cls.score(x_test_scaled,y_test)

0.6048

In [8]:
knn_param_grid =     {
     'n_neighbors': list(range(3,100,10)),
     'weights':['uniform','distance'],
     'metric': ['euclidean','manhattan','minkowski'],
    }

knn_grid_search = RandomizedSearchCV(
    KNeighborsClassifier(n_jobs=-1),
    knn_param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=True,
    random_state=0,
    n_iter=15
)
knn_grid_search.fit(x_train_scaled,y_train)

RandomizedSearchCV(cv=3, estimator=KNeighborsClassifier(n_jobs=-1),
                   param_distributions={'metric': ['euclidean', 'manhattan',
                                                   'minkowski'],
                                        'n_neighbors': [3, 13, 23, 33, 43, 53,
                                                        63, 73, 83, 93],
                                        'weights': ['uniform', 'distance']},
                   random_state=0, return_train_score=True, scoring='accuracy')

In [9]:
knn_grid_search.best_estimator_

KNeighborsClassifier(metric='euclidean', n_jobs=-1, n_neighbors=53,
                     weights='distance')

In [10]:
knn_grid_search.best_score_

0.6130222222222222

In [11]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=0,n_estimators=200, n_jobs=-1)


forest_clf.fit(x_train_scaled,y_train)
forest_clf.score(x_test_scaled,y_test)


0.6472

In [12]:
forest_params = {
     'n_estimators': [100,200,400],
     'max_depth':[None,30,100],
     'max_features': ['auto','sqrt','log2'],
     'random_state':[0]
}

In [13]:
forest_grid_search = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1),
    forest_params,
    cv=3,
    scoring='accuracy',
    return_train_score=True,
    random_state=0
)

forest_grid_search.fit(x_train_scaled,y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1),
                   param_distributions={'max_depth': [None, 30, 100],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': [100, 200, 400],
                                        'random_state': [0]},
                   random_state=0, return_train_score=True, scoring='accuracy')

In [27]:
forest_grid_search.best_estimator_.score(x_test_scaled,y_test)

0.6484

In [16]:
from sklearn.ensemble import AdaBoostClassifier

ada_b = AdaBoostClassifier(n_estimators=100, random_state=0)
ada_b.fit(x_train_scaled,y_train)
ada_b.score(x_test_scaled,y_test)

0.6252

In [20]:
ada_param_grid = {
    "learning_rate": [0.1,0.4,1],
    "n_estimators": [100,200,400,800]
 }

In [22]:
ada_grid_search = RandomizedSearchCV(
    AdaBoostClassifier(random_state=0),
    ada_param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=True,
    random_state=0,
    n_jobs=-1
)

ada_grid_search.fit(x_train_scaled,y_train)

RandomizedSearchCV(cv=3, estimator=AdaBoostClassifier(random_state=0),
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.4, 1],
                                        'n_estimators': [100, 200, 400, 800]},
                   random_state=0, return_train_score=True, scoring='accuracy')

In [23]:
ada_grid_search.best_score_

0.6276

In [25]:
ada_grid_search.best_estimator_.score(x_test_scaled,y_test)

0.6364

In [30]:
from sklearn.ensemble import StackingClassifier
stacking_clf = StackingClassifier(estimators=[
    ('ada', ada_grid_search.best_estimator_),
    ('rf', forest_grid_search.best_estimator_)])

stacking_clf.fit(x_train_scaled,y_train)

stacking_clf.score(x_test_scaled,y_test)

0.648

In [31]:
import xgboost as xgb
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(n_jobs=-1, max_depth=6,gamma = 0,colsample_bytree = 0.8)

xgb_clf.fit(x_train_scaled,y_train)
xgb_clf.score(x_test_scaled,y_test)

0.6528

In [32]:
xgb_params = {
    'max_depth': [3,6,10],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7]
}


XGBGridsearch = RandomizedSearchCV(XGBClassifier(), xgb_params,scoring='accuracy',cv=3, random_state=0,n_jobs=-1)
XGBGridsearch.fit(x_train_scaled,y_train)

RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                               

In [33]:
XGBGridsearch.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
XGBGridsearch.best_score_

0.6716000000000001

In [49]:
XGBGridsearch.best_estimator_.score(x_test_scaled,y_test)

0.678

In [35]:
stacking_clf = StackingClassifier(estimators=[
    ('ada', ada_grid_search.best_estimator_),
    ('rf', forest_grid_search.best_estimator_)], final_estimator=XGBGridsearch.best_estimator_)

stacking_clf.fit(x_train_scaled,y_train)

stacking_clf.score(x_test_scaled,y_test)

0.6312

In [44]:
from sklearn.ensemble import BaggingClassifier


bagging = BaggingClassifier(base_estimator=XGBGridsearch.best_estimator_,n_estimators=10, random_state=0,n_jobs=-1)
bagging.fit(x_train_scaled,y_train)

BaggingClassifier(base_estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                               colsample_bylevel=1,
                                               colsample_bynode=1,
                                               colsample_bytree=0.7, gamma=0,
                                               gpu_id=-1,
                                               importance_type='gain',
                                               interaction_constraints='',
                                               learning_rate=0.1,
                                               max_delta_step=0, max_depth=10,
                                               min_child_weight=1, missing=nan,
                                               monotone_constraints='()',
                                               n_estimators=1000, n_jobs=0,
                                               num_parallel_tree=1,
                                               random_state

In [45]:
bagging.score(x_test_scaled,y_test)

0.684

In [50]:
bagging_params = {
    'n_estimators': [10, 50, 100],
}


bagging_grid = GridSearchCV(bagging, bagging_params,scoring='accuracy',cv=3)
bagging_grid.fit(x_train_scaled,y_train)

GridSearchCV(cv=3,
             estimator=BaggingClassifier(base_estimator=XGBClassifier(base_score=0.5,
                                                                      booster='gbtree',
                                                                      colsample_bylevel=1,
                                                                      colsample_bynode=1,
                                                                      colsample_bytree=0.7,
                                                                      gamma=0,
                                                                      gpu_id=-1,
                                                                      importance_type='gain',
                                                                      interaction_constraints='',
                                                                      learning_rate=0.1,
                                                                      max_delta_step=0,
     

In [52]:
bagging_grid.best_estimator_

BaggingClassifier(base_estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                               colsample_bylevel=1,
                                               colsample_bynode=1,
                                               colsample_bytree=0.7, gamma=0,
                                               gpu_id=-1,
                                               importance_type='gain',
                                               interaction_constraints='',
                                               learning_rate=0.1,
                                               max_delta_step=0, max_depth=10,
                                               min_child_weight=1, missing=nan,
                                               monotone_constraints='()',
                                               n_estimators=1000, n_jobs=0,
                                               num_parallel_tree=1,
                                               random_state

In [51]:
bagging_grid.best_estimator_.score(x_test_scaled,y_test)

0.688

------------

# Processing images in black and white

<font size='4'>in this section I have decided to take the models that performed the best without the tunning so that I can check if I can acheive better results but in black and white.<font>