In [1]:
import random
import numpy as np
import os      #We write a program that will run on both Windows and GNU/Linux.
import glob    #For useful easier file or extansion search
import cv2

In [2]:
def resize(path, img_height, img_width):
    img = cv2.imread(path)
    resized = cv2.resize(img, (img_height, img_width))# Every img converting to
    return resized                                    # 64x64 pixel 

In [3]:
def get_data():                    
    train_directory ="imgs/train/" # Data Directory
    X_train = []                   # For Train and Test we assign empty array.
    y_train = []
    
    for j in range(10):            # We'll pull the data sequentially from file.
        print("Load Folder c{}".format(j))
        path = os.path.join(train_directory, 'c' + str(j), '*.jpg') #c0, c1, c2 ...
        files = glob.glob(path)
        for fl in files:
            flbase = os.path.basename(fl) #Method is used to get 
                                          #the base name in specified path
            img = resize(fl, 64, 64)   # We convert the images to 64x64 size  
            X_train.append(img)  # And finally, we added the empty defined array.
            y_train.append(j)  # And then, for y_train set.
    return X_train, y_train

In [28]:
X_train, y = get_data()

Load Folder c0
Load Folder c1
Load Folder c2
Load Folder c3
Load Folder c4
Load Folder c5
Load Folder c6
Load Folder c7
Load Folder c8
Load Folder c9


In [30]:
X_train = np.asarray(X_train) # Convert the input to an array.
y = np.asarray(y)
print(X_train.shape)

(22424, 64, 64, 3)


In [31]:
#Gives a new shape to an array without changing its data
from sklearn.model_selection import train_test_split, cross_val_score, KFold
X_train = np.reshape(X_train, (X_train.shape[0], -1))
y_train = y.reshape(-1, 1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42)
print('Training data shape: ', X_train.shape)
print('Train Label shape:', y_train.shape)

Training data shape:  (17939, 12288)
Train Label shape: (17939,)


# Naive Bayes 

Naive Bayes to be used with GaussianNB
Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes’ theorem with the “naive” assumption of conditional independence between every pair of features given the value of the class variable. Bayes’ theorem states the following relationship, given class variable " y " and dependent feature vector "X1" through " Xn ". 
 

In [8]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
print(clf)

kfold = KFold(n_splits=5, random_state=42, shuffle=True)
knn_results = cross_val_score(clf, X_train, y_train, cv=kfold, scoring="accuracy")
result = "Mean Accuracy: %f" % (knn_results.mean())
print(result)

GaussianNB()
Mean Accuracy: 0.539941


# Decision Tree Classifier

Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

In [10]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
print(tree)
kfold = KFold(n_splits=5, random_state=42, shuffle=True)
tree_results = cross_val_score(tree, X_train, y_train, cv=kfold, scoring="accuracy")
result = "Mean Accuracy: %f" % (tree_results.mean())
print(result)

DecisionTreeClassifier()
Mean Accuracy: 0.847038


# Logistic Regression

Fully Explained:
https://medium.com/data-science-group-iitr/logistic-regression-simplified-9b4efe801389

Or creating and using this model also useful for this site without sci-kit function:
https://towardsdatascience.com/logistic-regression-explained-and-implemented-in-python-880955306060

In [32]:
from sklearn.linear_model import LogisticRegression
"""
tolfloat, default=1e-4
Tolerance for stopping criteria.

Cfloat, default=1.0
Inverse of regularization strength; 
must be a positive float. Like in support vector machines, 
smaller values specify stronger regularization.

solver{‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’
Algorithm to use in the optimization problem.

max_iterint, default=100
Maximum number of iterations taken for the solvers to converge.

"""
clf1 = LogisticRegression(C=10, tol=0.01, solver='lbfgs', max_iter=300)
print(clf1)

scores = cross_val_score(clf1, X_train, y_train, cv=5, scoring='accuracy')
result = "Mean Accuracy: %f" % (scores.mean())
print(result)

LogisticRegression(C=10, max_iter=300, tol=0.01)
Mean Accuracy: 0.991248


# Random Forest 

Learns a random forest*, which consists of a chosen number of decision trees. Each of the decision tree models is learned on a different set of rows (records) and a different set of columns (describing attributes), whereby the latter can also be a bit-vector or byte-vector descriptor (e.g. molecular fingerprint). 

In [19]:
from sklearn.ensemble import RandomForestClassifier

clf2 = RandomForestClassifier(random_state=42)
print(clf2)

scores = cross_val_score(clf2, X_train, y_train, cv=5, scoring='accuracy')
result = "Mean Accuracy: %f" % (scores.mean())
print(result)

RandomForestClassifier(random_state=42)
Mean Accuracy: 0.990468
