# Imports

In [14]:
import os
# Pandas, numpy, for data processing
import numpy as np
import pandas as pd

# Matplotlib for visualization
import matplotlib.pyplot as plt
%matplotlib inline
#
import time
# csv to write predict file
import csv
# open cv, scipy ndimage for image processing
import cv2
import random
from scipy import ndarray
import skimage as sk
from skimage import transform
from skimage import util
from scipy import ndimage
#sklearn to train models
from sklearn import preprocessing,cross_validation,neighbors
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Data loading and shapes printing

In [15]:
# Training features.
# The image files are loaded with encoding='latin1', which NumPy only uses when
# unpickling object arrays; NumPy >= 1.16.3 refuses to unpickle unless
# allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
images_train = np.load('all/train_images.npy', encoding='latin1', allow_pickle=True)
print("training features shape",images_train.shape)

# Testing features (same on-disk format as the training file)
images_test = np.load('all/test_images.npy', encoding='latin1', allow_pickle=True)
print("testing  features shape",images_test.shape)

# Training labels
df = pd.read_csv('all/train_labels.csv')
print("training labels shape",df.shape)

training features shape (10000, 2)
testing  features shape (10000, 2)
training labels shape (10000, 2)


# Pre Processing

### Data Reshaping

In [16]:
# Training data reshaping: column 1 of the loaded array holds the pixel data of
# each image. Join all images and lay them out as one flattened image per row.
# The row count comes from the data instead of a hard-coded 10000, so the cell
# also works on a subset or a differently sized data set.
X=np.concatenate(images_train[:,1],axis=0)
X=X.reshape(len(images_train),-1)
print("training features new shape",X.shape)

# Testing data reshaping (same layout as the training data)
X_test1=np.concatenate(images_test[:,1],axis=0)
X_test1=X_test1.reshape(len(images_test),-1)
print("testing  features new shape",X_test1.shape)

# Training labels: column 1 of the label frame as a flat array
y=np.array(df.iloc[:,1])
print("training labels   new shape",y.shape)

training features new shape (10000, 10000)
testing  features new shape (10000, 10000)
training labels   new shape (10000,)


### Searching for missing values and filling them with the median (if there are any)

In [17]:
# Learn the per-feature medians from the training matrix only
imp = Imputer(strategy='median').fit(X)

# Replace missing entries in both matrices using the training medians
X = imp.transform(X)
X_test1 = imp.transform(X_test1)

# Count the NaNs that remain after imputation (0 expected)
print('(Missing values) training features: ', np.isnan(X).sum())
print('(Missing values) testing  features:  ', np.isnan(X_test1).sum())


(Missing values) training features:  0
(Missing values) testing  features:   0


### Make sure there are no infinite values

In [18]:
# Print the (row, column) indices of every non-finite entry (inf or NaN) in
# each matrix; empty index arrays mean all values are finite.
for features in (X, X_test1):
    print(np.nonzero(np.logical_not(np.isfinite(features))))

(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))


### Centering function

In [19]:
def centerimg(im):
    """
    Shift a 2-D image so that its centre of mass lands on the array centre,
    then crop a fixed window out of the shifted image.

    The shift is done by zero-padding: a positive shift pads in front of the
    data (the array grows), a negative shift pads behind the data and drops
    the same number of leading rows/columns. An image whose centre of mass is
    undefined (for example an all-zero image) is cropped without shifting.

    Parameters
    ----------
    im : 2-D numpy array
        Filtered image. The fixed crop window (rows and columns 35..74)
        matches the 100 x 100 images that ``filterimg`` passes in.

    Returns
    -------
    numpy array
        Copy of rows 35..74 / columns 35..74 of the shifted image
        (40 x 40 for a 100 x 100 input).
    """
    # Determine the centre of mass. ndimage.center_of_mass is the public name;
    # the ndimage.measurements namespace is deprecated in recent SciPy.
    # A zero total mass yields 0/0 -> NaN, so silence that warning here and
    # handle the NaN explicitly below.
    with np.errstate(invalid='ignore', divide='ignore'):
        com = ndimage.center_of_mass(im)

    if np.all(np.isfinite(com)):
        # Translation distances along the row (x) and column (y) axes
        x_trans = int(im.shape[0]//2-com[0])
        y_trans = int(im.shape[1]//2-com[1])
    else:
        # No mass to centre on: int(nan) would raise, so leave the image in place
        x_trans = y_trans = 0

    # Shift along rows
    if x_trans > 0:
        im2 = np.pad(im, ((x_trans, 0), (0, 0)), mode='constant')
    else:
        im2 = np.pad(im, ((0, -x_trans), (0, 0)), mode='constant')
        im2 = im2[-x_trans:, :]

    # Shift along columns
    if y_trans > 0:
        im3 = np.pad(im2, ((0, 0), (y_trans, 0)), mode='constant')
    else:
        im3 = np.pad(im2, ((0, 0), (0, -y_trans)), mode='constant')
        im3 = im3[:, -y_trans:]

    # Keep only the fixed 40 x 40 window; copy so the padded array can be freed
    im3=im3[35:75,35:75].copy()
    return im3


### filtering & resizing function

In [20]:
def filterimg(matrix):
    """
    Denoise, centre and crop every flattened image in ``matrix``.

    Each row is reshaped to 100 x 100, only its largest 4-connected foreground
    component is kept (drawn at intensity 255), the result is centred and
    cropped to 40 x 40 by ``centerimg`` and flattened again.

    Parameters
    ----------
    matrix : 2-D numpy array
        One flattened 100 x 100 image per row. Every row is processed
        (the row count is no longer fixed at 10000).

    Returns
    -------
    numpy array of shape (n_images, 1600)
        An image without any foreground pixel yields an all-zero row.
    """
    side = 100          # input images are side x side
    n_features = 1600   # centerimg returns a 40 x 40 crop

    X_new=[]
    for row_idx in range(len(matrix)):
        square = matrix[row_idx].reshape(side,side)
        # connectedComponentsWithStats needs an 8-bit image and treats every
        # non-zero pixel as foreground. Cast straight to uint8: going through
        # int8 first overflows for intensities above 127.
        # NOTE(review): assumes pixel intensities lie in 0..255 -- confirm.
        image = square.astype('uint8')
        nb_components, output, stats, centroids = cv2.connectedComponentsWithStats(image, connectivity=4)

        if nb_components < 2:
            # Only the background label exists: there is no component to keep
            X_new.append(np.zeros((1, n_features)))
            continue

        # Label 0 is the background, so look for the largest area among
        # labels 1..; argmax keeps the first label on ties.
        sizes = stats[:, cv2.CC_STAT_AREA]
        max_label = 1 + int(np.argmax(sizes[1:]))

        img2 = np.zeros(output.shape)
        img2[output == max_label] = 255
        img2=centerimg(img2)
        X_new.append(img2.reshape(-1,n_features))

    if not X_new:
        # reshape(0, -1) is ambiguous for an empty array
        return np.empty((0, n_features))
    X_new=np.array(X_new).reshape(len(X_new),-1)
    return(X_new)

### Denoising, resizing and translating training and testing data

In [21]:
# Clean the training images with filterimg and report the new matrix shape
X_new = filterimg(X)
train_shape = X_new.shape
print("[After pre processing] training features new shape", train_shape)

# Same treatment for the images that will be predicted on
X_test1_new = filterimg(X_test1)
test_shape = X_test1_new.shape
print("[After pre processing] testing  features new shape", test_shape)

[After pre processing] training features new shape (10000, 1600)
[After pre processing] testing  features new shape (10000, 1600)


# Data Augmentation

In [22]:
def random_rotation(image_array: np.ndarray, max_degree: float = 25):
    """
    Rotate an image by a random angle.

    Parameters
    ----------
    image_array : numpy array
        2-D image to rotate.
    max_degree : float, optional
        The angle is drawn uniformly from [-max_degree, max_degree] degrees
        (default 25, the range used so far).

    Returns
    -------
    The rotated image as returned by skimage.transform.rotate.
    """
    # pick a random rotation angle, in degrees, within +/- max_degree
    random_degree = random.uniform(-max_degree, max_degree)
    return sk.transform.rotate(image_array, random_degree)



# Augment the data set with one randomly rotated copy of each of the first
# N_AUGMENTED pre-processed training images; each copy keeps its source label.
# NOTE(review): the copies are appended before the train/test split, so a
# rotated copy and its source image can end up on opposite sides of the split.
N_AUGMENTED = 1000

# Seed the RNG used by random_rotation so the augmented set is reproducible
random.seed(21)

# Build all rotated rows first and concatenate once: growing X_new inside the
# loop re-copied the whole feature matrix on every iteration.
rotated_rows = [
    random_rotation(X_new[i].reshape(40,40)).reshape(1,1600)
    for i in range(N_AUGMENTED)
]
X_new=np.concatenate([X_new] + rotated_rows)
y=np.hstack([y, y[:N_AUGMENTED]])

y.shape
        

(11000,)

### Split into train (80%) & test (20%)

In [23]:
# Fraction of the samples held out for evaluation
split = 0.2
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; model_selection (already imported) provides the same train_test_split.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_new, y, test_size=split,random_state=21)

### Shapes after splitting

In [24]:
# Row/column counts of the training partition, then of the held-out partition
for subset in (X_train, X_test):
    print(subset.shape)

(8800, 1600)
(2200, 1600)


### Exploring clean data (first 15 training examples)

In [None]:
# Display the first 15 pre-processed training images with their labels
for i in range(15):
    # each feature row is a flattened 40 x 40 image
    trainsetmtx = X_train[i].reshape((40,40))
    print(trainsetmtx.shape)
    imgplot = plt.imshow(trainsetmtx)
    print ('Label1 = %s' % y_train[i])
    plt.show()

Cleaning images used for prediction: 

### Exploring clean data (first 15 testing examples)

In [None]:
# Display the first 15 pre-processed test images (no labels are available)
for i in range(15):
    # each feature row is a flattened 40 x 40 image
    trainsetmtx = X_test1_new[i].reshape((40,40))
    imgplot = plt.imshow(trainsetmtx)
    plt.show()

# Implementation of Logistic Regression

In [27]:
# Fit a logistic regression with default settings on the training partition.
# The fit call stays the last expression so the fitted estimator is displayed.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Predict labels for the held-out partition with the fitted model
y_pred = logreg.predict(X=X_test)

In [29]:
# Fraction of held-out samples whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:{} ".format(accuracy))

Accuracy:0.22318181818181818 


# Bagged decision trees
After data preparation, we're going to use Bagged Decision Trees.
We are going to test the accuracy with different numbers of trees:

In [None]:
# 10-fold cross-validated accuracy of bagged decision trees for several
# ensemble sizes. `ntrees` and `result` are kept as module-level names because
# the plotting cell reads them.
seed = 8
kfold = model_selection.KFold(n_splits=10)
clf = DecisionTreeClassifier()
ntrees=[100,150,200,250,280,290,300]
result=[]
# time.clock() was removed in Python 3.8; time.perf_counter() is the standard
# replacement for measuring elapsed intervals.
t0 = time.perf_counter()
for n_estimators in ntrees:
    # use the seed defined above instead of repeating the literal
    model1 = BaggingClassifier(base_estimator=clf, n_estimators=n_estimators, random_state=seed)
    results = model_selection.cross_val_score(model1, X_train, y_train, cv=kfold)
    # elapsed time is cumulative since t0, not per ensemble size
    print ("Time: %.4fs" % (time.perf_counter()-t0))
    print(results.mean())
    result.append(results.mean())
print(result)

Time: 1754.9418s
0.5169318181818182
Time: 4128.1049s
0.52625


In [None]:
#Generate plot

%matplotlib inline
# Mean cross-validation accuracy versus number of trees.
# Only plot the ensemble sizes that were actually evaluated: if the search loop
# was interrupted, `result` is shorter than `ntrees` and plt.plot would raise
# on the mismatched lengths.
evaluated_ntrees = ntrees[:len(result)]
plt.title('Bagged Decision Trees')
# the plotted values are cross_val_score means, i.e. held-out fold accuracy,
# not accuracy on the training data
plt.plot(evaluated_ntrees, result, label='Cross-validation accuracy')
plt.legend()
plt.xlabel('num_trees')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Bagged Decision Trees for Classification: 10-fold cross-validation of the
# final 300-tree ensemble.

seed = 7
# time.clock() was removed in Python 3.8. Use it where it still exists so t1
# stays comparable with code that reads the timer via time.clock(); otherwise
# fall back to time.perf_counter().
t1 = getattr(time, 'clock', time.perf_counter)()
kfold = model_selection.KFold(n_splits=10)
clf = DecisionTreeClassifier()
n_trees = 300
model1 = BaggingClassifier(base_estimator=clf, n_estimators=n_trees, random_state=seed)
results = model_selection.cross_val_score(model1, X_train, y_train, cv=kfold)

In [None]:
# Elapsed time since t1 was taken. time.clock() was removed in Python 3.8: use
# it where it still exists (so the reading matches a t1 taken with
# time.clock()), otherwise fall back to time.perf_counter().
elapsed = getattr(time, 'clock', time.perf_counter)() - t1
print ("Time: %.4fs" % elapsed)
print(results.mean())
# Fit the ensemble on the whole training partition, then predict the labels of
# the pre-processed test images.
cft1=model1.fit(X_train,y_train)
y_pred1=cft1.predict(X_test1_new)
print(y_pred1)

In [None]:
# Write the predictions to final9.csv: a header row, then one (Id, Category)
# row per prediction, where Id is the position in y_pred1.
with open('final9.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(
        csvfile,
        delimiter=',',
        quotechar='|',
        quoting=csv.QUOTE_MINIMAL,
    )
    spamwriter.writerow(['Id', 'Category'])
    spamwriter.writerows([idx, label] for idx, label in enumerate(y_pred1))
