In [1]:
import numpy as np
import matplotlib as plt
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')

### Creating a new feature which will show the average pixel value for each digit label 

In [3]:
def average_pixel_value(data, digits=[]):
    '''
    A function that accepts a DataFrame and a list of digits and returns a dictionary of average pixel values for each digit in digits
    
    Input:
            data: pandas DataFrame
            digits (list): a list of digits in the dataset
        
    Output:
            avg_pixel (dictionary): a dictionary of average pixel value for each digit
    
    '''
    avg_pixel = {}
    
    for dig in digits:
        
        img = data.loc[data['label'] == digits[dig]].iloc[:, 1:].values.reshape(-1, 28, 28, 1)
        avg = np.average(img[0][:,:,0])/255.0
        avg_pixel[dig] = avg
    
    return avg_pixel

In [4]:
dic_avgPixelVal = average_pixel_value(train, [x for x in range(0,10,1)])
dic_avgPixelVal = pd.Series(dic_avgPixelVal).to_frame('average_pixel_value')
dic_avgPixelVal

Unnamed: 0,average_pixel_value
0,0.223134
1,0.083278
2,0.206578
3,0.121489
4,0.075155
5,0.139656
6,0.134109
7,0.085794
8,0.21308
9,0.130147


### Creating a new feature which will show the average pixels used in order to draw each digit label

In [5]:
def avg_pixel_used(data, digits=[]):
    
    '''
    A function that accepts a DataFrame and a list of digits and returns a dictionary of average pixels used to draw each digit label
    
    Input:
            data: pandas DataFrame
            digits (list): a list of digits in the dataset
        
    Output:
            pixel_used (dictionary): a dictionary of average pixels used to draw each image
    '''
    
    pixel_used = {}
    
    for dig in digits:
        
        val = data.loc[data['label']==digits[dig]].iloc[[0], 1:]
        counter = 0
        
        for col in val.columns:
            
            if val[col].unique() > 0:
                
                counter += 1
                
            else: 
                continue
        
        pixel_used[dig] = counter/784

    return pixel_used

In [6]:
dic_avgPixelUsed = avg_pixel_used(train, [x for x in range(10)])
dic_avgPixelUsed = pd.Series(dic_avgPixelUsed).to_frame('average_pixels_used')
dic_avgPixelUsed

Unnamed: 0,average_pixels_used
0,0.3125
1,0.123724
2,0.269133
3,0.227041
4,0.144133
5,0.209184
6,0.232143
7,0.145408
8,0.294643
9,0.19898


In [7]:
def create_fill_column(data, columnName, values, digits=[]):
    
    '''
    A function that creates a new column, fills it with nans and then replaces each nan with average pixel used value with respect to the label
    
    Input:
            data (DataFrame)
            columnName (String): a name chosen for the new column
            values (DataFrame): a DataFrame with the desired values where each value's index corresponds with 'label' values 
            digits (list): a list of ints where each int corresponding with the label value in the dataset
    Output:
            updates the dataset with the new column in it
    '''
    
    data[columnName] = np.nan
    
    for dig in digits:
        
        data.loc[data['label'] == dig, columnName] = values.iloc[[dig]].values

In [8]:
create_fill_column(train, 'avg_pixel_val', dic_avgPixelVal, [x for x in range(10)])
create_fill_column(train, 'avg_pixel_used', dic_avgPixelUsed, [x for x in range(10)])
train.iloc[:, [-2,-1,0]].head()

Unnamed: 0,avg_pixel_val,avg_pixel_used,label
0,0.083278,0.123724,1
1,0.223134,0.3125,0
2,0.083278,0.123724,1
3,0.075155,0.144133,4
4,0.223134,0.3125,0


In [None]:
'''
1. train['label'].dtype = 'categorical'
2. sns.heatmap
'''

In [6]:
'''
Creating X and y variables of the training data set and another variable for the testing data set
for future machine learning manipulations (scaling, dimension reduction, splitting to train & val sets)
'''
X_train = train.iloc[:, 1:].values
y_train = train.iloc[:, [0]].values

In [7]:
'''
checking X and y shapes and values of their first feature
'''
print(X_train.shape, X_train[0][0])
print(y_train.shape, y_train[0])

(42000, 784) 0
(42000, 1) [1]
(28000, 784)


In [8]:
# feature scaling
X_train[:, :-2] = X_train/255.0

In [9]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

create a new variable which would describe each digit
1. split to training and validation sets (10% validation)
2. scale train and val sets
3. pca and lda
