# Machine Learning - File 1

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from PIL import Image
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, voting_classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, ZeroPadding2D
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


## Import / cleaning

In [3]:
# Importing data
df = pd.read_csv('attribute_list.csv', skiprows=1)

# reading pictures. Returning as np.array. Color level: RGB
def read_pic(name, as_black_white=False):   
    string = 'dataset/' + str(name) + '.png'
    if as_black_white == True:
        return(np.array(Image.open(string).convert(mode='L')))
    else:
        return(np.array(Image.open(string)))

# Cleaning data
def clean_data():
    tmp = df[['hair_color', 'eyeglasses', 'smiling', 'young', 'human']]
    tmp['sum'] = tmp.sum(axis=1)
    t = tmp[tmp['sum'] == -5]
    return(t.index + 1)
df.drop(labels=clean_data()-1, inplace=True)

# Converting (-1,1) to (0, 1)
df['eyeglasses'] = (df['eyeglasses'] + 1) / 2
df['smiling'] = (df['smiling'] + 1) / 2
df['young'] = (df['young'] + 1) / 2
df['human'] = (df['human'] + 1) / 2

df['eyeglasses'] = df['eyeglasses'].apply(lambda x: int(x))
df['smiling'] = df['smiling'].apply(lambda x: int(x))
df['young'] = df['young'].apply(lambda x: int(x))
df['human'] = df['human'].apply(lambda x: int(x))

df.head()

Unnamed: 0,file_name,hair_color,eyeglasses,smiling,young,human
0,1,1,0,1,1,0
1,2,4,0,1,1,1
2,3,5,0,1,0,0
6,7,2,0,1,1,0
7,8,3,0,1,1,0


In [4]:
# Importing Pictures
def import_pictures(data, _as_black_white=False):
    res = []
    file_names = np.array(data['file_name'])
    for k in file_names:
        res.append(read_pic(k, as_black_white=_as_black_white)) # Data are alredy cleaned
    return(np.array(res))

pics_color = import_pictures(df)
pics_color.shape # acces to an image: pics[name_of_img][x_pixel][y_pixel][RGB_color]


(4565, 256, 256, 3)

## First try: black and white

### Preprocessing

Import pictures in __black and white__ and __scaling__.

In [5]:
pics_b_w = import_pictures(df, _as_black_white=True)
print(pics_b_w.shape)
imgs = np.array([preprocessing.scale(pics_b_w[k]) for k in range(len(pics_b_w))])

(4565, 256, 256)


Applying __PCA__.

In [6]:
# First, ravel
def to_vector(data=pics_b_w):
    res = []
    for k in range(len(data)):
        res.append(data[k].ravel())
    return(np.array(res))

pics_lin = to_vector()

In [7]:
# Apply PCA
s = time.time()
pca = PCA()
pca.fit(pics_lin)

t = time.time()

print('Time to compute pca.fit: ' + str(int(100*(t - s)/60)/100) + ' min')

print('Explained Variance:', pca.explained_variance_ratio_)
# searching for number of component to keep
v_exp = np.cumsum(pca.explained_variance_ratio_)

def extract_var(tbl, x=.95):
    k = 0
    while tbl[k] < x:
        k = k + 1
    return(k)

k95 = extract_var(v_exp)
k99 = extract_var(v_exp, x=.99)

# X = pca.fit_transform(pics_lin)
'''pca95 = PCA(n_components=k95)
X95 = pca95.fit_transform(pics_lin)
print('X95.shape', X95.shape)'''

pca99 = PCA(n_components=k99)
X99 = pca99.fit_transform(pics_lin)
print('X99.shape', X99.shape)
print('Time to compute the two pca.fit_transform (95%, 99%): ' + str(int(100*(time.time() - t)/60)/100) + ' min')


Time to compute pca.fit: 2.23 min
Explained Variance: [4.80372054e-01 6.67785790e-02 4.88748002e-02 ... 2.79181849e-07
 2.60685152e-07 9.47366833e-32]
X99.shape (4565, 1746)
Time to compute the two pca.fit_transform (95%, 99%): 2.11 min


### Learning - Neural Networks

#### Human

In [7]:
Y_human = np.array(df['human'])
X_train_h, X_test_h, Y_train_h, Y_test_h = train_test_split(X99, Y_human, test_size=0.2)

# Cross validation score
c1 = np.mean(cross_val_score(MLPClassifier(), X99, Y_human, cv=6))
print('Cross Validation score: ' + str(100*c1) + '%')

# 'By hand'
clf_neuralNetwork_human = MLPClassifier()
clf_neuralNetwork_human.fit(X_train_h, Y_train_h)
print(clf_neuralNetwork_human.score(X_test_h, Y_test_h))

Cross Validation score: 98.57611571247072%
0.9857612267250822


In [8]:
# Manual testing (possible because dataset already shuffled)
ratio_train = 0.8
Y = np.array(df['human'])
names = np.array(df['file_name'])
X_train, X_test = X99[:int(ratio_train*len(X99))], X99[int(ratio_train*len(X99)):] 
Y_train, Y_test = Y[:int(ratio_train*len(Y))], Y[int(ratio_train*len(Y)):]
clf_neuralNetwork_mt = MLPClassifier()
clf_neuralNetwork_mt.fit(X_train, Y_train)
names_train, names_test = names[:int(ratio_train*len(X99))], names[int(ratio_train*len(X99)):]
print(clf_neuralNetwork_mt.predict([X_test[20], X_test[21], X_test[22], X_test[23],
                                 X_test[24], X_test[25], X_test[26], X_test[27],
                                 X_test[28], X_test[29], X_test[30], X_test[31]]))
print(names_test[20:32])

[1 1 0 1 0 0 1 0 0 1 0 1]
[4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031]


#### Young

In [9]:
Y_young = np.array(df['young'])
X_train_y, X_test_y, Y_train_y, Y_test_y = train_test_split(X99, Y_young, test_size=0.2)

# Cross validation score
c2 = np.mean(cross_val_score(MLPClassifier(), X99, Y_young, cv=6))
print('Cross Validation score: ' + str(100*c2) + '%')

# 'By hand'
clf_neuralNetwork_young = MLPClassifier()
clf_neuralNetwork_young.fit(X_train_y, Y_train_y)
print(clf_neuralNetwork_young.score(X_test_y, Y_test_y))

Cross Validation score: 73.42834124389613%
0.7513691128148959


In [10]:
# Manual testing (possible because dataset already shuffled)
ratio_train = 0.8
names = np.array(df['file_name'])
X_train, X_test = X99[:int(ratio_train*len(X99))], X99[int(ratio_train*len(X99)):] 
Y_train, Y_test = Y_young[:int(ratio_train*len(Y_young))], Y_young[int(ratio_train*len(Y_young)):]
clf_neuralNetwork_mt = MLPClassifier()
clf_neuralNetwork_mt.fit(X_train, Y_train)
names_train, names_test = names[:int(ratio_train*len(X99))], names[int(ratio_train*len(X99)):]
print(clf_neuralNetwork_mt.predict([X_test[20], X_test[21], X_test[22], X_test[23],
                                 X_test[24], X_test[25], X_test[26], X_test[27],
                                 X_test[28], X_test[29], X_test[30], X_test[31]]))
print(names_test[20:32])

[1 1 1 1 1 1 1 0 1 0 1 1]
[4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031]


#### Smiling

In [11]:
Y_smile = np.array(df['smiling'])
X_train_s, X_test_s, Y_train_s, Y_test_s = train_test_split(X99, Y_smile, test_size=0.2)

# Cross validation score
c3 = np.mean(cross_val_score(MLPClassifier(), X99, Y_smile, cv=6))
print('Cross Validation score: ' + str(100*c3) + '%')

# 'By hand'
clf_neuralNetwork_smile = MLPClassifier()
clf_neuralNetwork_smile.fit(X_train_s, Y_train_s)
print('Splited data score:', clf_neuralNetwork_smile.score(X_test_s, Y_test_s))

Cross Validation score: 87.62370363069026%
Splited data score: 0.8751369112814896


In [12]:
# Manual testing (possible because dataset already shuffled)
ratio_train = 0.8
names = np.array(df['file_name'])
X_train, X_test = X99[:int(ratio_train*len(X99))], X99[int(ratio_train*len(X99)):] 
Y_train, Y_test = Y_smile[:int(ratio_train*len(Y_smile))], Y_smile[int(ratio_train*len(Y_smile)):]
clf_neuralNetwork_mt = MLPClassifier()
clf_neuralNetwork_mt.fit(X_train, Y_train)
names_train, names_test = names[:int(ratio_train*len(X99))], names[int(ratio_train*len(X99)):]
print(clf_neuralNetwork_mt.predict([X_test[20], X_test[21], X_test[22], X_test[23],
                                 X_test[24], X_test[25], X_test[26], X_test[27],
                                 X_test[28], X_test[29], X_test[30], X_test[31]]))
print(names_test[20:32])

[1 1 1 1 1 1 0 1 1 0 1 1]
[4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031]


#### Eyeglasses

In [13]:
Y_eyeglass = np.array(df['eyeglasses'])
X_train_e, X_test_e, Y_train_e, Y_test_e = train_test_split(X99, Y_eyeglass, test_size=0.2)

# Cross validation score
c4 = np.mean(cross_val_score(MLPClassifier(), X99, Y_eyeglass, cv=6))
print('Cross Validation score: ' + str(100*c4) + '%')

# 'By hand'
clf_neuralNetwork_eyeglass = MLPClassifier()
clf_neuralNetwork_eyeglass.fit(X_train_e, Y_train_e)
print('Splited data score:', clf_neuralNetwork_eyeglass.score(X_test_e, Y_test_e))

Cross Validation score: 84.14074147390059%
Splited data score: 0.8324205914567361


In [14]:
# Manual testing (possible because dataset already shuffled)
ratio_train = 0.8
names = np.array(df['file_name'])
X_train, X_test = X99[:int(ratio_train*len(X99))], X99[int(ratio_train*len(X99)):] 
Y_train, Y_test = Y_eyeglass[:int(ratio_train*len(Y_eyeglass))], Y_eyeglass[int(ratio_train*len(Y_eyeglass)):]
clf_neuralNetwork_mt = MLPClassifier()
clf_neuralNetwork_mt.fit(X_train, Y_train)
names_train, names_test = names[:int(ratio_train*len(X99))], names[int(ratio_train*len(X99)):]
print(clf_neuralNetwork_mt.predict([X_test[20], X_test[21], X_test[22], X_test[23],
                                 X_test[24], X_test[25], X_test[26], X_test[27],
                                 X_test[28], X_test[29], X_test[30], X_test[31]]))
print(names_test[20:32])

[0 0 1 0 0 0 0 1 0 0 0 0]
[4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031]


#### Multiclass prediction

In [15]:
Y_multi = np.array(df[['eyeglasses', 'smiling', 'young','human']])
X_train_multi, X_test_multi, Y_train_multi, Y_test_multi = train_test_split(X99, Y_multi, test_size=0.2)

# Cross validation score
c5 = np.mean(cross_val_score(MLPClassifier(), X99, Y_multi, cv=6))
print('Cross Validation score: ' + str(100*c5) + '%')

Cross Validation score: 52.88110634668142%


#### Hair Color

In [16]:
df2 = df[df['hair_color'] != -1] 
df2.shape
pics_hair = import_pictures(df2, _as_black_white=True)
print(pics_hair.shape)
imgs_hair = np.array([preprocessing.scale(pics_hair[k]) for k in range(len(pics_hair))])
pics_lin_hair = to_vector(data=imgs_hair)
print(pics_lin_hair.shape)

# PCA
s = time.time()
pca = PCA()
X_hair = pca.fit_transform(pics_lin_hair)
t = time.time()
print('Time to compute pca.fit_transform: ' + str(int(100*(t - s)/60)/100) + ' min')

(3902, 256, 256)
(3902, 65536)
Time to compute pca.fit_transform: 1.79 min


In [17]:
Y_hair = np.array(df2['hair_color'])
X_train_hair, X_test_hair, Y_train_hair, Y_test_hair = train_test_split(X_hair, Y_hair, test_size=0.2)

# Cross validation score
c5 = np.mean(cross_val_score(MLPClassifier(), X_hair, Y_hair, cv=6))
print('Cross Validation score: ' + str(100*c5) + '%')

# 'By hand'
clf_neuralNetwork_hair = MLPClassifier()
clf_neuralNetwork_hair.fit(X_train_hair, Y_train_hair)
print('Splited data score:', clf_neuralNetwork_hair.score(X_test_hair, Y_test_hair))

Cross Validation score: 44.99976570361283%
Splited data score: 0.4660691421254802


#### Keras - CNN for hair

In [4]:
# Loading data
df2 = df[df['hair_color'] != -1] 
df2.shape
pics_hair = import_pictures(df2, _as_black_white=False)
pics_hair_bw = import_pictures(df2, _as_black_white=True)
Y_hair = np.array(df2['hair_color'])
print(pics_hair.shape)

# Splitting data 
X_train_hair, X_test_hair, Y_train_hair, Y_test_hair = train_test_split(pics_hair, Y_hair, test_size=0.2)
Y_train_hair = to_categorical(Y_train_hair)
Y_test_hair = to_categorical(Y_test_hair)

# Splitting for black and white
X_train_hair_bw, X_test_hair_bw, Y_train_hair_bw, Y_test_hair_bw = train_test_split(pics_hair_bw, 
                                                                                    Y_hair, test_size=0.2)
a,b,c = X_train_hair_bw.shape
d,e,f = X_test_hair_bw.shape
X_train_hair_bw = X_train_hair_bw.reshape(a,b,c,1)
X_test_hair_bw = X_test_hair_bw.reshape(d,e,f,1)
Y_train_hair_bw = to_categorical(Y_train_hair_bw)
Y_test_hair_bw = to_categorical(Y_test_hair_bw)
print(pics_hair_bw.shape)

NameError: name 'df' is not defined

In [None]:
#create model
model = Sequential()

#add model layers
'''model.add(ZeroPadding2D((1,1),input_shape=(256,256,1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))
model.add(Conv2D(64, kernel_size=3, activation='relu'))'''
#model.add(Conv2D(32, kernel_size=3, activation='relu', input_shape=(256,256,3)))
#model.add(Conv2D(32, kernel_size=3, activation='relu'))


"""model.add(ZeroPadding2D((1,1)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))"""

#model.add(MaxPooling2D(pool_size=(2, 2)))

#model.add(ZeroPadding2D((1,1)))
#model.add(Conv2D(16, kernel_size=3, activation='relu'))
#model.add(Conv2D(16, kernel_size=3, activation='relu'))

#model.add(MaxPooling2D(pool_size=(2, 2)))
"""model.add(ZeroPadding2D((1,1)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))

model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(ZeroPadding2D((1,1)))
model.add(Conv2D(32, kernel_size=3, activation='relu', input_shape=(256,256,3)))
model.add(ZeroPadding2D((1,1)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))

model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2)))
model.add(Dropout(0.2))

model.add(ZeroPadding2D((1,1)))
model.add(Conv2D(32, kernel_size=3, activation='relu', input_shape=(256,256,3)))
model.add(ZeroPadding2D((1,1)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))"""

model.add(Flatten(input_shape=(256,256,1)))

model.add(Dense(128, activation='softmax'))
model.add(Dense(6, activation='softmax'))

#Compiling
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#train the model
model.fit(X_train_hair_bw, Y_train_hair_bw, validation_data=(X_test_hair_bw, Y_test_hair_bw), epochs=1)

Train on 3121 samples, validate on 781 samples
Epoch 1/1


In [2]:
model.evaluate(X_test_hair, Y_test_hair)

NameError: name 'model' is not defined

In [None]:
model.predict(pics_hair_bw[200:210])

In [3]:
df2 = df[df['hair_color'] != -1] 
df2.shape


NameError: name 'df' is not defined

In [None]:

pics_hair = import_pictures(df2, _as_black_white=False)
[width, height] = pics_hair.size 