In [16]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import cross_validation
from sklearn.decomposition import PCA

<h1> Loading a data frame with business ids and restaurant labels

In [2]:
# Load the business_id -> labels table.
# `DataFrame.from_csv` is deprecated (and removed in later pandas releases);
# `read_csv(index_col=0)` reads the first CSV column as the index in the same
# way. The index is then turned back into an ordinary column, without
# mutating in place, so later cells can merge on 'business_id'.
df_train_labels = pd.read_csv('train.csv', index_col=0).reset_index(level=0)
df_train_labels.head(5)

Unnamed: 0,business_id,labels
0,1000,1 2 3 4 5 6 7
1,1001,0 1 6 8
2,100,1 2 4 5 6 7
3,1006,1 2 4 5 6
4,1010,0 6 8


<h1> Loading mean CNN codes

<h2> Features used in classifying

In [53]:
def df_train_for_classifier(layer, df_train_labels):
    """Build the feature matrix and label-indicator matrix for one CNN layer.

    Loads the pickled frame named ``'df_train_' + layer`` from the working
    directory, joins it with ``df_train_labels`` on ``business_id`` and drops
    every row that contains a missing value.

    Parameters
    ----------
    layer : str
        Suffix of the pickle file name, e.g. ``'fc6'``.
    df_train_labels : pandas.DataFrame
        Frame with a ``business_id`` column and a ``labels`` column holding
        space-separated label ids (e.g. ``'1 2 3'``). It is not modified.

    Returns
    -------
    data : numpy.ndarray
        One entry per remaining row, built from that row's ``meanCNN`` value.
    labels : numpy.ndarray
        Output of ``MultiLabelBinarizer().fit_transform`` on the parsed
        label lists. The binarizer is fitted afresh on every call.
    """
    # `pd.load` was removed from pandas; `pd.read_pickle` is its replacement.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle('df_train_' + layer)
    df = df.reset_index(level=0)
    df = pd.merge(df, df_train_labels, on='business_id')
    df = df.dropna(how='any')

    # Build the outputs directly instead of assigning columns on the frame
    # returned by dropna (avoids chained-assignment warnings).
    data = np.array([list(x) for x in df['meanCNN']])

    # Materialise each row's labels as a list so they can be iterated more
    # than once regardless of how `map` behaves in the running Python version.
    label_lists = [list(map(float, x.split(' '))) for x in df['labels']]
    labels = MultiLabelBinarizer().fit_transform(label_lists)

    return data, labels

In [54]:
# Build one feature matrix per CNN layer. Every call rebinds `train_labels`,
# so only the label matrix returned by the last ('prob') call is kept.
# NOTE(review): this assumes all four layers yield the same rows in the same
# order after the merge/dropna inside df_train_for_classifier -- confirm.
train_data_fc6, train_labels = df_train_for_classifier('fc6', df_train_labels)
train_data_fc7, train_labels = df_train_for_classifier('fc7', df_train_labels)
train_data_fc8, train_labels = df_train_for_classifier('fc8', df_train_labels)
train_data_prob, train_labels = df_train_for_classifier('prob', df_train_labels)

<h1> Training a classifier

In [55]:
## Cross-validation
# Module-level 5-fold splitter (no shuffling requested) used by the cells
# that score the merged features and the SVC. The sample count is taken from
# the label matrix instead of being hard-coded as 1996, so the splitter stays
# valid if the training set changes.
kf = cross_validation.KFold(n = len(train_labels), n_folds = 5)

<h2> Logistic Regression

<h3> Testing performance of PCA decomposition on features

In [56]:
def cv_pca_decomp(clf, data, labels, n_comp_ar):
    """Compare cross-validated F1 on raw features and on PCA-reduced features.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``cross_val_score``.
    data : array-like
        Feature matrix, one row per sample.
    labels : array-like
        Targets aligned with the rows of ``data``.
    n_comp_ar : iterable of int
        PCA component counts to evaluate.

    Returns
    -------
    meanscore_def : float
        Mean F1 over 5 folds using ``data`` unchanged.
    meanscores_pca : list of float
        Mean F1 over the same folds for each entry of ``n_comp_ar``, in order.
    """
    # Size the splitter from the data actually passed in; the previous
    # hard-coded n = 1996 was only correct for one particular training set.
    kf = cross_validation.KFold(n = len(data), n_folds = 5)

    def _mean_cv_f1(features):
        # Mean of the per-fold F1 scores for one feature matrix.
        return np.mean(cross_validation.cross_val_score(clf, features, labels, cv = kf, scoring = 'f1'))

    meanscore_def = _mean_cv_f1(data)

    # NOTE(review): PCA is fitted on all rows before the folds are split, so
    # each fold's held-out rows influence the projection. Wrapping PCA and
    # clf in a Pipeline would remove that leakage but would change the
    # reported scores.
    meanscores_pca = [
        _mean_cv_f1(PCA(n_components=n).fit_transform(data))
        for n in n_comp_ar
    ]

    return meanscore_def, meanscores_pca

<h4> fc6 layer

In [57]:
# One-vs-rest wrapper around a default LogisticRegression, used for the fc6 sweep.
clf = OneVsRestClassifier(LogisticRegression())

In [58]:
# PCA component counts to try on the fc6 features.
# NOTE(review): 2000 is larger than the 1996 training rows used elsewhere in
# this notebook, and PCA cannot yield more components than samples -- confirm
# how the installed scikit-learn handles that entry.
n_comp_ar_fc6 = [100, 250, 500, 1000, 2000]

In [59]:
# Baseline mean F1 on the raw fc6 features, plus one mean F1 per PCA size in n_comp_ar_fc6.
meanscore_def_fc6, meanscores_pca_fc6 = cv_pca_decomp(clf, train_data_fc6, train_labels, n_comp_ar_fc6)

In [77]:
meanscore_def_fc6

0.80221692173947312

In [78]:
meanscores_pca_fc6

[0.8218118289853743,
 0.79799529103833633,
 0.73070771747756902,
 0.73005789762394779,
 0.7651011211350921]

<h4> fc7 layer

In [60]:
# Fresh one-vs-rest logistic regression for the fc7 sweep (same configuration as for fc6).
clf = OneVsRestClassifier(LogisticRegression())

In [61]:
# PCA component counts to try on the fc7 features.
# NOTE(review): 2000 is larger than the 1996 training rows used elsewhere in
# this notebook, and PCA cannot yield more components than samples -- confirm
# how the installed scikit-learn handles that entry.
n_comp_ar_fc7 = [100, 250, 500, 1000, 2000]

In [62]:
# Baseline mean F1 on the raw fc7 features, plus one mean F1 per PCA size in n_comp_ar_fc7.
meanscore_def_fc7, meanscores_pca_fc7 = cv_pca_decomp(clf, train_data_fc7, train_labels, n_comp_ar_fc7)

In [75]:
meanscore_def_fc7

0.81775497031085409

In [76]:
meanscores_pca_fc7

[0.82669733955585234,
 0.80373275431291202,
 0.77140382030287136,
 0.7818206988484766,
 0.80902744619752853]

<h4> fc8 layer

In [65]:
# Fresh one-vs-rest logistic regression for the fc8 sweep (same configuration as for fc6).
clf = OneVsRestClassifier(LogisticRegression())

In [66]:
# PCA component counts to try on the fc8 features.
# NOTE(review): 2000 is larger than the 1996 training rows used elsewhere in
# this notebook, and PCA cannot yield more components than samples (or than
# input features) -- confirm which entries are actually distinct for fc8.
n_comp_ar_fc8 = [100, 250, 500, 1000, 2000]

In [67]:
# Baseline mean F1 on the raw fc8 features, plus one mean F1 per PCA size in n_comp_ar_fc8.
meanscore_def_fc8, meanscores_pca_fc8 = cv_pca_decomp(clf, train_data_fc8, train_labels, n_comp_ar_fc8)

In [73]:
meanscore_def_fc8

0.82350867676710282

In [74]:
meanscores_pca_fc8

[0.8258969760951016,
 0.8123537749057661,
 0.80430028091461792,
 0.80941761717133553,
 0.82246136336677333]

<h4> prob layer

In [68]:
# Fresh one-vs-rest logistic regression for the prob sweep (same configuration as for fc6).
clf = OneVsRestClassifier(LogisticRegression())

In [69]:
# PCA component counts to try on the prob features. This grid is shorter than
# the one used for fc6/fc7/fc8.
# NOTE(review): presumably because the prob layer has fewer dimensions -- confirm.
n_comp_ar_prob = [100, 250, 500]

In [70]:
# Baseline mean F1 on the raw prob features, plus one mean F1 per PCA size in n_comp_ar_prob.
meanscore_def_prob, meanscores_pca_prob = cv_pca_decomp(clf, train_data_prob, train_labels, n_comp_ar_prob)

In [71]:
meanscore_def_prob

0.82048936072684175

In [72]:
meanscores_pca_prob

[0.82362399585343427, 0.81815075811955995, 0.8181896981818344]

<h2> Feature selection

In [81]:
# Project the prob and fc8 features onto 100 principal components each; 100
# was the best-scoring size for both layers in the sweeps recorded above.
# Each PCA is fitted on the full training matrix it transforms.
train_data_prob_pca = PCA(n_components=100).fit_transform(train_data_prob)
train_data_fc8_pca = PCA(n_components=100).fit_transform(train_data_fc8)

In [124]:

# Pre-allocate a None-filled object array of shape (1996, 2, 100):
# 1996 rows, two feature slots per row, 100 values per slot.
# Note that the result is 3-D.
train_data_merged = np.empty((1996, 2, 100), dtype=object)

In [125]:
# Fill the two slots along axis 1: slot 0 gets the prob PCA features and
# slot 1 the fc8 PCA features. The array stays 3-D after this cell.
train_data_merged[:,0,:] = train_data_prob_pca
train_data_merged[:,1,:] = train_data_fc8_pca

In [130]:
# Rebind train_data_merged to the column-wise concatenation of the two PCA
# blocks (fc8 columns first, then prob). This replaces the 3-D array built in
# the preceding cells.
train_data_merged = np.hstack((train_data_fc8_pca, train_data_prob_pca))

In [131]:
# Fresh one-vs-rest logistic regression for scoring the merged fc8+prob features.
clf = OneVsRestClassifier(LogisticRegression())

In [None]:
# Mean F1 across the folds of the module-level splitter `kf` for the merged
# features. This cell has no execution count, so no result is recorded for it.
score = np.mean(cross_validation.cross_val_score(clf, train_data_merged, train_labels, cv = kf, scoring = 'f1'))

In [122]:
# NOTE(review): this cell's execution count (122) predates the np.hstack cell
# (130). The ValueError recorded below shows train_data_merged was still 3-D
# when it ran; in top-to-bottom order it receives the 2-D hstack result.
# The import belongs with the other imports at the top of the notebook.
from sklearn.ensemble import ExtraTreesClassifier
#from sklearn.feature_selection import SelectFromModel

# Fit an extra-trees ensemble with default settings on the merged features.
clf1 = ExtraTreesClassifier()
clf1 = clf1.fit(train_data_merged, train_labels)

ValueError: Found array with dim 3. Expected <= 2

<h2> SVC with linear kernel

In [12]:
# One-vs-rest wrapper around a linear-kernel SVC; probability = True asks the
# SVC to provide probability estimates.
clf_svc = OneVsRestClassifier(SVC(kernel = 'linear', probability = True))

In [None]:
# Per-fold F1 scores for the linear SVC, using the module-level splitter `kf`.
# NOTE(review): `train_data` is not defined in any visible cell (only
# train_data_fc6/fc7/fc8/prob and train_data_merged are), and this cell has no
# execution count -- confirm which feature matrix was intended.
scores = cross_validation.cross_val_score(clf_svc, train_data, train_labels, cv = kf, scoring = 'f1')

In [None]:
scores