In [5]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import cross_validation
from sklearn.decomposition import PCA

<h1> Loading a data frame with business ids and restaurant labels

In [6]:
# Read the label table. `pd.DataFrame.from_csv` is deprecated (and removed in
# pandas 1.0); `read_csv(..., index_col=0)` reproduces its default of using the
# first CSV column as the index, and `reset_index()` turns that index back into
# an ordinary column, exactly as the previous in-place reset did.
# (from_csv also tried to parse the index as dates; that is intentionally not
# carried over, since the first column is later used as a merge key.)
df_train_labels = pd.read_csv('train.csv', index_col=0).reset_index()
df_train_labels.head(5)

Unnamed: 0,business_id,labels
0,1000,1 2 3 4 5 6 7
1,1001,0 1 6 8
2,100,1 2 4 5 6 7
3,1006,1 2 4 5 6
4,1010,0 6 8


<h1> Loading mean CNN codes

<h2> Features used in classifying

In [9]:
def df_train_for_classifier(layer, df_train_labels):
    """Build the feature matrix and binary label matrix for one CNN layer.

    Loads the pickled frame named ``'df_train_' + layer``, merges it with
    ``df_train_labels`` on ``business_id``, drops every row containing a
    missing value, and converts the remaining rows to NumPy arrays.

    Parameters
    ----------
    layer : str
        Suffix of the pickle file to load (e.g. ``'fc6'`` loads
        ``df_train_fc6`` from the working directory).
    df_train_labels : pandas.DataFrame
        Frame with a ``business_id`` column and a ``labels`` column holding
        whitespace-separated label tokens.

    Returns
    -------
    data : numpy.ndarray
        One row per retained business, built from its ``meanCNN`` entry.
    labels : numpy.ndarray
        Indicator matrix produced by ``MultiLabelBinarizer`` from the parsed
        label tokens, row-aligned with ``data``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    features = pd.read_pickle('df_train_' + layer)
    features = features.reset_index(level=0)

    merged = pd.merge(features, df_train_labels, on='business_id')
    merged = merged.dropna(how='any')

    # Materialise each per-business code vector as a list so np.array stacks
    # them into a single 2-D array.
    data = np.array([list(codes) for codes in merged['meanCNN']])

    # Parse labels eagerly into real lists: a bare `map(...)` is a single-use
    # iterator on Python 3. `split()` (no argument) also tolerates repeated or
    # trailing whitespace, which `split(' ')` would turn into empty tokens.
    label_sets = [[float(token) for token in text.split()]
                  for text in merged['labels']]
    labels = MultiLabelBinarizer().fit_transform(label_sets)

    return data, labels

In [10]:
# Each call returns its own label matrix. Keep them under separate names so
# they can be compared, instead of silently overwriting `train_labels`.
train_data_fc6, train_labels_fc6 = df_train_for_classifier('fc6', df_train_labels)
train_data_fc7, train_labels_fc7 = df_train_for_classifier('fc7', df_train_labels)
train_data_fc8, train_labels_fc8 = df_train_for_classifier('fc8', df_train_labels)
train_data_prob, train_labels_prob = df_train_for_classifier('prob', df_train_labels)

# The feature matrices are later stacked column-wise and scored against one
# shared label matrix, which is only valid if every layer kept the same rows.
for _layer_labels in (train_labels_fc6, train_labels_fc7, train_labels_fc8):
    assert np.array_equal(_layer_labels, train_labels_prob), \
        'label matrices differ between CNN layers; rows are not aligned'

# Same value as before: the labels returned by the last ('prob') call.
train_labels = train_labels_prob

In [14]:
# Candidate feature sets: the four single-layer matrices first, followed by
# every column-wise concatenation of two, three and four layers.
single_layer_features = [train_data_fc6, train_data_fc7, train_data_fc8, train_data_prob]

layer_groups_to_stack = [
    # pairs
    (train_data_fc6, train_data_fc7),
    (train_data_fc7, train_data_fc8),
    (train_data_fc8, train_data_prob),
    (train_data_fc6, train_data_fc8),
    (train_data_fc6, train_data_prob),
    (train_data_fc7, train_data_prob),
    # triples
    (train_data_fc6, train_data_fc7, train_data_fc8),
    (train_data_fc7, train_data_fc8, train_data_prob),
    (train_data_fc6, train_data_fc8, train_data_prob),
    (train_data_fc6, train_data_fc7, train_data_prob),
    # all four layers
    (train_data_fc6, train_data_fc7, train_data_fc8, train_data_prob),
]

features_combinations = single_layer_features + [
    np.hstack(group) for group in layer_groups_to_stack
]

In [19]:
# Display names for the feature sets, listed in the same order as the
# entries of `features_combinations`.
features_names = [
    # single layers
    'fc6', 'fc7', 'fc8', 'prob',
    # pairs
    'fc6_fc7', 'fc7_fc8', 'fc8_prob', 'fc6_fc8', 'fc6_prob', 'fc7_prob',
    # triples
    'fc6_fc7_fc8', 'fc7_fc8_prob', 'fc6_fc8_prob', 'fc6_fc7_prob',
    # all four layers
    'fc6_fc7_fc8_prob',
]

<h1> Training classifiers

In [20]:
## Cross-validation
# Size the 5-fold splitter from the label matrix rather than a literal row
# count, so it stays correct if the number of training rows changes.
kf = cross_validation.KFold(n = len(train_labels), n_folds = 5)

<h2> Logistic Regression

<h3> Testing performance of PCA decomposition on features

In [21]:
def cv_pca_decomp(clf, data, labels, n_comp_ar, n_folds=5, scoring='f1_weighted'):
    """Compare cross-validated scores with and without PCA reduction.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``cross_val_score``.
    data : numpy.ndarray
        2-D feature matrix (rows are samples).
    labels : array-like
        Targets, row-aligned with ``data``.
    n_comp_ar : iterable of int
        PCA component counts to try.
    n_folds : int, optional
        Number of cross-validation folds (default 5, as before).
    scoring : str, optional
        Scorer name. Defaults to ``'f1_weighted'``: plain ``'f1'`` selects
        binary averaging, which does not apply to a multilabel indicator
        matrix (older scikit-learn releases fell back to weighted averaging
        with a deprecation warning; newer ones raise).

    Returns
    -------
    meanscore_def : float
        Mean score over the folds on the unreduced ``data``.
    meanscores_pca : list of float
        Mean score per entry of ``n_comp_ar``; ``numpy.nan`` where the
        requested component count is not smaller than both dimensions of
        ``data``.
    """
    n_samples, n_features = data.shape
    # Derive the fold sizes from the data instead of a hard-coded row count.
    kf = cross_validation.KFold(n=n_samples, n_folds=n_folds)

    def _mean_cv_score(features):
        # Mean of the per-fold scores for one feature representation.
        fold_scores = cross_validation.cross_val_score(
            clf, features, labels, cv=kf, scoring=scoring)
        return np.mean(fold_scores)

    meanscore_def = _mean_cv_score(data)

    # PCA cannot yield more components than min(n_samples, n_features);
    # checking only the feature count let oversized requests through.
    max_components = min(n_samples, n_features)

    meanscores_pca = []
    for n in n_comp_ar:
        if n < max_components:
            # NOTE(review): PCA is fitted on all rows before the folds are
            # drawn, so held-out rows influence the projection. Kept as-is to
            # stay comparable with earlier results.
            data_reduced = PCA(n_components=n).fit_transform(data)
            meanscores_pca.append(_mean_cv_score(data_reduced))
        else:
            meanscores_pca.append(np.nan)

    return meanscore_def, meanscores_pca

In [22]:
# One-vs-rest wrapper: a separate logistic-regression model per label column.
clf = OneVsRestClassifier(estimator=LogisticRegression())

In [23]:
# PCA component counts to evaluate for each feature set.
n_comp_ar = [
    100,
    250,
    500,
    1000,
    2000,
]

In [25]:
# `zip` stops at the shorter input, so a mismatch between the two
# hand-maintained lists would silently drop feature sets; fail loudly instead.
assert len(features_combinations) == len(features_names), \
    'features_combinations and features_names must have the same length'

# Map each feature-set name to [score without PCA, list of scores per PCA size].
cv_score_pca_log_reg = {}
for data, name in zip(features_combinations, features_names):
    # Fresh classifier per feature set so no fitted state carries over.
    clf = OneVsRestClassifier(LogisticRegression())
    meanscore_def, meanscores_pca = cv_pca_decomp(clf, data, train_labels, n_comp_ar)
    cv_score_pca_log_reg[name] = [meanscore_def, meanscores_pca]

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample

In [30]:
# Tabulate the results: one column per feature-set name; the first row holds
# the score without PCA and the second row the list of per-PCA-size scores.
df_cv_score_pca_log_reg = pd.DataFrame(data=cv_score_pca_log_reg)

In [31]:
# Persist the score table in the working directory so it can be reloaded
# without repeating the cross-validation runs.
df_cv_score_pca_log_reg.to_pickle(path='df_cv_score_pca_log_reg')

<h2> SVC with linear kernel

In [12]:
# One-vs-rest wrapper around a linear-kernel SVC with probability estimates
# enabled, i.e. one SVC per label column.
clf_svc = OneVsRestClassifier(
    estimator=SVC(kernel='linear', probability=True),
)

In [None]:
# NOTE(review): `train_data` is not assigned in any cell visible in this
# notebook; bind it to the intended feature matrix (e.g. one entry of
# `features_combinations`) before running this cell.
# 'f1_weighted' replaces plain 'f1', which selects binary averaging and does
# not apply to a multilabel indicator target.
scores = cross_validation.cross_val_score(clf_svc, train_data, train_labels, cv = kf, scoring = 'f1_weighted')

In [None]:
# Display the per-fold scores returned by cross_val_score.
scores