In [14]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import cross_validation

<h1>Loading a data frame with business ids and restaurant labels</h1>

In [4]:
# Read the business_id -> labels table.
# `DataFrame.from_csv` is deprecated (and removed in pandas 1.0); it also forced
# the first column into the index, which then had to be undone with an in-place
# `reset_index`. `read_csv` keeps `business_id` as an ordinary column directly,
# so the cell is idempotent on re-run.
df_train_labels = pd.read_csv('train.csv')
df_train_labels.head(5)

Unnamed: 0,business_id,labels
0,1000,1 2 3 4 5 6 7
1,1001,0 1 6 8
2,100,1 2 4 5 6 7
3,1006,1 2 4 5 6
4,1010,0 6 8


<h1>Loading mean CNN codes</h1>

In [5]:
# Layer tag used to build the name of the pickled feature frame loaded below.
layer = "fc7"

In [6]:
# Load the pickled frame of mean CNN codes for the selected layer and attach
# the label strings to it.
#
# * `pd.load` has been removed from pandas; `pd.read_pickle` is the replacement.
# * NOTE(review): unpickling can execute arbitrary code -- only load files that
#   were produced by a trusted earlier step of this project.
# * The steps are chained without `inplace=True` so re-running the cell always
#   starts from the file on disk instead of mutating an existing frame.
name_df_train = 'df_train_' + layer
df_train = (
    pd.read_pickle(name_df_train)
    # Move index level 0 into a column; presumably this is `business_id`,
    # since the merge below joins on that name.
    .reset_index(level=0)
    .merge(df_train_labels, on='business_id')
    # Drop rows with any missing value (e.g. a business without labels).
    .dropna(how='any')
)

df_train.head(5)



Unnamed: 0,business_id,meanCNN,labels
0,3,"[0.857319, 0.447813, 0.0, 0.0737472, 0.421991,...",0 8
1,4,"[0.566141, 1.00486, 0.329898, 0.494994, 1.1129...",1 2 4 5 6
2,5,"[1.28208, 0.882854, 0.328062, 0.583805, 0.6321...",1 2 4 5 6 7
3,6,"[0.271841, 1.00067, 0.256874, 0.390041, 0.5149...",0 1 6 8
4,7,"[0.216943, 0.746779, 0.0136863, 1.43696, 0.0, ...",0 1 8


In [7]:
# Grab the feature column, then store every mean CNN code back as a plain
# Python list.
train_data = df_train['meanCNN']
codes_as_lists = train_data.apply(lambda code: list(code))
df_train['meanCNN'] = codes_as_lists

In [8]:
# Stack the per-row code lists into a single 2-D NumPy array
# (one row per business, one column per feature).
train_data = np.array(df_train['meanCNN'].tolist())
train_data.shape

(1996L, 4096L)

In [9]:
# Parse each space-separated label string into a list of label ids and
# binarise them into a (n_samples, n_classes) indicator matrix.
#
# * A list comprehension is used instead of `map(...)`: on Python 3 `map`
#   returns a one-shot iterator, which would leave exhausted iterators stored
#   in the column after the binarizer has consumed them.
# * `split()` (no argument) tolerates repeated or trailing whitespace, where
#   `split(' ')` would yield empty tokens and make `float('')` raise.
# * Values stay floats so the contents of `df_train['labels']` are unchanged;
#   the resulting indicator matrix does not depend on the numeric type.
df_train['labels'] = df_train['labels'].apply(
    lambda raw: [float(token) for token in raw.split()]
)
train_labels = MultiLabelBinarizer().fit_transform(df_train['labels'])
train_labels[0:5]

array([[1, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 1, 1, 1, 0, 0],
       [0, 1, 1, 0, 1, 1, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 1, 0, 1],
       [1, 1, 0, 0, 0, 0, 0, 0, 1]])

<h1>Training a classifier</h1>

In [26]:
## Cross-validation
# Two-fold split over all training rows. The sample count is taken from the
# feature matrix rather than hard-coded as 1996, so the folds stay valid if
# the number of rows changes upstream (e.g. a different dropna outcome).
kf = cross_validation.KFold(n = len(train_data), n_folds = 2)

<h2>Logistic Regression</h2>

In [27]:
# One independent logistic-regression model per label (one-vs-rest).
clf = OneVsRestClassifier(estimator=LogisticRegression())

In [28]:
# Cross-validated F1 for the logistic-regression model.
# `train_labels` is a multilabel indicator matrix, so the plain 'f1' scorer
# (binary averaging) is ambiguous: older scikit-learn releases silently fell
# back to weighted averaging with a deprecation warning, and later releases
# raise an error. Naming the averaging explicitly keeps the same numbers and
# removes the dependence on that deprecated fallback.
scores = cross_validation.cross_val_score(clf, train_data, train_labels, cv = kf, scoring = 'f1_weighted')

In [29]:
# Display the scores returned by cross_val_score (one value per fold).
scores

array([ 0.81617346,  0.80930784])

<h2>SVC with linear kernel</h2>

In [30]:
# One linear-kernel SVC per label (one-vs-rest), with probability estimates enabled.
clf_svc = OneVsRestClassifier(estimator=SVC(probability=True, kernel='linear'))

In [31]:
# Cross-validated F1 for the linear SVC model.
# `train_labels` is a multilabel indicator matrix, so the plain 'f1' scorer
# (binary averaging) is ambiguous: older scikit-learn releases silently fell
# back to weighted averaging with a deprecation warning, and later releases
# raise an error. Naming the averaging explicitly keeps the same numbers and
# removes the dependence on that deprecated fallback.
scores = cross_validation.cross_val_score(clf_svc, train_data, train_labels, cv = kf, scoring = 'f1_weighted')

In [32]:
# Display the scores returned by cross_val_score for clf_svc (one value per fold).
scores

array([ 0.80465431,  0.80184264])