# DSG: MultinomialNB

In [173]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score, train_test_split,StratifiedKFold, StratifiedShuffleSplit, KFold
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from lib_DSG import ColumnSelector, DenseTransformer

folder = os.getcwd() ; print folder

/home/arda/Documents/youtube


# Importing data

In [174]:
pd_train = pd.read_csv('./data/train_sample_munged.csv', header=0, escapechar='\\', quotechar='"', low_memory=False, error_bad_lines=False)
pd_test = pd.read_csv('./data/test_sample_munged.csv', header=0, escapechar='\\', quotechar='"', low_memory=False, error_bad_lines=False )

In [175]:
pd_train = pd_train.fillna('')
pd_test = pd_test.fillna('')

# First model

In [176]:
dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{1,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_3char': '(?u)\\b\\w+\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }

stopwords = [u'http', u'com', u'www', u's', u'subscribe'
, u'new', u'like', u'watch', u't', u'2014', u'1'
, u'2013', u'2', u'la', u'en'
, u'world', u'make', u'v', u'check', u'time'
, u'10', u'best', u'3', u'5', u'day', u'y']
tfv_title = TfidfVectorizer(lowercase=True, stop_words=stopwords, token_pattern=dico_pattern["match_word"], 
                      ngram_range=(1, 2), max_df=0.5, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_desc = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word"], 
                      ngram_range=(1, 2), max_df=0.5, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_topicid = TfidfVectorizer(lowercase=True, stop_words=None, token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_rel_topic = TfidfVectorizer(lowercase=True, stop_words=None, token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

clf = MultinomialNB(alpha=0.05, fit_prior=True, class_prior=None)

title_pipe = make_pipeline(ColumnSelector(key='title'), tfv_title)
desc_pipe = make_pipeline(ColumnSelector(key='description'), tfv_desc)
topicId_pipe = make_pipeline(ColumnSelector(key=u'topicIds'), tfv_topicid)
reltopicID_pipe = make_pipeline(ColumnSelector(key=u'relevantTopicIds'), tfv_rel_topic)

pipeline = make_union(title_pipe, desc_pipe, topicId_pipe, reltopicID_pipe)
pipeline.transformer_weights=[1, 1, 1, 1]


In [177]:
Y = pd_train[u'video_category_id'].values
X = pipeline.fit_transform(pd_train) ; print X.shape
X_test = pipeline.transform(pd_test)

(239225, 657494)


In [178]:
alphas = np.arange(0.01, 0.2, 0.03)
alphas = [1e-3, 1e-2, 1e-1, 1, 5]
print alphas

[0.001, 0.01, 0.1, 1, 5]


In [179]:
results=[]
for i in alphas:
    clf.alpha = i

    sss = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
    scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)

    print ("SSS: acc: %0.4f, std: %0.4f, alpha: %s" %(scores_sss.mean(), scores_sss.std(), i))

SSS: acc: 0.7797, std: 0.0018, alpha: 0.001
SSS: acc: 0.7833, std: 0.0018, alpha: 0.01
SSS: acc: 0.7824, std: 0.0018, alpha: 0.1
SSS: acc: 0.7599, std: 0.0020, alpha: 1
SSS: acc: 0.7291, std: 0.0020, alpha: 5


# OTHER MODEL

In [59]:
sentence = pd_train.apply(lambda r: (r['title'] + r['description']), axis=1)

In [130]:
stopwords = [u'http', u'com', u'www', u's', u'subscribe'
, u'new', u'like', u'watch', u't', u'2014', u'1'
, u'2013', u'2', u'la', u'en'
, u'world', u'make', u'v', u'check', u'time'
, u'10', u'best', u'3', u'5', u'day', u'y']

dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{1,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_3char': '(?u)\\b\\w+\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }
tfv = TfidfVectorizer(lowercase=True, stop_words=stopwords, token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 2), max_df=0.5, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

clf = MultinomialNB(alpha=0.05, fit_prior=True, class_prior=None)

X = tfv.fit_transform(sentence)
print X.shape

TypeError: cannot concatenate 'str' and 'list' objects

(239225, 635525)


In [139]:
numFeat = 40

Features = np.array(tfv.get_feature_names())
sorted_indices = np.argsort(np.array(X.sum(0))[0])[::-1]
rankFeatures = Features[sorted_indices][:numFeat]
print rankFeatures

[u'your recreations' u'\u0e2d\u0e1e' u'\u043e\u043d\u043e' u'mieser'
 u'hayes tattoo'
 u'\u0646\u0642\u0648\u0645 \u0628\u0639\u0631\u0636\u0647\u0627'
 u'\u0441\u043e\u0431\u0430\u043a\u0435'
 u'\u044d\u0434\u0438\u043b\u044c\u0431\u0430\u0439'
 u'\u043f\u0440\u0438\u043d\u044f\u0442\u0438\u0435' u'undocumented' u'ztv'
 u'west films' u'\u043c\u0438\u043d\u0443\u0441 \u043a\u043c'
 u'\u043e\u0431 \u043e\u0434\u043d\u043e\u043c' u'shirt jcanell'
 u'\u0645\u062a\u0631\u062c\u0645'
 u'\u0644\u0627 \u0645\u0643\u0631\u0645\u0629'
 u'\u0432\u0438\u0434\u0430\u043c' u'\u0434\u0442\u043f'
 u'\u0434\u043e\u0447\u043a\u0438' u'minecraftwhat'
 u'\u0645\u0635\u0637\u0641\u0649' u'\u043e\u043d\u0438 \u043d\u0435'
 u'witch minecraft' u'top 5who' u'country it'
 u'\u0644\u0644\u0642\u0648\u0627\u062a' u'with japan' u'take down'
 u'krave cereal' u'\u043c\u043e\u043d\u0442\u0430\u0436'
 u'\u043c\u043d\u043e\u0439' u'\u043a\u043e\u043b\u0435\u0439'
 u'\u062d\u0633\u0627\u0628\u064a \u0641\u064a'
 u'\u06

In [128]:
alphas = np.arange(0.01, 0.2, 0.03)
alphas = [1e-3, 1e-2, 1e-1, 1, 5]
print alphas

[0.001, 0.01, 0.1, 1, 5]


In [129]:
results=[]
for i in alphas:
    clf.alpha = i
    sss = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
    scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)
    print ("SSS: acc: %0.4f, std: %0.4f, alpha: %s" %(scores_sss.mean(), scores_sss.std(), i))

SSS: acc: 0.7361, std: 0.0012, alpha: 0.001
SSS: acc: 0.7407, std: 0.0014, alpha: 0.01
SSS: acc: 0.7372, std: 0.0011, alpha: 0.1
SSS: acc: 0.6846, std: 0.0021, alpha: 1
SSS: acc: 0.5955, std: 0.0019, alpha: 5


# Final dump

# SUBMIT KAGGLE

In [162]:
X_test = pipeline.transform(pd_test)

In [180]:
clf.alpha = 0.01

In [181]:
clf.fit(X,Y)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [182]:
y_pred = clf.predict(X_test)

In [183]:
submit = pd.DataFrame(index=None)
submit['id']=pd_test['id']
submit['Pred']=y_pred

In [184]:
submit.to_csv(folder+'/results/0.arda1.csv',sep=';',index=None)

# FINAL DUMP

In [101]:
clf.alpha = 0.01

In [102]:
final_pipe = make_pipeline(pipeline, clf)

In [None]:
final_pipe.fit