# DSG: MultinomialNB

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score, train_test_split,StratifiedKFold, StratifiedShuffleSplit, KFold
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from lib_DSG import ColumnSelector, DenseTransformer

folder = os.getcwd() ; print folder

/home/arda/Documents/youtube


# Importing data

In [2]:
pd_train = pd.read_csv('./data/train_sample_munged.csv', header=0, escapechar='\\', quotechar='"', low_memory=False)
pd_test = pd.read_csv('./data/test_sample_munged.csv', header=0, escapechar='\\', quotechar='"', low_memory=False, )

In [3]:
pd_train = pd_train.fillna('')
pd_test = pd_test.fillna('')

In [12]:
from unidecode import unidecode

In [14]:
pd_train["title"].apply(lambda r: unidecode(r))

  if __name__ == '__main__':


0         Disneyland Railroad at night and first look at...
1                                       David Kills Goliath
2         Downtown Disney | Walt Disney World | Parques ...
3                                    The Renaissance Faire!
4         On board Quantum of the Seas - North Star | Ri...
5                                   SHARMEEN PITHI DANCES 1
6           The CAR Carrier : Indian Railways Freight Wagon
7                     Florence, Italy: Michelangelo's David
8                           Dubrovnik and Balkan Side-Trips
9                                   McShrimp and Egg Burger
10        Gruzja 4x4  Georgia Expedition - Mestia, Ushgu...
11        VIP Terror Tram at Halloween Horror Nights 201...
12        Instant pedicure with Japanese nail art pantyh...
13                        Pour toi aussi c'est possible !!!
14               Krake front seat on-ride HD POV Heide Park
15                    Tamborrada: The Beat of a Basque Drum
16        "ChiAskie" zupki - Tanie A1/4a

In [None]:
pd_train["title"].apply(lambda r: [stemmer.stem(word) for word in r.split(" ") ]  )

In [6]:
from nltk import PorterStemmer

In [7]:
stemmer = PorterStemmer()

  if word[-1] == 's':


UnicodeDecodeError: 'ascii' codec can't decode byte 0xe6 in position 0: ordinal not in range(128)

In [153]:
print pd_train.columns

Index([u'video_category_id', u'title', u'description', u'published_at',
       u'viewCount', u'likeCount', u'dislikeCount', u'favoriteCount',
       u'commentCount', u'duration', u'dimension', u'definition', u'caption',
       u'licensedContent', u'topicIds', u'relevantTopicIds', u'dimension_2d',
       u'dimension_3d', u'definition_hd', u'definition_sd'],
      dtype='object')


# First model

In [230]:
dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{1,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }
tfv_title = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 2), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_desc = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 2), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_topicid = TfidfVectorizer(lowercase=True, stop_words=None, token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_rel_topic = TfidfVectorizer(lowercase=True, stop_words=None, token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

clf = MultinomialNB(alpha=0.05, fit_prior=True, class_prior=None)

title_pipe = make_pipeline(ColumnSelector(key='title'), tfv_title)
desc_pipe = make_pipeline(ColumnSelector(key='description'), tfv_desc)
topicId_pipe = make_pipeline(ColumnSelector(key=u'topicIds'), tfv_topicid)
reltopicID_pipe = make_pipeline(ColumnSelector(key=u'relevantTopicIds'), tfv_rel_topic)

pipeline = make_union(title_pipe, desc_pipe, topicId_pipe, reltopicID_pipe)
pipeline.transformer_weights=[1, 1, 1.5, 1.5]

In [231]:
Y = pd_train[u'video_category_id'].values
X = pipeline.fit_transform(pd_train) ; print X.shape
X_test = pipeline.transform(pd_test)

(239225, 660606)


In [232]:
alphas = np.arange(0.01, 0.2, 0.03)
alphas = [1e-3, 1e-2, 1e-1, 1, 5]
print alphas

[0.001, 0.01, 0.1, 1, 5]


In [233]:
results=[]
for i in alphas:
    clf.alpha = i

    skf = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
    scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)

    print ("SSS: acc: %0.4f, std: %0.4f, alpha: %s" %(scores_sss.mean(), scores_sss.std(), i))

SSS: acc: 0.7827, std: 0.0018, alpha: 0.001
SSS: acc: 0.7862, std: 0.0021, alpha: 0.01
SSS: acc: 0.7852, std: 0.0019, alpha: 0.1
SSS: acc: 0.7630, std: 0.0020, alpha: 1
SSS: acc: 0.7321, std: 0.0022, alpha: 5


# Final dump

# SUBMIT KAGGLE

In [234]:
clf.alpha = 0.01

In [235]:
clf.fit(X,Y)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [236]:
y_pred = clf.predict(X_test)

In [241]:
submit = pd.DataFrame(index=None)
submit['id']=pd_test['id']
submit['Pred']=y_pred

In [242]:
submit.to_csv(folder+'/results/0.arda1.csv',sep=';',index=None)

# FINAL DUMP

In [101]:
clf.alpha = 0.01

In [102]:
final_pipe = make_pipeline(pipeline, clf)

In [None]:
final_pipe.fit