# DSG: MultinomialNB

In [72]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, train_test_split,StratifiedKFold, StratifiedShuffleSplit, KFold
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from lib_DSG import ColumnSelector, DenseTransformer

folder = os.getcwd() ; print folder

/home/arda/Documents/youtube


# Importing data

In [73]:
pd_train = pd.read_csv('./data/train_sample_munged.csv', header=0, escapechar='\\', quotechar='"', low_memory=False, error_bad_lines=False)
pd_test = pd.read_csv('./data/test_sample_munged.csv', header=0, escapechar='\\', quotechar='"', low_memory=False, error_bad_lines=False )

In [74]:
pd_train = pd_train.fillna('')
pd_test = pd_test.fillna('')

In [76]:
pd_train['duration'].mean()

439.51987877521162

# First model

In [176]:
dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{1,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_3char': '(?u)\\b\\w+\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }

stopwords = [u'http', u'com', u'www', u's', u'subscribe'
, u'new', u'like', u'watch', u't', u'2014', u'1'
, u'2013', u'2', u'la', u'en'
, u'world', u'make', u'v', u'check', u'time'
, u'10', u'best', u'3', u'5', u'day', u'y']
tfv_title = TfidfVectorizer(lowercase=True, stop_words=stopwords, token_pattern=dico_pattern["match_word"], 
                      ngram_range=(1, 1), max_df=0.5, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_desc = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word"], 
                      ngram_range=(1, 1), max_df=0.5, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_topicid = TfidfVectorizer(lowercase=True, stop_words=None, token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_rel_topic = TfidfVectorizer(lowercase=True, stop_words=None, token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

clf = MultinomialNB(alpha=0.05, fit_prior=True, class_prior=None)

title_pipe = make_pipeline(ColumnSelector(key='title'), tfv_title)
desc_pipe = make_pipeline(ColumnSelector(key='description'), tfv_desc)
topicId_pipe = make_pipeline(ColumnSelector(key=u'topicIds'), tfv_topicid)
reltopicID_pipe = make_pipeline(ColumnSelector(key=u'relevantTopicIds'), tfv_rel_topic)

pipeline = make_union(title_pipe, desc_pipe, topicId_pipe, reltopicID_pipe)
pipeline.transformer_weights=[1, 1, 1, 1]


In [177]:
Y = pd_train[u'video_category_id'].values
X = pipeline.fit_transform(pd_train) ; print X.shape
X_test = pipeline.transform(pd_test)

(239225, 657494)


In [178]:
alphas = np.arange(0.01, 0.2, 0.03)
alphas = [1e-3, 1e-2, 1e-1, 1, 5]
print alphas

[0.001, 0.01, 0.1, 1, 5]


In [179]:
results=[]
for i in alphas:
    clf.alpha = i

    sss = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
    scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)

    print ("SSS: acc: %0.4f, std: %0.4f, alpha: %s" %(scores_sss.mean(), scores_sss.std(), i))

SSS: acc: 0.7797, std: 0.0018, alpha: 0.001
SSS: acc: 0.7833, std: 0.0018, alpha: 0.01
SSS: acc: 0.7824, std: 0.0018, alpha: 0.1
SSS: acc: 0.7599, std: 0.0020, alpha: 1
SSS: acc: 0.7291, std: 0.0020, alpha: 5


# MODEL CATEGORIEL

In [9]:
pd_train.columns

Index([u'video_category_id', u'title', u'description', u'published_at',
       u'viewCount', u'likeCount', u'dislikeCount', u'favoriteCount',
       u'commentCount', u'duration', u'dimension', u'definition', u'caption',
       u'licensedContent', u'topicIds', u'relevantTopicIds', u'dimension_2d',
       u'dimension_3d', u'definition_hd', u'definition_sd'],
      dtype='object')

In [14]:
cols = [u'video_category_id', u'title', u'description', u'published_at',
       u'viewCount', u'likeCount', u'dislikeCount', u'favoriteCount',
       u'commentCount', u'duration', u'dimension', u'definition', u'caption',
       u'licensedContent', u'topicIds', u'relevantTopicIds', u'dimension_2d',
       u'dimension_3d', u'definition_hd', u'definition_sd']

# OTHER MODEL

In [55]:
str_train = pd_train.apply(lambda r: (r['title'] + r['description'] + r['topicIds']), axis=1)
str_test = pd_test.apply(lambda r: (r['title'] + r['description'] + r['topicIds']), axis=1)

In [68]:
stopwords = [u'http', u'com', u'www', u's', u'subscribe'
, u'new', u'like', u'watch', u't', u'2014', u'1'
, u'2013', u'2', u'la', u'en'
, u'world', u'make', u'v', u'check', u'time'
, u'10', u'best', u'3', u'5', u'day', u'y']

dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{4,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_3char': '(?u)\\b\\w+\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }
tfv = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word"], 
                      ngram_range=(1, 1), max_df=0.3, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=False, smooth_idf=True, sublinear_tf=True)

clf = SGDClassifier(loss='loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True,
                    n_iter=5, shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None,
                    learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None,
                    warm_start=False, average=False)

In [69]:
Y = pd_train[u'video_category_id'].values
X = tfv.fit_transform(str_train)
X_test = tfv.transform(str_test)

print X.shape

(239225, 149674)


In [58]:
numFeat = 40

Features = np.array(tfv.get_feature_names())
sorted_indices = np.argsort(np.array(X.sum(0))[0])[::-1]
rankFeatures = Features[sorted_indices][:numFeat]
print rankFeatures

[u'http' u'video' u'facebook' u'youtube' u'subscribe' u'https' u'like'
 u'twitter' u'watch' u'2014' u'2013' u'http youtube' u'music' u'official'
 u'world' u'videos' u'channel' u'make' u'episode' u'time' u'itunes'
 u'check' u'facebook http' u'follow' u'http facebook' u'best'
 u'https facebook' u'youtube watch' u'https youtube' u'just' u'life'
 u'game' u'love' u'album' u'para' u'people' u'instagram' u'http twitter'
 u'0bt9lr' u'click']


In [70]:
alphas = np.arange(0.01, 0.2, 0.03)
alphas = [1e-3, 1e-2, 1e-1, 1, 5]
print alphas

[0.001, 0.01, 0.1, 1, 5]


In [71]:
results=[]
for i in alphas:
    clf.alpha = i
    sss = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
    scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)
    print ("SSS: acc: %0.4f, std: %0.4f, alpha: %s" %(scores_sss.mean(), scores_sss.std(), i))

SSS: acc: 0.6569, std: 0.0020, alpha: 0.001
SSS: acc: 0.6518, std: 0.0020, alpha: 0.01


KeyboardInterrupt: 

# SUBMIT KAGGLE

In [61]:
clf.alpha = 0.01

In [62]:
clf.fit(X,Y)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [65]:
y_pred = clf.predict(X_test)

In [66]:
submit = pd.DataFrame(index=None)
submit['id']=pd_test['id']
submit['Pred']=y_pred

In [67]:
submit.to_csv(folder+'/results/0.arda1.csv',sep=';',index=None)

# FINAL DUMP

In [101]:
clf.alpha = 0.01

In [102]:
final_pipe = make_pipeline(pipeline, clf)

In [None]:
final_pipe.fit