# DSG: MultinomialNB

In [56]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score, train_test_split,StratifiedKFold, StratifiedShuffleSplit
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from lib_DSG import ColumnSelector, DenseTransformer


folder = os.getcwd() ; print folder

/home/arda/Documents/youtube


# Importing data

In [83]:
pd_train = pd.read_csv('./data/train_sample_munged.csv', header=0, escapechar='\\', quotechar='"', low_memory=False)
pd_test = pd.read_csv('./data/test_sample_munged.csv', header=0, escapechar='\\', quotechar='"', low_memory=False)

In [85]:
pd_train = pd_train.fillna('')
pd_test = pd_test.fillna('')

In [86]:
print pd_train.columns

Index([u'video_category_id', u'title', u'description', u'published_at',
       u'viewCount', u'likeCount', u'dislikeCount', u'favoriteCount',
       u'commentCount', u'duration', u'dimension', u'definition', u'caption',
       u'licensedContent', u'topicIds', u'relevantTopicIds', u'dimension_2d',
       u'dimension_3d', u'definition_hd', u'definition_sd'],
      dtype='object')


In [97]:
a =pd_train['topicIds'].apply(lambda r: "".join(r))

# First model

In [87]:
dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{1,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }
tfv_title = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 2), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv_desc = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word1"], 
                      ngram_range=(1, 2), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

clf = MultinomialNB(alpha=0.05, fit_prior=True, class_prior=None)

title_pipe = make_pipeline(ColumnSelector(key='title'), tfv_title)

desc_pipe = make_pipeline(ColumnSelector(key='description'), tfv_desc)

pipeline = make_union(title_pipe, desc_pipe)

In [88]:
Y = pd_train[u'video_category_id'].values
X = pipeline.fit_transform(pd_train) ; print X.shape
X_test = pipeline.transform(pd_test)

(239225, 594406)


In [89]:
alphas = np.arange(0.01, 0.2, 0.03)
#alphas = [0.06]
print alphas

[ 0.01  0.04  0.07  0.1   0.13  0.16  0.19]


In [90]:
results=[]
for i in alphas:
    clf.alpha = i

    sss = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
    scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)

    print ("SSS: acc: %0.4f, std: %0.4f, alpha: %s" %(scores_sss.mean(), scores_sss.std(), i))

SSS: acc: 0.7568, std: 0.0014, alpha: 0.01
SSS: acc: 0.7588, std: 0.0016, alpha: 0.04
SSS: acc: 0.7586, std: 0.0015, alpha: 0.07
SSS: acc: 0.7575, std: 0.0017, alpha: 0.1
SSS: acc: 0.7564, std: 0.0016, alpha: 0.13
SSS: acc: 0.7551, std: 0.0016, alpha: 0.16
SSS: acc: 0.7536, std: 0.0016, alpha: 0.19


In [5]:
Y = pd_train['Author'].values
X = pipeline.fit_transform(pd_train)
X_test = pipeline.transform(pd_test)

print X.shape

 (28723, 77821)


In [12]:
for weight in weights:
    print weight
    
    pipeline.transformer_weights=[1.3, 1, 1.1, 1]
    
    tfv_uni.binary=True
    tfv_bi.binary=True
    tfv_nnp.binary=True
    tfv_punctuation.binary=True
    
    X = pipeline.fit_transform(pd_train)
    results=[]
    for i in alphas:
        clf.alpha = i
        
        skf = StratifiedKFold(Y, n_folds=10, indices=None, shuffle=True, random_state=None)
        scores_skf = cross_val_score(clf, X, Y,scoring='accuracy',cv=skf, n_jobs=-1)
        
        sss = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
        scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)

        print ("SLF: acc: %0.4f, std: %0.4f, SSS: acc: %0.4f, std: %0.4f, alpha: %s" %
               (scores_skf.mean(), scores_skf.std(), scores_sss.mean(), scores_sss.std(), i))

1
SLF: acc: 0.8202, std: 0.0084, SSS: acc: 0.8181, std: 0.0058, alpha: 0.01
SLF: acc: 0.8267, std: 0.0064, SSS: acc: 0.8246, std: 0.0054, alpha: 0.04
SLF: acc: 0.8277, std: 0.0064, SSS: acc: 0.8242, std: 0.0053, alpha: 0.07
SLF: acc: 0.8251, std: 0.0116, SSS: acc: 0.8231, std: 0.0048, alpha: 0.1
SLF: acc: 0.8263, std: 0.0063, SSS: acc: 0.8216, std: 0.0043, alpha: 0.13
SLF: acc: 0.8244, std: 0.0041, SSS: acc: 0.8196, std: 0.0047, alpha: 0.16
SLF: acc: 0.8233, std: 0.0051, SSS: acc: 0.8178, std: 0.0047, alpha: 0.19


# Final dump

# SUBMIT KAGGLE

In [103]:
clf.alpha = 0.01

In [45]:
y_pred = NB.predict(X_test)

In [46]:
submit = pd.DataFrame(index=None)
submit['Id']=pd_test['Id']
submit['Pred']=y_pred

In [47]:
submit.to_csv(folder+'/results/0.arda1.csv',sep=';',index=None)

# FINAL DUMP

In [101]:
clf.alpha = 0.01

In [102]:
final_pipe = make_pipeline(pipeline, clf)

In [None]:
final_pipe.fit