In [1]:
# Importing the libraries
import numpy as np
import pandas as pd

# Input data files
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
submiss=pd.read_csv("./input/sample_submission.csv")

X_Train=train['text'].str.replace('[^a-zA-Z0-9]', ' ')
y_train=train['author']
X_Test=test['text'].str.replace('[^a-zA-Z0-9]', ' ')

In [5]:
## Multinomial Naive Bayes Classifier ##
# Build pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
])

# parameter tuning with grid search
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 2)],
              'vect__max_df': ( 0.75,0.8,0.85),
              'vect__min_df': (1,2),    
              'clf__alpha': ( 0.024,0.025, 0.026),
}
gs_clf = GridSearchCV(classifier, parameters,n_jobs=-1, verbose=1,cv=5)
gs_clf.fit(X_Train, y_train)
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Predicting the Test set results
y_pred_proba = gs_clf.predict_proba(X_Test)
y_pred_proba = np.round(y_pred_proba,4)

# pre best
# clf__alpha: 0.025
# 	vect__max_df: 0.8
# 	vect__min_df: 1
# 	vect__ngram_range: (1, 2)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.7min finished


	clf__alpha: 0.025
	vect__max_df: 0.75
	vect__min_df: 1
	vect__ngram_range: (1, 2)


In [8]:
submiss['EAP']=y_pred_proba[:,0]
submiss['HPL']=y_pred_proba[:,1]
submiss['MWS']=y_pred_proba[:,2]
submiss.to_csv("results/res.csv",index=False)
submiss.head(10)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.0645,0.0049,0.9305
1,id24541,0.9614,0.0299,0.0087
2,id00134,0.0133,0.9816,0.0051
3,id27757,0.722,0.2706,0.0074
4,id04081,0.6571,0.2492,0.0937
5,id27337,0.9871,0.0126,0.0003
6,id24265,0.9716,0.0184,0.01
7,id25917,0.0067,0.0253,0.968
8,id04951,0.9948,0.0052,0.0001
9,id14549,0.6021,0.2457,0.1522


In [7]:
from sklearn.metrics import log_loss
y_train_pred_prob = gs_clf.predict_proba(X_Train)
y_train_pred_prob = np.round(y_train_pred_prob,4)
print('score',log_loss(y_train,y_train_pred_prob))

score 0.00862477167538
