In [1]:
# Importing the libraries
import numpy as np
import pandas as pd

# Input data files
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
submiss=pd.read_csv("./input/sample_submission.csv")

X_Train=train['text'].str.replace('[^a-zA-Z0-9]', ' ')
y_train=train['author']
X_Test=test['text'].str.replace('[^a-zA-Z0-9]', ' ')

In [2]:
X_Train[:5]

0    This process  however  afforded me no means of...
1    It never once occurred to me that the fumbling...
2    In his left hand was a gold snuff box  from wh...
3    How lovely is spring As we looked from Windsor...
4    Finding nothing else  not even gold  the Super...
Name: text, dtype: object

In [3]:
y_train[:5]

0    EAP
1    HPL
2    EAP
3    MWS
4    HPL
Name: author, dtype: object

In [12]:
## Multinomial Naive Bayes Classifier ##
# Build pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import log_loss
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                      ('tfidf', TfidfTransformer()),
                      ('clf', RandomForestClassifier(criterion='entropy',n_estimators=200)),
])

# parameter tuning with grid search
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2),(1,3)],
              'vect__max_df': ( 0.7,0.8,0.9,1.0),
              'vect__min_df': (1,2),    
              'clf__max_depth': (3,4,5)
}
gs_clf = GridSearchCV(classifier, parameters,n_jobs=-1, verbose=1,cv=3)
gs_clf.fit(X_Train, y_train)
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:  5.9min finished


	clf__max_depth: 5
	vect__max_df: 0.8
	vect__min_df: 2
	vect__ngram_range: (1, 1)


In [13]:
y_train_pred_prob = gs_clf.predict_proba(X_Train)
y_train_pred_prob = np.round(y_train_pred_prob,4)
print('score',log_loss(y_train,y_train_pred_prob))

score 1.04967505399


In [14]:
# Predicting the Test set results
y_pred_proba = gs_clf.predict_proba(X_Test)
y_pred_proba = np.round(y_pred_proba,4)

In [15]:
submiss['EAP']=y_pred_proba[:,0]
submiss['HPL']=y_pred_proba[:,1]
submiss['MWS']=y_pred_proba[:,2]
submiss.to_csv("results/rf_res.csv",index=False)
submiss.head(10)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.3881,0.2702,0.3417
1,id24541,0.4092,0.2917,0.2991
2,id00134,0.3884,0.3154,0.2961
3,id27757,0.4064,0.2919,0.3016
4,id04081,0.4089,0.278,0.313
5,id27337,0.4124,0.2873,0.3003
6,id24265,0.4324,0.2785,0.2892
7,id25917,0.396,0.293,0.311
8,id04951,0.4076,0.2976,0.2948
9,id14549,0.4258,0.2765,0.2977
