In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

train = pd.read_csv("/Users/dishabhatnagar/Desktop/Text Mining/spooky-author-identification/train.csv")
test = pd.read_csv('/Users/dishabhatnagar/Desktop/Text Mining/spooky-author-identification/test.csv')


In [115]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
from keras.preprocessing import sequence, text

In [4]:
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline

In [2]:
def multiclass_logloss(actual, predicted, eps=1e-15):

    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [6]:
lbl_enc = preprocessing.LabelEncoder()

In [7]:

y = lbl_enc.fit_transform(train.author.values)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:

xtrain, xtest, ytrain, ytest = train_test_split(train.text.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [10]:
xtrain.shape

(15663,)

In [11]:
xtest.shape

(3916,)

In [None]:
## Model 1 - tf-idf frequency

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [13]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

In [None]:
# Fitting TF-IDF to both training and test sets (semi-supervised learning)

In [14]:
tfv.fit(list(xtrain) + list(xtest))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xtest)

In [None]:
# Fitting a simple Logistic Regression on TFIDF


In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
clf = LogisticRegression(C=1.0)

In [17]:
clf.fit(xtrain_tfv, ytrain)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
predictions = clf.predict_proba(xvalid_tfv)

In [23]:
predictions

array([[0.65260506, 0.29904053, 0.04835441],
       [0.34896049, 0.52991125, 0.12112826],
       [0.38396129, 0.58446816, 0.03157054],
       ...,
       [0.16595209, 0.10133845, 0.73270946],
       [0.70515782, 0.16761624, 0.12722594],
       [0.21985111, 0.70775709, 0.0723918 ]])

In [19]:
print ("logloss: %0.3f" % multiclass_logloss(ytest, predictions))

logloss: 0.635


In [None]:
## Instead of using TF-IDF, we can also use word counts as features. This can be done easily using CountVectorizer from scikit-learn.

In [20]:
cv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

In [21]:
# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
cv.fit(list(xtrain) + list(xtest))
xtrain_cv =  cv.transform(xtrain) 
xtest_cv = cv.transform(xtest)

In [None]:
# Fitting a simple Logistic Regression on Counts


In [22]:
clf.fit(xtrain_cv, ytrain)
predictions = clf.predict_proba(xtest_cv)

print ("logloss: %0.3f " % multiclass_logloss(ytest, predictions))



logloss: 0.532 


In [None]:
# Fitting a simple Naive Bayes on TFIDF


In [23]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
clf = MultinomialNB()

In [25]:
clf.fit(xtrain_tfv, ytrain)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
predictions = clf.predict_proba(xvalid_tfv)

In [27]:
print ("logloss: %0.3f " % multiclass_logloss(ytest, predictions))

logloss: 0.584 


In [None]:
# Fitting a simple Naive Bayes on Counts

In [28]:
clf = MultinomialNB()

In [30]:
clf.fit(xtrain_cv, ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
predictions = clf.predict_proba(xtest_cv)

print ("logloss: %0.3f " % multiclass_logloss(ytest, predictions))


logloss: 0.460 


In [33]:
import xgboost as xgb

In [None]:
# Fitting a xgboost on counts

In [145]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_cv.tocsc(), ytrain)
predictions = clf.predict_proba(xtest_cv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(ytest, predictions))

logloss: 0.775 


In [None]:
# Fitting a xgboost on tf-idf

In [34]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

In [35]:
print ("logloss: %0.3f " % multiclass_logloss(ytest, predictions))

logloss: 0.786 


In [None]:
## Support vector machine

In [36]:
svd = decomposition.TruncatedSVD(n_components=150)

In [37]:
svd.fit(xtrain_tfv)

TruncatedSVD(algorithm='randomized', n_components=150, n_iter=5,
             random_state=None, tol=0.0)

In [38]:
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

In [39]:
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [None]:
## Fitting SVM

In [40]:
from sklearn.svm import SVC

In [None]:
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

In [108]:
print ("logloss: %0.3f " % multiclass_logloss(ytest, predictions))

logloss: 0.721 


In [None]:
## SVM on counts

In [109]:
svd.fit(xtrain_cv)

TruncatedSVD(algorithm='randomized', n_components=150, n_iter=5,
             random_state=None, tol=0.0)

In [110]:
xtrain_svd = svd.transform(xtrain_cv)
xvalid_svd = svd.transform(xtest_cv)

In [111]:
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [112]:
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

In [113]:
print ("logloss: %0.3f " % multiclass_logloss(ytest, predictions))

logloss: 0.778 
