In [196]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [205]:
from smart_open import open
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim
import unidecode

In [206]:
df = pd.read_csv('data/train.csv')

In [207]:
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [208]:
label_code = {
    'EAP': 0,
    'HPL': 1,
    'MWS': 2
}

In [209]:
stop_words = set(stopwords.words("english"))

def preprocess(s):
    x = unidecode.unidecode(s)
    x = simple_preprocess(str(s), deacc=True)
    x = [word for word in x if word not in stop_words]
    #x = ''.join(e for e in x.lower() if (e.isalnum() or e.isspace()))
    #x = ' '.join(lemmatizer.lemmatize(token) for token in x.split(" "))
    #x = ' '.join(lemmatizer.lemmatize(token, "v") for token in x.split(" "))
    #x = ' '.join(word for word in x.split(" ") if not word in stop_words)
    return x

def get_tokenized_data(valid_ratio, X, y):
    X = list(map(lambda x: preprocess(x), X))
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=valid_ratio, stratify=y)
    return list(X_train), list(X_valid), list(y_train), list(y_valid)

In [210]:
X = list(df['text'])
y = list(map(lambda x: label_code[x], df['author']))
valid_ratio = 0.1

In [211]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=valid_ratio, stratify=y)

In [212]:
X_processed = list(map(lambda x: preprocess(x), X_train))

In [213]:
len(X_processed)

17621

In [214]:
len(y_train)

17621

In [215]:
def bigrams(words, bi_min=5, tri_min=2):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_model = gensim.models.phrases.Phraser(bigram)
    return bigram_model

In [216]:
bigram = bigrams(X_processed)

In [217]:
bigram = [bigram[snipet] for snipet in X_processed]

In [218]:
id2word = gensim.corpora.Dictionary(bigram)

In [219]:
id2word.filter_extremes(no_below=5, no_above=0.75)

In [220]:
id2word.compactify()
corpus = [id2word.doc2bow(text) for text in bigram]

In [221]:
import logging
import warnings
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda_train4 = gensim.models.ldamulticore.LdaMulticore(
                           corpus=corpus,
                           num_topics=10,
                           id2word=id2word,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
    lda_train4.save('lda_train4.model')

In [222]:
lda_train4.print_topics(5,num_words=12)[:10]

[(3,
  '0.012*"one" + 0.008*"reason" + 0.008*"great" + 0.007*"little" + 0.007*"upon" + 0.005*"less" + 0.005*"idea" + 0.005*"make" + 0.005*"things" + 0.005*"believe" + 0.005*"old" + 0.005*"enough"'),
 (7,
  '0.012*"like" + 0.011*"door" + 0.009*"came" + 0.007*"window" + 0.007*"sea" + 0.007*"open" + 0.006*"sky" + 0.006*"entered" + 0.006*"city" + 0.006*"still" + 0.006*"heavy" + 0.005*"wind"'),
 (8,
  '0.016*"upon" + 0.014*"eyes" + 0.008*"like" + 0.007*"one" + 0.007*"saw" + 0.006*"head" + 0.006*"face" + 0.006*"lay" + 0.006*"could" + 0.005*"still" + 0.005*"seemed" + 0.005*"light"'),
 (9,
  '0.011*"one" + 0.008*"street" + 0.008*"house" + 0.007*"time" + 0.006*"child" + 0.006*"three" + 0.006*"come" + 0.006*"though" + 0.006*"old" + 0.006*"town" + 0.005*"upon" + 0.005*"room"'),
 (2,
  '0.009*"one" + 0.008*"death" + 0.008*"night" + 0.007*"could" + 0.007*"found" + 0.007*"last" + 0.006*"land" + 0.005*"would" + 0.005*"every" + 0.005*"time" + 0.005*"place" + 0.004*"two"')]

In [223]:
train_vecs = []
for i in range(len(X_processed)):
    top_topics = lda_train4.get_document_topics(corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(5)]
    topic_vec.extend([len(X_processed[i])]) # length review
    train_vecs.append(topic_vec)

In [224]:
train_vecs[12]

[0.8100033, 0.01000496, 0.010002675, 0.10996897, 0.010005308, 13]

In [225]:
x = np.array(train_vecs)
_y = np.array(y_train)

In [226]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
%config InlineBackend.figure_formats = ['retina']
from sklearn.metrics import f1_score, accuracy_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score
import matplotlib.pyplot as plt

In [227]:
kf = KFold(5, shuffle=True, random_state=42)
cv_lr_f1, cv_lrsgd_f1, cv_svcsgd_f1,  = [], [], []
cv_lr_acc, cv_lrsgd_acc, cv_svcsgd_acc,  = [], [], []

for train_ind, val_ind in kf.split(x, _y):
    print(len(train_ind))
    print(len(val_ind))
    X_train, y_train = x[train_ind], _y[train_ind]
    X_val, y_val = x[val_ind], _y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logisitic Regression
    lr = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    cv_lr_f1.append(f1_score(y_val, y_pred, average='weighted'))
    cv_lr_acc.append(accuracy_score(y_val, y_pred))
    
    # Logistic Regression Mini-Batch SGD
    sgd = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        loss='log',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)
    
    y_pred = sgd.predict(X_val_scale)
    cv_lrsgd_f1.append(f1_score(y_val, y_pred, average='weighted'))
    cv_lrsgd_acc.append(accuracy_score(y_val, y_pred))
    
    # SGD Modified Huber
    sgd_huber = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        alpha=20,
        loss='modified_huber',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)
    
    y_pred = sgd_huber.predict(X_val_scale)
    cv_svcsgd_f1.append(f1_score(y_val, y_pred, average='weighted'))
    cv_svcsgd_acc.append(accuracy_score(y_val, y_pred))

14096
3525
14097
3524
14097
3524
14097
3524
14097
3524


In [228]:
cv_lr_acc

[0.4371631205673759,
 0.43558456299659476,
 0.4301929625425653,
 0.44551645856980704,
 0.438422247446084]