In [2]:
from gensim import corpora
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import string
from string import digits
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import xgboost as xgb
import numpy as np
import sklearn as sk
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import ensemble, metrics, model_selection, naive_bayes, pipeline
from sklearn.decomposition import NMF, LatentDirichletAllocation
from stop_words import get_stop_words
import gensim
import re
from gensim.models import ldamodel as ld

In [3]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")
# 3 columns id, text, author
df_train.head(n=3)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP


In [6]:
# no zeroes!

def sum_up_word2vec_array(clnd_text):
    
    total = None
    
    for word in clnd_text:
        if word in w2v:
            if total is None:
                total = w2v[word]
            else:
                total = np.add(total, w2v[word])
                
    if total is None:
        return np.zeros(w2v_array_len)
    else:
        return total
    

In [9]:
eng_stopwords = set(stopwords.words('english')).union(set(get_stop_words('english')))

In [7]:
remove_digits = str.maketrans('', '', digits)


def tokenize_stem(file_text):
    #firstly let's apply nltk tokenization
    file_text = file_text.translate(remove_digits)
    
    tokens = nltk.word_tokenize(file_text)

    #let's delete punctuation symbols
    tokens = [i for i in tokens if ( i not in string.punctuation )]

    #deleting stop_words
    stop_words = stopwords.words('english')
    tokens = [i for i in tokens if ( i not in eng_stopwords )]

    #cleaning words
    stemmer = SnowballStemmer("english")
    
    tokens = [stemmer.stem(i) for i in tokens]

    return tokens

In [10]:
df_train['cleaned_text'] = df_train.text.apply(tokenize_stem)
df_train['cleaned_text_string'] = df_train.cleaned_text.apply(' '.join)
df_train.head(n=3)


Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...


In [11]:
def lexical_diversity(file_text):
    file_text = file_text.translate(remove_digits)
    try:
        tokens = nltk.word_tokenize(file_text)
    except:
        nltk.download('punkt')
        tokens = nltk.word_tokenize(file_text)
        

    #let's delete punctuation symbols
    tokens = [i for i in tokens if ( i not in string.punctuation )]

    #cleaning words
    stemmer = SnowballStemmer("english")
    
    tokens = [stemmer.stem(i) for i in tokens]

    return len(set(tokens))/len(tokens)

In [12]:
# extract "meaningful" words

raw_documents_authors = ['', '', '']


for index, row in df_train.iterrows():
    
    if row['author'] == 'EAP':
        raw_documents_authors[0] += row['cleaned_text_string'] + ' '
    elif row['author'] == 'HPL':
        raw_documents_authors[1] += row['cleaned_text_string'] + ' '
    else:
        raw_documents_authors[2] += row['cleaned_text_string'] + ' '
        

# delete unique words

eap_only = set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[1].split(' ')) - set(raw_documents_authors[2].split(' '))
hpl_only = set(raw_documents_authors[1].split(' ')) - set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[2].split(' '))
msh_only = set(raw_documents_authors[2].split(' ')) - set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[1].split(' '))

unique_words = eap_only.union(hpl_only).union(msh_only)

tf = TfidfVectorizer(analyzer='word')
idf_matrix =  tf.fit_transform(raw_documents_authors)
feature_names = tf.get_feature_names()
# dictionary_word = dict(zip(feature_names, idf_matrix))

dense_idf = [i.todense() for i in idf_matrix]
print(dense_idf)

max_weighted_term = []

eap_dense_list = dense_idf[0].tolist()[0]
hpl_dense_list = dense_idf[1].tolist()[0]
mws_dense_list = dense_idf[2].tolist()[0]

for inum, i in enumerate(eap_dense_list):
    max_weighted_term.append(max(hpl_dense_list[inum], mws_dense_list[inum], 
                             i))

max_tf_dict = dict(zip(feature_names, max_weighted_term))



[matrix([[ 0.00054413,  0.00054413,  0.00108826, ...,  0.00054413,
          0.        ,  0.        ]]), matrix([[ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.0013942,
          0.0006971]]), matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])]


In [13]:
model = gensim.models.Word2Vec(df_train['cleaned_text'], size=100)

In [14]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [15]:
w2v_array_len = list(w2v.items())[0][1].shape[0]

In [16]:
def extract_top_words(tfidfdict, numwrd):

    top_word_dict, min_value, min_key = {}, 99, ''
    

    for k, v in max_tf_dict.items():
        # print(top_word_dict.values())
        # print(v)
        if k not in unique_words and k not in eng_stopwords:
        
            if len(top_word_dict) < numwrd:
                top_word_dict[k] = v
                if v <= min_value:
                    min_key = k
            else:
                # print(v, min(list(top_word_dict.values())))
                if v > min(list(top_word_dict.values())) and k not in eng_stopwords:

                    min_value = min(top_word_dict.values())

                    for ky, va in top_word_dict.items():
                        if va == min_value:
                            min_key = ky

                    top_word_dict.pop(min_key)
                    top_word_dict[k] = v
                
    return top_word_dict
another_top_words_dict = extract_top_words(max_tf_dict, 80)
high_tf_idf_words_columns = list(another_top_words_dict.keys())


def count_topwords(target_df):

    for word in high_tf_idf_words_columns:
        
        # TODO: костыль, нужен, когда у нас уже есть такие столбцы
        # в датасете
#         try:
#             target_df = target_df.drop(word, 1)
#         except ValueError:
#             pass
        

        def count_numwords(collist):
            value = 0

            for wd in collist:
                if wd == word:
                    value += 1
            return value


        target_df[word] = target_df.cleaned_text.apply(count_numwords)
        


        

In [17]:
df_train['length']=df_train['cleaned_text_string'].apply(len)
df_train["num_words"] = df_train["text"].apply(lambda x: len(str(x).split()))
df_train["num_unique_words"] = df_train["text"].apply(lambda x: len(set(str(x).split())))
df_train["num_punctuations"] =df_train['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
df_train["num_words_upper"] = df_train["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
df_train["num_words_title"] = df_train["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df_train["mean_word_len"] = df_train["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_train["num_stopwords"] = df_train["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
df_train['lexical_diversity'] = df_train.text.apply(lexical_diversity)
df_train['w2v_array'] = df_train.cleaned_text.apply(sum_up_word2vec_array)
count_topwords(df_train)

df_train.head(n=3)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string,length,num_words,num_unique_words,num_punctuations,num_words_upper,...,heard,made,day,death,eye,one,certain,look,though,see
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145,41,35,7,2,...,0,0,0,0,0,0,0,0,0,0
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38,14,14,1,0,...,0,0,0,0,0,0,0,0,0,0
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116,36,32,5,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
def create_w2v_columns(target_df):
    
    # сначала вытаскиваем колонку как список списков
    
    w2v_array = target_df['w2v_array'].tolist()
    
    for i in range(100):
        
        target_df['w2v_feature_' + str(i)] = [j[i] for j in w2v_array]

In [19]:
create_w2v_columns(df_train)

In [20]:
df_train.head(2)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string,length,num_words,num_unique_words,num_punctuations,num_words_upper,...,w2v_feature_90,w2v_feature_91,w2v_feature_92,w2v_feature_93,w2v_feature_94,w2v_feature_95,w2v_feature_96,w2v_feature_97,w2v_feature_98,w2v_feature_99
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145,41,35,7,2,...,-7.282747,9.626265,2.237465,6.71699,-13.649048,4.22552,1.332154,-4.678023,0.837265,1.132221
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38,14,14,1,0,...,-2.068661,2.744264,0.738569,2.069827,-3.973595,1.240327,0.297657,-1.333887,0.329666,0.331183


In [21]:
df_eap=df_train[df_train['author']=='EAP']
df_eap.describe()

Unnamed: 0,length,num_words,num_unique_words,num_punctuations,num_words_upper,num_words_title,mean_word_len,num_stopwords,lexical_diversity,yet,...,w2v_feature_90,w2v_feature_91,w2v_feature_92,w2v_feature_93,w2v_feature_94,w2v_feature_95,w2v_feature_96,w2v_feature_97,w2v_feature_98,w2v_feature_99
count,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,...,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0,7900.0
mean,80.893038,25.442405,21.894937,4.096329,0.553291,2.102405,4.644952,12.747595,0.88606,0.029367,...,-4.121095,5.174927,1.259972,3.763914,-7.366056,2.379205,0.613379,-2.561031,0.648804,0.580359
std,59.749772,18.567706,13.727397,3.573788,0.892966,2.052241,0.63134,9.619779,0.097354,0.174013,...,2.901784,3.640101,1.320235,3.2683,5.1873,1.852029,0.740328,1.783153,1.065674,0.418002
min,5.0,2.0,2.0,1.0,0.0,0.0,2.0,0.0,0.333333,0.0,...,-55.469582,0.0,-1.364278,0.0,-99.334419,0.0,-5.296139,-25.904354,-0.99541,0.0
25%,40.0,12.0,12.0,2.0,0.0,1.0,4.25,6.0,0.821429,0.0,...,-5.257265,2.726165,0.484218,1.743163,-9.374925,1.175117,0.228258,-3.289303,0.126715,0.299102
50%,65.0,21.0,19.0,3.0,0.0,1.0,4.6,10.0,0.894737,0.0,...,-3.416716,4.290519,0.881104,2.855869,-6.099398,1.907407,0.533437,-2.120817,0.318356,0.4771
75%,106.0,33.0,29.0,5.0,1.0,2.0,5.0,17.0,1.0,0.0,...,-2.164654,6.585654,1.57649,4.72489,-3.866128,3.00525,0.964207,-1.328563,0.72016,0.74617
max,925.0,267.0,155.0,71.0,15.0,43.0,11.0,135.0,1.0,2.0,...,0.0,69.969254,29.584175,74.442268,0.0,40.654839,6.608479,0.0,24.606798,7.66276


In [22]:
df_mws=df_train[df_train['author']=='MWS']
df_mws.describe()

Unnamed: 0,length,num_words,num_unique_words,num_punctuations,num_words_upper,num_words_title,mean_word_len,num_stopwords,lexical_diversity,yet,...,w2v_feature_90,w2v_feature_91,w2v_feature_92,w2v_feature_93,w2v_feature_94,w2v_feature_95,w2v_feature_96,w2v_feature_97,w2v_feature_98,w2v_feature_99
count,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,...,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0,6044.0
mean,85.267869,27.417273,23.544672,3.833719,0.751489,2.124255,4.598182,13.896923,0.883407,0.052614,...,-4.448266,5.683639,1.393814,4.109165,-8.11786,2.570948,0.714982,-2.832133,0.59798,0.675044
std,71.37294,23.13444,14.925835,2.840625,1.203636,1.759572,0.561558,12.196599,0.086804,0.234839,...,3.698518,4.743915,1.375281,3.693571,6.772512,2.194987,0.814026,2.396792,0.858745,0.573873
min,3.0,2.0,2.0,1.0,0.0,0.0,2.666667,0.0,0.39899,0.0,...,-143.409912,0.078739,-0.414662,0.070037,-259.238342,0.036848,-3.06717,-95.582367,-0.641901,0.009522
25%,48.0,15.0,14.0,2.0,0.0,1.0,4.25,7.0,0.823529,0.0,...,-5.511232,3.250366,0.631881,2.155786,-9.991849,1.414498,0.323831,-3.531134,0.163795,0.376359
50%,73.0,23.0,21.0,3.0,0.0,2.0,4.560791,12.0,0.885714,0.0,...,-3.786991,4.843415,1.043284,3.289917,-6.91982,2.121407,0.629973,-2.425043,0.344649,0.569625
75%,107.0,34.0,30.0,5.0,1.0,3.0,4.907156,18.0,0.95,0.0,...,-2.549342,6.997622,1.700543,4.944173,-4.632903,3.121715,1.013919,-1.605256,0.670134,0.835616
max,2709.0,861.0,429.0,59.0,27.0,46.0,10.5,437.0,1.0,3.0,...,-0.059117,181.707382,36.505371,119.148956,-0.113325,76.459557,29.18292,-0.038485,12.197112,21.186209


In [23]:
# all words set

wordset=set()

for i in df_train.index:
    wordset |= set(df_train['cleaned_text'][i])
wordlist=list(wordset)

In [24]:
#делаю фрейм со словами
df_word=pd.DataFrame(columns=["word", "mws", "eap", "hpl", "all"])
df_word["word"]=wordlist
df_word["mws"]=0
df_word["eap"]=0
df_word["hpl"]=0
df_word["all"]=0
df_word.head()

Unnamed: 0,word,mws,eap,hpl,all
0,inessenti,0,0,0,0
1,doco,0,0,0,0
2,kirwin,0,0,0,0
3,propens,0,0,0,0
4,drinen,0,0,0,0


In [25]:

word_dict = {}
counter = 0
head = []

for wordlist in df_train['cleaned_text']:
    for word in wordlist:
        if word not in word_dict:
            head.append(word)
            word_dict[word] = counter
            counter += 1


In [26]:

list_of_lists = []

for wordlist in df_train['cleaned_text']:
    row = [0 for i in range(len(word_dict))]
    for word in wordlist:
        row[word_dict[word]] += 1
    list_of_lists.append(row)



In [27]:

count_frame = pd.DataFrame(list_of_lists)
count_frame['author'] = df_train['author']



In [28]:

count_frame.columns = head + ['author']


In [29]:
col=list(count_frame.columns)
col[-1]='author_name'
count_frame.columns=col
pivot_col=pd.pivot_table(count_frame, aggfunc=np.sum, values=col, index=['author_name'])

In [31]:
# Delete excessive words
col=list(pivot_col.columns)
col2=[string for string in col if (string[0]!='"' and string[0]!="'"
                                  and string[0]!='.' and string[0]!='`'
                                   and len(string)>3 and '.' not in string)]
col=[]
pivot_col=pivot_col[col2]
pivot_col.head()

Unnamed: 0_level_0,aaem,aback,abaft,abandon,abaout,abas,abash,abat,abbey,abbrevi,...,æmilianus,æneid,ærial,æronaut,ærostat,æschylus,élite,émeut,οἶδα,υπνος
author_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EAP,1,2,0,22,0,2,1,2,3,2,...,0,0,1,3,1,1,1,1,0,0
HPL,0,0,0,17,24,0,1,3,0,0,...,2,1,0,0,0,0,0,0,2,1
MWS,0,0,1,9,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
#Create pivot
pivot_col=pivot_col.append(pivot_col.sum(), ignore_index=True)
pivot_col.index=['EAP', 'HPL', 'MWS', 'SUMA']
pivot_col.head()

Unnamed: 0,aaem,aback,abaft,abandon,abaout,abas,abash,abat,abbey,abbrevi,...,æmilianus,æneid,ærial,æronaut,ærostat,æschylus,élite,émeut,οἶδα,υπνος
EAP,1,2,0,22,0,2,1,2,3,2,...,0,0,1,3,1,1,1,1,0,0
HPL,0,0,0,17,24,0,1,3,0,0,...,2,1,0,0,0,0,0,0,2,1
MWS,0,0,1,9,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
SUMA,1,2,1,48,24,2,2,6,5,2,...,2,1,1,3,1,1,1,1,2,1


In [33]:
summa=[pivot_col.loc['EAP'].sum(), pivot_col.loc['HPL'].sum(), 
       pivot_col.loc['MWS'].sum(), pivot_col.loc['SUMA'].sum()]
pivot_col['summa']=summa
pivot_col.head()

Unnamed: 0,aaem,aback,abaft,abandon,abaout,abas,abash,abat,abbey,abbrevi,...,æneid,ærial,æronaut,ærostat,æschylus,élite,émeut,οἶδα,υπνος,summa
EAP,1,2,0,22,0,2,1,2,3,2,...,0,1,3,1,1,1,1,0,0,87765
HPL,0,0,0,17,24,0,1,3,0,0,...,1,0,0,0,0,0,0,2,1,74269
MWS,0,0,1,9,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,73160
SUMA,1,2,1,48,24,2,2,6,5,2,...,1,1,3,1,1,1,1,2,1,235194


In [34]:
# Create probability of author text knowing that a word was used
pivot_part=pivot_col
pivot_part.loc['EAP']=pivot_col.loc['EAP']/pivot_col.loc['SUMA']
pivot_part.loc['HPL']=pivot_col.loc['HPL']/pivot_col.loc['SUMA']
pivot_part.loc['MWS']=pivot_col.loc['MWS']/pivot_col.loc['SUMA']
pivot_part=pivot_part.loc[['EAP', 'HPL', 'MWS']]
# Delete unique words
pivot_part=pivot_part.loc[:, (pivot_part!=1).all(axis=0)]
pivot_part.head()

Unnamed: 0,abandon,abash,abat,abbey,abdic,aberr,abhor,abhorr,abil,abject,...,younger,youngest,your,youth,zeal,zenith,zest,zigzag,zone,summa
EAP,0.458333,0.5,0.333333,0.6,0.142857,0.166667,0.058824,0.111111,0.789474,0.333333,...,0.272727,0.2,0.534884,0.101562,0.117647,0.4,0.2,0.4,0.666667,0.37316
HPL,0.354167,0.5,0.5,0.0,0.0,0.666667,0.235294,0.555556,0.052632,0.0,...,0.0,0.4,0.069767,0.429688,0.470588,0.6,0.2,0.6,0.333333,0.315778
MWS,0.1875,0.0,0.166667,0.4,0.857143,0.166667,0.705882,0.333333,0.157895,0.666667,...,0.727273,0.4,0.395349,0.46875,0.411765,0.0,0.6,0.0,0.0,0.311062


In [35]:
# It will be easier to work this way
eap_dict=pivot_part.loc['EAP'].to_dict()
hpl_dict=pivot_part.loc['HPL'].to_dict()
mws_dict=pivot_part.loc['MWS'].to_dict()
eap_dict['word']

0.44859813084112149

In [36]:
# Create author score 
def ind_val_eap(listn):
    quant=0
    for word in listn:
        try:
            quant+=eap_dict[word]
        except KeyError:
            quant+=0
    return quant

def ind_val_hpl(listn):
    quant=0
    for word in listn:
        try:
            quant+=hpl_dict[word]
        except KeyError:
            quant+=0
    return quant

def ind_val_mws(listn):
    quant=0
    for word in listn:
        try:
            quant+=mws_dict[word]
        except KeyError:
            quant+=0
    return quant

In [37]:
# Add index of author
df_train['mws_index']=df_train['cleaned_text'].apply(ind_val_mws)/df_train['length']
df_train['eap_index']=df_train['cleaned_text'].apply(ind_val_eap)/df_train['length']
df_train['hpl_index']=df_train['cleaned_text'].apply(ind_val_hpl)/df_train['length']
df_train.head(n=3)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string,length,num_words,num_unique_words,num_punctuations,num_words_upper,...,w2v_feature_93,w2v_feature_94,w2v_feature_95,w2v_feature_96,w2v_feature_97,w2v_feature_98,w2v_feature_99,mws_index,eap_index,hpl_index
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145,41,35,7,2,...,0.984288,1.852068,1.080477,-0.010269,-7.75753,-3.017276,2.205754,0.035935,0.074388,0.034504
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38,14,14,1,0,...,0.255773,0.44648,0.274428,0.282043,-2.035049,-0.809668,0.59657,0.03586,0.06098,0.034739
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116,36,32,5,0,...,0.23042,2.103114,1.331694,-1.781096,-7.162513,-2.376012,0.95071,0.0269,0.047832,0.037337


In [71]:
#Transform authors' names to numeric
df_train['author']=df_train['author'].astype('category')
df_train['author2']=df_train['author'].cat.codes
# Create different features 
df_train.head(n=3)
mid = df_train['author2']
df_train.drop(labels=['author2'], axis=1,inplace = True)
df_train.insert(0, 'author2', mid)
df_train.head()

Unnamed: 0,author2,id,text,author,cleaned_text,cleaned_text_string,length,num_words,num_unique_words,num_punctuations,...,w2v_feature_93,w2v_feature_94,w2v_feature_95,w2v_feature_96,w2v_feature_97,w2v_feature_98,w2v_feature_99,mws_index,eap_index,hpl_index
0,0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145,41,35,7,...,0.984288,1.852068,1.080477,-0.010269,-7.75753,-3.017276,2.205754,0.035935,0.074388,0.034504
1,1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38,14,14,1,...,0.255773,0.44648,0.274428,0.282043,-2.035049,-0.809668,0.59657,0.03586,0.06098,0.034739
2,0,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116,36,32,5,...,0.23042,2.103114,1.331694,-1.781096,-7.162513,-2.376012,0.95071,0.0269,0.047832,0.037337
3,2,id27763,How lovely is spring As we looked from Windsor...,MWS,"[how, love, spring, as, look, windsor, terrac,...",how love spring as look windsor terrac sixteen...,144,34,32,4,...,0.434389,2.018498,1.203822,-1.145513,-7.376847,-2.525478,1.201493,0.07185,0.033438,0.033601
4,1,id12958,"Finding nothing else, not even gold, the Super...",HPL,"[find, noth, els, even, gold, superintend, aba...",find noth els even gold superintend abandon at...,102,27,25,4,...,0.414073,1.278332,0.731568,-0.165426,-4.971582,-1.783214,1.067079,0.036859,0.056661,0.043735


In [153]:
author_mapping_dict = {'EAP':0, 'HPL':1, 'MWS':2}
train_y = df_train['author'].map(author_mapping_dict)
train_id = df_train['id'].values
test_id = df_test['id'].values
cols_to_drop = ['id', 'text']
train_X = df_train.drop(cols_to_drop+['author'], axis=1)
test_X = df_test.drop(cols_to_drop, axis=1)
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
full_tfidf = tfidf_vec.fit_transform(df_train['text'].values.tolist() + df_test['text'].values.tolist())
train_tfidf = tfidf_vec.transform(df_train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(df_test['text'].values.tolist())

In [154]:
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

In [155]:
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([df_train.shape[0], 3])
kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=194)
for dev_index, val_index in kf.split(train_X):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.

Mean cv score :  0.862729297781


In [156]:
n_comp = 50
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
    
train_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
test_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
df_train = pd.concat([df_train, train_svd], axis=1)
df_test = pd.concat([df_test, test_svd], axis=1)
del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd

In [157]:
### Fit transform the count vectorizer ###
tfidf_vec = CountVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vec.fit(df_train['text'].values.tolist() + df_test['text'].values.tolist())
train_tfidf = tfidf_vec.transform(df_train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(df_test['text'].values.tolist())

In [158]:
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([df_train.shape[0], 3])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
for dev_index, val_index in kf.split(train_X):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.

# add the predictions as new features #
df_train["nb_cvec_eap"] = pred_train[:,0]
df_train["nb_cvec_hpl"] = pred_train[:,1]
df_train["nb_cvec_mws"] = pred_train[:,2]
df_test["nb_cvec_eap"] = pred_full_test[:,0]
df_test["nb_cvec_hpl"] = pred_full_test[:,1]
df_test["nb_cvec_mws"] = pred_full_test[:,2]

Mean cv score :  0.450918416166


In [44]:
df_train.head()

Unnamed: 0,author2,id,text,author,cleaned_text,cleaned_text_string,length,num_words,num_unique_words,num_punctuations,...,svd_word_43,svd_word_44,svd_word_45,svd_word_46,svd_word_47,svd_word_48,svd_word_49,nb_cvec_eap,nb_cvec_hpl,nb_cvec_mws
0,0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145,41,35,7,...,0.060822,-0.01947,0.022374,0.019055,0.001587,0.037074,0.002873,0.9999933,2.75279e-06,3.990111e-06
1,1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38,14,14,1,...,0.001035,0.000631,-0.001032,0.003187,-0.004602,0.00411,-0.002629,0.822682,0.1492107,0.02810727
2,0,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116,36,32,5,...,-0.031303,0.01151,-0.014951,-0.028148,0.010121,0.02438,-0.027564,0.9999918,8.206128e-06,1.06472e-08
3,2,id27763,How lovely is spring As we looked from Windsor...,MWS,"[how, love, spring, as, look, windsor, terrac,...",how love spring as look windsor terrac sixteen...,144,34,32,4,...,0.009119,-0.00991,0.015214,0.013138,0.002048,0.017966,-0.011112,1.43689e-09,7.472578e-10,1.0
4,1,id12958,"Finding nothing else, not even gold, the Super...",HPL,"[find, noth, els, even, gold, superintend, aba...",find noth els even gold superintend abandon at...,102,27,25,4,...,0.001328,-0.000561,0.003316,-0.001091,4.7e-05,-0.009183,-0.002713,0.8960309,0.1016456,0.002323469


In [66]:
del df_train['nb_cvec_eap']
del df_train['nb_cvec_hpl']
del df_train['nb_cvec_mws']

In [46]:
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        print(message)
        print("="*70)

In [47]:
lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [48]:
# Storing the entire training text in a list
text = list(df_train.text.values)
# Calling our overwritten Count vectorizer
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')
tf = tf_vectorizer.fit_transform(text)

In [49]:
lda = LatentDirichletAllocation(n_components=13, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

In [50]:
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=13, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [51]:
lda.transform(tf)

array([[ 0.00452489,  0.00452489,  0.00452489, ...,  0.00452499,
         0.06415156,  0.00452489],
       [ 0.01538462,  0.01538462,  0.01538467, ...,  0.01538462,
         0.01538462,  0.01538462],
       [ 0.00404858,  0.00404859,  0.89772543, ...,  0.00404861,
         0.00404865,  0.00404867],
       ..., 
       [ 0.00961538,  0.00961538,  0.00961556, ...,  0.0096154 ,
         0.00961538,  0.00961538],
       [ 0.01098901,  0.21741942,  0.01098901, ...,  0.01098901,
         0.01098901,  0.01098906],
       [ 0.00961538,  0.00961538,  0.13461525, ...,  0.00961563,
         0.00961538,  0.00961544]])

In [159]:
# Create data set
ds_train=df_train.values
X=ds_train[:, 6:]
Y=ds_train[:, 0]
seed=7
test_size=0.3
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X,Y, test_size=test_size, random_state=seed)

In [160]:
# fit model
xg_train=xgb.DMatrix(X_train, label=y_train)
xg_test=xgb.DMatrix(X_test, label=y_test)
xg_t=xgb.DMatrix(X, label=Y)
param={}
param['objective'] = 'multi:softmax'
param['eta'] = 0.2
param['max_depth'] = 2
param['silent'] = 1
param['num_class'] = 3
param['eval_metric']= "mlogloss"
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 240
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)
error_rate = np.sum(pred != y_test) / y_test.shape[0]
print('Test error using softmax = {}'.format(error_rate))

[0]	train-mlogloss:0.921491	test-mlogloss:0.922556
[1]	train-mlogloss:0.798604	test-mlogloss:0.800504
[2]	train-mlogloss:0.708918	test-mlogloss:0.711609
[3]	train-mlogloss:0.641138	test-mlogloss:0.644234
[4]	train-mlogloss:0.589194	test-mlogloss:0.592531
[5]	train-mlogloss:0.54839	test-mlogloss:0.552153
[6]	train-mlogloss:0.516373	test-mlogloss:0.520772
[7]	train-mlogloss:0.490757	test-mlogloss:0.495507
[8]	train-mlogloss:0.469581	test-mlogloss:0.475061
[9]	train-mlogloss:0.452602	test-mlogloss:0.458706
[10]	train-mlogloss:0.438182	test-mlogloss:0.44437
[11]	train-mlogloss:0.426372	test-mlogloss:0.432798
[12]	train-mlogloss:0.4165	test-mlogloss:0.423322
[13]	train-mlogloss:0.408062	test-mlogloss:0.415548
[14]	train-mlogloss:0.40077	test-mlogloss:0.408611
[15]	train-mlogloss:0.394704	test-mlogloss:0.402764
[16]	train-mlogloss:0.389174	test-mlogloss:0.397481
[17]	train-mlogloss:0.384247	test-mlogloss:0.393244
[18]	train-mlogloss:0.380209	test-mlogloss:0.389446
[19]	train-mlogloss:0.37638

[158]	train-mlogloss:0.265062	test-mlogloss:0.329184
[159]	train-mlogloss:0.264783	test-mlogloss:0.329138
[160]	train-mlogloss:0.264363	test-mlogloss:0.329166
[161]	train-mlogloss:0.264102	test-mlogloss:0.329087
[162]	train-mlogloss:0.263768	test-mlogloss:0.329008
[163]	train-mlogloss:0.263426	test-mlogloss:0.328956
[164]	train-mlogloss:0.262933	test-mlogloss:0.328963
[165]	train-mlogloss:0.262586	test-mlogloss:0.328765
[166]	train-mlogloss:0.262196	test-mlogloss:0.328762
[167]	train-mlogloss:0.261851	test-mlogloss:0.328684
[168]	train-mlogloss:0.261571	test-mlogloss:0.328725
[169]	train-mlogloss:0.26122	test-mlogloss:0.328756
[170]	train-mlogloss:0.260942	test-mlogloss:0.328775
[171]	train-mlogloss:0.260638	test-mlogloss:0.328852
[172]	train-mlogloss:0.260242	test-mlogloss:0.328796
[173]	train-mlogloss:0.259864	test-mlogloss:0.328758
[174]	train-mlogloss:0.25949	test-mlogloss:0.328719
[175]	train-mlogloss:0.259133	test-mlogloss:0.328689
[176]	train-mlogloss:0.258801	test-mlogloss:0.32

In [161]:
# probabilities output
param['objective'] = 'multi:softprob'
bstp = xgb.train(param, xg_train, num_round, watchlist)
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
pred_prob = bstp.predict(xg_test).reshape(y_test.shape[0], 3)
pred_label = np.argmax(pred_prob, axis=1)
error_rate = np.sum(pred_label != y_test) / y_test.shape[0]
print('Test error using softprob = {}'.format(error_rate))

[0]	train-mlogloss:0.921491	test-mlogloss:0.922556
[1]	train-mlogloss:0.798604	test-mlogloss:0.800504
[2]	train-mlogloss:0.708918	test-mlogloss:0.711609
[3]	train-mlogloss:0.641138	test-mlogloss:0.644234
[4]	train-mlogloss:0.589194	test-mlogloss:0.592531
[5]	train-mlogloss:0.54839	test-mlogloss:0.552153
[6]	train-mlogloss:0.516373	test-mlogloss:0.520772
[7]	train-mlogloss:0.490757	test-mlogloss:0.495507
[8]	train-mlogloss:0.469581	test-mlogloss:0.475061
[9]	train-mlogloss:0.452602	test-mlogloss:0.458706
[10]	train-mlogloss:0.438182	test-mlogloss:0.44437
[11]	train-mlogloss:0.426372	test-mlogloss:0.432798
[12]	train-mlogloss:0.4165	test-mlogloss:0.423322
[13]	train-mlogloss:0.408062	test-mlogloss:0.415548
[14]	train-mlogloss:0.40077	test-mlogloss:0.408611
[15]	train-mlogloss:0.394704	test-mlogloss:0.402764
[16]	train-mlogloss:0.389174	test-mlogloss:0.397481
[17]	train-mlogloss:0.384247	test-mlogloss:0.393244
[18]	train-mlogloss:0.380209	test-mlogloss:0.389446
[19]	train-mlogloss:0.37638

[158]	train-mlogloss:0.265062	test-mlogloss:0.329184
[159]	train-mlogloss:0.264783	test-mlogloss:0.329138
[160]	train-mlogloss:0.264363	test-mlogloss:0.329166
[161]	train-mlogloss:0.264102	test-mlogloss:0.329087
[162]	train-mlogloss:0.263768	test-mlogloss:0.329008
[163]	train-mlogloss:0.263426	test-mlogloss:0.328956
[164]	train-mlogloss:0.262933	test-mlogloss:0.328963
[165]	train-mlogloss:0.262586	test-mlogloss:0.328765
[166]	train-mlogloss:0.262196	test-mlogloss:0.328762
[167]	train-mlogloss:0.261851	test-mlogloss:0.328684
[168]	train-mlogloss:0.261571	test-mlogloss:0.328725
[169]	train-mlogloss:0.26122	test-mlogloss:0.328756
[170]	train-mlogloss:0.260942	test-mlogloss:0.328775
[171]	train-mlogloss:0.260638	test-mlogloss:0.328852
[172]	train-mlogloss:0.260242	test-mlogloss:0.328796
[173]	train-mlogloss:0.259864	test-mlogloss:0.328758
[174]	train-mlogloss:0.25949	test-mlogloss:0.328719
[175]	train-mlogloss:0.259133	test-mlogloss:0.328689
[176]	train-mlogloss:0.258801	test-mlogloss:0.32

In [162]:
# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bstp = xgb.train(param, xg_t, num_round, watchlist)
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
pred_prob = bstp.predict(xg_t).reshape(Y.shape[0], 3)
pred_label = np.argmax(pred_prob, axis=1)
error_rate = np.sum(pred_label != Y) / Y.shape[0]
print('Test error using softprob = {}'.format(error_rate))

[0]	train-mlogloss:0.921818	test-mlogloss:0.921863
[1]	train-mlogloss:0.79919	test-mlogloss:0.799002
[2]	train-mlogloss:0.709528	test-mlogloss:0.70911
[3]	train-mlogloss:0.641847	test-mlogloss:0.641432
[4]	train-mlogloss:0.58975	test-mlogloss:0.589307
[5]	train-mlogloss:0.549532	test-mlogloss:0.548942
[6]	train-mlogloss:0.51771	test-mlogloss:0.516905
[7]	train-mlogloss:0.492063	test-mlogloss:0.491249
[8]	train-mlogloss:0.470918	test-mlogloss:0.470304
[9]	train-mlogloss:0.453839	test-mlogloss:0.453268
[10]	train-mlogloss:0.439633	test-mlogloss:0.439011
[11]	train-mlogloss:0.42799	test-mlogloss:0.427375
[12]	train-mlogloss:0.418099	test-mlogloss:0.41718
[13]	train-mlogloss:0.409744	test-mlogloss:0.408586
[14]	train-mlogloss:0.402601	test-mlogloss:0.401298
[15]	train-mlogloss:0.396384	test-mlogloss:0.394931
[16]	train-mlogloss:0.391226	test-mlogloss:0.389685
[17]	train-mlogloss:0.386594	test-mlogloss:0.384904
[18]	train-mlogloss:0.382438	test-mlogloss:0.380924
[19]	train-mlogloss:0.378784

KeyboardInterrupt: 

In [56]:
df_test.head(n=3)

Unnamed: 0,id,text,svd_word_0,svd_word_1,svd_word_2,svd_word_3,svd_word_4,svd_word_5,svd_word_6,svd_word_7,...,svd_word_43,svd_word_44,svd_word_45,svd_word_46,svd_word_47,svd_word_48,svd_word_49,nb_cvec_eap,nb_cvec_hpl,nb_cvec_mws
0,id02310,"Still, as I urged our leaving Ireland with suc...",0.024516,-0.010185,0.001168,-0.005363,-0.013319,-0.003444,-0.002816,0.00324,...,-0.00632,0.019999,-0.006687,0.008188,-0.046846,-0.042702,3e-06,0.021018,0.000595,0.978387
1,id24541,"If a fire wanted fanning, it could readily be ...",0.022294,-0.011968,-0.001596,-0.004478,-0.012514,-0.000641,-0.009725,-0.000215,...,0.003998,-0.00061,0.001042,-0.000802,-0.001725,0.004586,-0.006745,0.999985,9e-06,6e-06
2,id00134,And when they had broken down the frail door t...,0.016906,-0.008934,0.00024,-0.006892,-0.008843,0.004787,-0.005058,-0.004598,...,0.007646,-0.004253,0.031135,8.8e-05,-0.001402,-0.012464,0.007291,0.217325,0.782527,0.000148


In [53]:
df_test['cleaned_text'] = df_test.text.apply(tokenize_stem)
df_test['cleaned_text_string'] = df_test.cleaned_text.apply(' '.join)
df_test.head(n=3)

Unnamed: 0,id,text,svd_word_0,svd_word_1,svd_word_2,svd_word_3,svd_word_4,svd_word_5,svd_word_6,svd_word_7,...,svd_word_45,svd_word_46,svd_word_47,svd_word_48,svd_word_49,nb_cvec_eap,nb_cvec_hpl,nb_cvec_mws,cleaned_text,cleaned_text_string
0,id02310,"Still, as I urged our leaving Ireland with suc...",0.024516,-0.010185,0.001168,-0.005363,-0.013319,-0.003444,-0.002816,0.00324,...,-0.006687,0.008188,-0.046846,-0.042702,3e-06,0.021018,0.000595,0.978387,"[still, i, urg, leav, ireland, inquietud, impa...",still i urg leav ireland inquietud impati fath...
1,id24541,"If a fire wanted fanning, it could readily be ...",0.022294,-0.011968,-0.001596,-0.004478,-0.012514,-0.000641,-0.009725,-0.000215,...,0.001042,-0.000802,-0.001725,0.004586,-0.006745,0.999985,9e-06,6e-06,"[if, fire, want, fan, could, readili, fan, new...",if fire want fan could readili fan newspap gov...
2,id00134,And when they had broken down the frail door t...,0.016906,-0.008934,0.00024,-0.006892,-0.008843,0.004787,-0.005058,-0.004598,...,0.031135,8.8e-05,-0.001402,-0.012464,0.007291,0.217325,0.782527,0.000148,"[and, broken, frail, door, found, two, clean, ...",and broken frail door found two clean pick hum...


In [54]:
df_test['length']=df_test['cleaned_text_string'].apply(len)
df_test["num_words"] = df_test["text"].apply(lambda x: len(str(x).split()))
df_test["num_unique_words"] = df_test["text"].apply(lambda x: len(set(str(x).split())))
df_test["num_punctuations"] =df_test['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
df_test["num_words_upper"] = df_test["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
df_test["num_words_title"] = df_test["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df_test["mean_word_len"] = df_test["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_test["num_stopwords"] = df_test["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
df_test['lexical_diversity'] = df_test.text.apply(lexical_diversity)
df_test['w2v_array'] = df_test.cleaned_text.apply(sum_up_word2vec_array)
count_topwords(df_test)
create_w2v_columns(df_test) 
df_test.head(n=3)

Unnamed: 0,id,text,svd_word_0,svd_word_1,svd_word_2,svd_word_3,svd_word_4,svd_word_5,svd_word_6,svd_word_7,...,w2v_feature_90,w2v_feature_91,w2v_feature_92,w2v_feature_93,w2v_feature_94,w2v_feature_95,w2v_feature_96,w2v_feature_97,w2v_feature_98,w2v_feature_99
0,id02310,"Still, as I urged our leaving Ireland with suc...",0.024516,-0.010185,0.001168,-0.005363,-0.013319,-0.003444,-0.002816,0.00324,...,2.470797,0.148157,4.90227,0.55062,0.620488,0.293736,0.499431,-3.060433,-1.231265,1.104716
1,id24541,"If a fire wanted fanning, it could readily be ...",0.022294,-0.011968,-0.001596,-0.004478,-0.012514,-0.000641,-0.009725,-0.000215,...,6.992887,-0.361119,10.840488,0.832574,1.976278,1.081327,-0.232783,-7.698443,-2.857432,1.991308
2,id00134,And when they had broken down the frail door t...,0.016906,-0.008934,0.00024,-0.006892,-0.008843,0.004787,-0.005058,-0.004598,...,5.884607,-1.001613,6.743172,0.213241,1.738426,0.930032,-1.557037,-5.864972,-1.940434,0.822325


In [55]:
df_test['mws_index']=df_test['cleaned_text'].apply(ind_val_mws)/df_test['length']
df_test['eap_index']=df_test['cleaned_text'].apply(ind_val_eap)/df_test['length']
df_test['hpl_index']=df_test['cleaned_text'].apply(ind_val_hpl)/df_test['length']
df_test.head(n=3)
# df_test.head(n=3)

Unnamed: 0,id,text,svd_word_0,svd_word_1,svd_word_2,svd_word_3,svd_word_4,svd_word_5,svd_word_6,svd_word_7,...,w2v_feature_93,w2v_feature_94,w2v_feature_95,w2v_feature_96,w2v_feature_97,w2v_feature_98,w2v_feature_99,mws_index,eap_index,hpl_index
0,id02310,"Still, as I urged our leaving Ireland with suc...",0.024516,-0.010185,0.001168,-0.005363,-0.013319,-0.003444,-0.002816,0.00324,...,0.55062,0.620488,0.293736,0.499431,-3.060433,-1.231265,1.104716,0.071473,0.036315,0.026541
1,id24541,"If a fire wanted fanning, it could readily be ...",0.022294,-0.011968,-0.001596,-0.004478,-0.012514,-0.000641,-0.009725,-0.000215,...,0.832574,1.976278,1.081327,-0.232783,-7.698443,-2.857432,1.991308,0.035932,0.062884,0.039306
2,id00134,And when they had broken down the frail door t...,0.016906,-0.008934,0.00024,-0.006892,-0.008843,0.004787,-0.005058,-0.004598,...,0.213241,1.738426,0.930032,-1.557037,-5.864972,-1.940434,0.822325,0.027486,0.055142,0.057723


In [56]:
del df_test['cleaned_text']
del df_test['cleaned_text_string']
del df_test['w2v_array']

In [57]:
cols=(df_train.columns.tolist())[6:]
[item for item in df_train.columns.tolist() if item not in df_test.columns.tolist()]

['author2', 'author', 'cleaned_text', 'cleaned_text_string']

In [58]:
df_test.head()

Unnamed: 0,id,text,svd_word_0,svd_word_1,svd_word_2,svd_word_3,svd_word_4,svd_word_5,svd_word_6,svd_word_7,...,w2v_feature_93,w2v_feature_94,w2v_feature_95,w2v_feature_96,w2v_feature_97,w2v_feature_98,w2v_feature_99,mws_index,eap_index,hpl_index
0,id02310,"Still, as I urged our leaving Ireland with suc...",0.024516,-0.010185,0.001168,-0.005363,-0.013319,-0.003444,-0.002816,0.00324,...,0.55062,0.620488,0.293736,0.499431,-3.060433,-1.231265,1.104716,0.071473,0.036315,0.026541
1,id24541,"If a fire wanted fanning, it could readily be ...",0.022294,-0.011968,-0.001596,-0.004478,-0.012514,-0.000641,-0.009725,-0.000215,...,0.832574,1.976278,1.081327,-0.232783,-7.698443,-2.857432,1.991308,0.035932,0.062884,0.039306
2,id00134,And when they had broken down the frail door t...,0.016906,-0.008934,0.00024,-0.006892,-0.008843,0.004787,-0.005058,-0.004598,...,0.213241,1.738426,0.930032,-1.557037,-5.864972,-1.940434,0.822325,0.027486,0.055142,0.057723
3,id27757,While I was thinking how I should possibly man...,0.013408,-0.007515,-0.000154,-0.00402,-0.004521,0.002712,-0.00422,-0.002882,...,0.739516,1.780698,1.040653,-0.705218,-7.231385,-2.711493,1.857348,0.024369,0.065057,0.055736
4,id04081,I am not sure to what limit his knowledge may ...,0.012565,-0.003185,-0.000719,-0.001152,0.00093,-0.00611,0.00169,-0.002149,...,0.410158,0.281921,0.229973,0.561596,-1.859675,-0.814361,0.731755,0.025615,0.070948,0.028437


In [163]:
ds_test=df_test[cols].values

In [87]:
print(pred_prob)

[[  3.44851837e-02   5.50276274e-03   9.60012019e-01]
 [  9.96701777e-01   2.07544537e-03   1.22278044e-03]
 [  6.28928840e-02   9.35769796e-01   1.33732741e-03]
 ..., 
 [  9.06760871e-01   5.87458573e-02   3.44932452e-02]
 [  2.11261455e-02   1.50194473e-03   9.77371871e-01]
 [  1.41863478e-02   9.85448301e-01   3.65293323e-04]]


In [88]:
export=pd.DataFrame(pred_prob)
export.insert(loc=0, column='id', value=y_t)
export.columns=['id','EAP', 'HPL', 'MWS']
export.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.034485,0.005503,0.960012
1,id24541,0.996702,0.002075,0.001223
2,id00134,0.062893,0.93577,0.001337
3,id27757,0.80909,0.188739,0.002171
4,id04081,0.978529,0.016969,0.004502


In [68]:
len(export)

8392

In [69]:
export[export['id']=='id23301']

Unnamed: 0,id,EAP,HPL,MWS
6106,id23301,0.337495,0.166845,0.49566


In [70]:
export.to_csv(path_or_buf="../data/export.csv", index=False)

In [105]:
# попробуем теперь генсимовский лда

from gensim.models import LdaModel
from gensim.corpora import Dictionary


In [119]:
# set(stopwords.words('english')).union(set(get_stop_words('english')))


words_exclude = set(eng_stopwords).union(unique_words)

In [124]:
# trying to create corpus as a list of words

train_corpse = df_train['cleaned_text_string'].tolist()

texts = [[word for word in document.lower().split() if word not in words_exclude] for document in train_corpse]
dicccs = Dictionary(texts)


corpus = [dicccs.doc2bow(text) for text in texts]

# print(corpse[0:5])


lda_model = LdaModel(corpus, id2word=dicccs, num_topics=25)

In [131]:
print(texts[0])

['process', 'howev', 'afford', 'mean', 'ascertain', 'dimens', 'dungeon', 'might', 'make', 'circuit', 'return', 'point', 'whenc', 'set', 'without', 'awar', 'fact', 'perfect', 'uniform', 'seem', 'wall']


In [132]:
doc = dicccs.doc2bow(texts[0])

lda_model.get_document_topics(doc)

[(3, 0.081501738548664951),
 (18, 0.39479289459575573),
 (19, 0.050642737227831379),
 (20, 0.37754174325765977),
 (24, 0.059157250001896047)]

In [133]:
doc = dicccs.doc2bow(texts[15])

lda_model.get_document_topics(doc)

[(0, 0.60075588554034731),
 (5, 0.12909575248303104),
 (6, 0.086607033567281319),
 (7, 0.062167704750211973),
 (9, 0.052128071578546056),
 (23, 0.036202073815929887)]

In [134]:
# поехали. времени мало

def topicify_cleane_text_list(cl_tex_lst):
    
    cl_lst = [i for i in cl_tex_lst if i not in words_exclude]
    
    doc = dicccs.doc2bow(cl_lst)
    
    return lda_model.get_document_topics(doc)




In [135]:
df_train['topic_probs_lda'] = df_train.cleaned_text.apply(topicify_cleane_text_list)

In [139]:
def split_topicdata(frm, topicnum):
    
    topic_probs = frm['topic_probs_lda'].tolist()
    
    rownum = len(topic_probs)
    
    generate_zeroes = [[0 for j in range(rownum)] for i in range(topicnum)]
    
    for tuplistnum, tuplist in enumerate(topic_probs):
        for tup in tuplist:
            
            generate_zeroes[tup[0]][tuplistnum] = tup[1]
            
    return generate_zeroes


test_run = split_topicdata(df_train, 25)

25


In [143]:
for colnum, col in enumerate(test_run):
    # print(len(col))
    
    df_train['lda_topic_' + str(colnum)] = col
    

In [147]:
df_test['cleaned_text'] = df_test.text.apply(tokenize_stem)

df_test['topic_probs_lda'] = df_test.cleaned_text.apply(topicify_cleane_text_list)

test_test_run = split_topicdata(df_test, 25)

for colnum, col in enumerate(test_test_run):
    # print(len(col))
    
    df_test['lda_topic_' + str(colnum)] = col


25


In [152]:
del df_test['topic_probs_lda']
del df_train['topic_probs_lda']
del df_test['cleaned_text']