In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd
from pandas import isnull
from nltk.stem.snowball import SnowballStemmer
import string
from string import digits

from scipy.sparse import csr_matrix
import numpy as np

In [2]:
import xgboost as xgb




In [3]:
from stop_words import get_stop_words

In [4]:
df_train = pd.read_csv("../data/train.csv")
# 3 столбца - id, text, author
df_train.head(n=3)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP


In [5]:
len(df_train)

19579

In [7]:
remove_digits = str.maketrans('', '', digits)
def tokenize_stem(file_text):
    #firstly let's apply nltk tokenization
    file_text = file_text.translate(remove_digits)
    try:
        tokens = nltk.word_tokenize(file_text)
    except:
        nltk.download('punkt')
        tokens = nltk.word_tokenize(file_text)
        

    #let's delete punctuation symbols
    tokens = [i for i in tokens if ( i not in string.punctuation )]

    #deleting stop_words
    try:
        stop_words = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stop_words = stopwords.words('english')
    tokens = [i for i in tokens if ( i not in stop_words )]

    #cleaning words
    stemmer = SnowballStemmer("english")
    
    tokens = [stemmer.stem(i) for i in tokens]

    return tokens

# измерить лексическое разнообразие
def lexical_diversity(file_text):
    file_text = file_text.translate(remove_digits)
    try:
        tokens = nltk.word_tokenize(file_text)
    except:
        nltk.download('punkt')
        tokens = nltk.word_tokenize(file_text)
        

    #let's delete punctuation symbols
    tokens = [i for i in tokens if ( i not in string.punctuation )]

    #cleaning words
    stemmer = SnowballStemmer("english")
    
    tokens = [stemmer.stem(i) for i in tokens]

    return len(set(tokens))/len(tokens)

def compute_punctuation(file_text):
    
    punct_counter = 0
    
    for i in file_text:
        if i in string.punctuation:
            punct_counter += 1
            
    return punct_counter
    
    
    

In [8]:
df_train['cleaned_text'] = df_train.text.apply(tokenize_stem)
df_train['cleaned_text_string'] = df_train.cleaned_text.apply(' '.join)
df_train.head(n=3)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...


In [9]:
df_train['length']=df_train['cleaned_text_string'].apply(len)
df_train.head(n=3)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string,length
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116


In [10]:
df_hpl=df_train[df_train['author']=='HPL']
df_hpl.describe()

Unnamed: 0,length
count,5635.0
mean,93.395386
std,51.075096
min,7.0
25%,58.0
50%,85.0
75%,118.0
max,561.0


In [11]:
df_eap=df_train[df_train['author']=='EAP']
df_eap.describe()

Unnamed: 0,length
count,7900.0
mean,81.543165
std,60.100183
min,5.0
25%,40.0
50%,66.0
75%,106.0
max,925.0


In [12]:
df_mws=df_train[df_train['author']=='MWS']
df_mws.describe()

Unnamed: 0,length
count,6044.0
mean,86.124586
std,71.976281
min,4.0
25%,48.0
50%,74.0
75%,108.0
max,2715.0


In [13]:
# как мы будем эту штуку правильнее делать (возможно это жуткий костыль), я хз
# сначала создаем словарь где ключ - уникальное слово, а значение - его порядковый номер
# затем создаем разреженную матрицу, которую заполняем в зависимости от порядковых номеров 
word_dict = {}



In [14]:
#делаю сет со всеми словами
# и сразу заготовку под шапку(потом увидишь зачем)
counter = 0
head = []

for wordlist in df_train['cleaned_text']:
    for word in wordlist:
        if word not in word_dict:
            head.append(word)
            word_dict[word] = counter
            counter += 1


In [15]:
len(head)

15230

In [16]:
# видоизменять колонки в pandas руками по одному значению в строке или столбце - очень плохая идея
# колонка это numpy.ndarray, а значит при каждой итерации она будет пересоздаваться
# что угробит производительность
# делаем значит так. считаем где сколько и где встречались отдельные слова, затем создаем строку за строкой для 
# каждого предложения

list_of_lists = []

for wordlist in df_train['cleaned_text']:
    row = [0 for i in range(len(word_dict))]
    for word in wordlist:
        row[word_dict[word]] += 1
    list_of_lists.append(row)



In [17]:
print(len(list_of_lists))


19579


In [18]:
# ... и для того чтобы посмотреть встречаемость того или иного слова по авторам добавим такую колонку

count_frame = pd.DataFrame(list_of_lists)
count_frame['author'] = df_train['author']



In [19]:
count_frame.columns = head + ['author']

In [20]:
print(count_frame.head())


   this  process  howev  afford  mean  ascertain  dimens  dungeon  i  might  \
0     1        1      1       1     1          1       1        1  2      1   
1     0        0      0       0     0          0       0        0  0      1   
2     0        0      0       0     0          0       0        0  0      0   
3     0        0      0       0     0          0       0        0  0      0   
4     0        0      0       0     0          0       0        0  0      0   

    ...    aegidus  burr  bentley  waltzer  binder  brusqueri  adriat  ancona  \
0   ...          0     0        0        0       0          0       0       0   
1   ...          0     0        0        0       0          0       0       0   
2   ...          0     0        0        0       0          0       0       0   
3   ...          0     0        0        0       0          0       0       0   
4   ...          0     0        0        0       0          0       0       0   

   agir  author  
0     0     EAP  
1 

In [21]:
# Пока объединим все, потом может быть будем использовать
col=list(count_frame.columns)
col[-1]='author_name'
count_frame.columns=col
pivot_col=pd.pivot_table(count_frame, aggfunc=np.sum, values=col, index=['author_name'])

In [22]:
#Убираем лишние слова, которые не учли раньше
col=list(pivot_col.columns)
col2=[string for string in col if (string[0]!='"' and string[0]!="'"
                                  and string[0]!='.' and string[0]!='`'
                                   and len(string)>3 and '.' not in string)]
col=[]
pivot_col=pivot_col[col2]
pivot_col.head()

Unnamed: 0_level_0,aaem,aback,abaft,abandon,abaout,abas,abash,abat,abbey,abbrevi,...,æmilianus,æneid,ærial,æronaut,ærostat,æschylus,élite,émeut,οἶδα,υπνος
author_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EAP,1,2,0,22,0,2,1,2,3,2,...,0,0,1,3,1,1,1,1,0,0
HPL,0,0,0,17,24,0,1,3,0,0,...,2,1,0,0,0,0,0,0,2,1
MWS,0,0,1,9,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:

#Create pivot
pivot_col=pivot_col.append(pivot_col.sum(), ignore_index=True)
pivot_col.index=['EAP', 'HPL', 'MWS', 'SUMA']
pivot_col.head()

Unnamed: 0,aaem,aback,abaft,abandon,abaout,abas,abash,abat,abbey,abbrevi,...,æmilianus,æneid,ærial,æronaut,ærostat,æschylus,élite,émeut,οἶδα,υπνος
EAP,1,2,0,22,0,2,1,2,3,2,...,0,0,1,3,1,1,1,1,0,0
HPL,0,0,0,17,24,0,1,3,0,0,...,2,1,0,0,0,0,0,0,2,1
MWS,0,0,1,9,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
SUMA,1,2,1,48,24,2,2,6,5,2,...,2,1,1,3,1,1,1,1,2,1


In [24]:
summa=[pivot_col.loc['EAP'].sum(), pivot_col.loc['HPL'].sum(), 
       pivot_col.loc['MWS'].sum(), pivot_col.loc['SUMA'].sum()]
pivot_col['summa']=summa
pivot_col.head()

Unnamed: 0,aaem,aback,abaft,abandon,abaout,abas,abash,abat,abbey,abbrevi,...,æneid,ærial,æronaut,ærostat,æschylus,élite,émeut,οἶδα,υπνος,summa
EAP,1,2,0,22,0,2,1,2,3,2,...,0,1,3,1,1,1,1,0,0,87765
HPL,0,0,0,17,24,0,1,3,0,0,...,1,0,0,0,0,0,0,2,1,74269
MWS,0,0,1,9,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,73160
SUMA,1,2,1,48,24,2,2,6,5,2,...,1,1,3,1,1,1,1,2,1,235194


In [25]:
# Create probability of author text knowing that a word was used
pivot_part=pivot_col
pivot_part.loc['EAP']=pivot_col.loc['EAP']/pivot_col.loc['SUMA']
pivot_part.loc['HPL']=pivot_col.loc['HPL']/pivot_col.loc['SUMA']
pivot_part.loc['MWS']=pivot_col.loc['MWS']/pivot_col.loc['SUMA']
pivot_part=pivot_part.loc[['EAP', 'HPL', 'MWS']]
# Delete unique words
pivot_part=pivot_part.loc[:, (pivot_part!=1).all(axis=0)]
pivot_part.head()

Unnamed: 0,abandon,abash,abat,abbey,abdic,aberr,abhor,abhorr,abil,abject,...,younger,youngest,your,youth,zeal,zenith,zest,zigzag,zone,summa
EAP,0.458333,0.5,0.333333,0.6,0.142857,0.166667,0.058824,0.111111,0.789474,0.333333,...,0.272727,0.2,0.534884,0.101562,0.117647,0.4,0.2,0.4,0.666667,0.37316
HPL,0.354167,0.5,0.5,0.0,0.0,0.666667,0.235294,0.555556,0.052632,0.0,...,0.0,0.4,0.069767,0.429688,0.470588,0.6,0.2,0.6,0.333333,0.315778
MWS,0.1875,0.0,0.166667,0.4,0.857143,0.166667,0.705882,0.333333,0.157895,0.666667,...,0.727273,0.4,0.395349,0.46875,0.411765,0.0,0.6,0.0,0.0,0.311062


In [26]:

# It will be easier to work this way
eap_dict=pivot_part.loc['EAP'].to_dict()
hpl_dict=pivot_part.loc['HPL'].to_dict()
mws_dict=pivot_part.loc['MWS'].to_dict()
eap_dict['word']

0.44859813084112149

In [27]:

# Create author score 
def ind_val_eap(listn):
    quant=0
    for word in listn:
        try:
            quant+=eap_dict[word]
        except KeyError:
            quant+=0
    return quant

def ind_val_hpl(listn):
    quant=0
    for word in listn:
        try:
            quant+=hpl_dict[word]
        except KeyError:
            quant+=0
    return quant

def ind_val_mws(listn):
    quant=0
    for word in listn:
        try:
            quant+=mws_dict[word]
        except KeyError:
            quant+=0
    return quant

In [28]:
df_train['mws_index']=df_train['cleaned_text'].apply(ind_val_mws)/df_train['length']
df_train['eap_index']=df_train['cleaned_text'].apply(ind_val_eap)/df_train['length']
df_train['hpl_index']=df_train['cleaned_text'].apply(ind_val_hpl)/df_train['length']
df_train.head(n=3)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string,length,mws_index,eap_index,hpl_index
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145,0.035935,0.074388,0.034504
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38,0.03586,0.06098,0.034739
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116,0.0269,0.047832,0.037337


In [29]:

#Transform authors' names to numeric
df_train['author']=df_train['author'].astype('category')
df_train['author2']=df_train['author'].cat.codes
# Create different features 
df_train['length2']=df_train['length']**2
df_train['mws']=df_train['mws_index']*df_train['length']
df_train['eap']=df_train['eap_index']*df_train['length']
df_train['hpl']=df_train['hpl_index']*df_train['length']
df_train.head(n=3)
mid = df_train['author2']
df_train.drop(labels=['author2'], axis=1,inplace = True)
df_train.insert(0, 'author2', mid)

In [29]:
# выглядить довольно отстойно

In [30]:

df_train.head(n=5)

Unnamed: 0,author2,id,text,author,cleaned_text,cleaned_text_string,length,mws_index,eap_index,hpl_index,length2,mws,eap,hpl
0,0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145,0.035935,0.074388,0.034504,21025,5.210565,10.78633,5.003105
1,1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38,0.03586,0.06098,0.034739,1444,1.362682,2.317251,1.320066
2,0,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116,0.0269,0.047832,0.037337,13456,3.120422,5.548524,4.331054
3,2,id27763,How lovely is spring As we looked from Windsor...,MWS,"[how, love, spring, as, look, windsor, terrac,...",how love spring as look windsor terrac sixteen...,144,0.07185,0.033438,0.033601,20736,10.346342,4.815096,4.838562
4,1,id12958,"Finding nothing else, not even gold, the Super...",HPL,"[find, noth, els, even, gold, superintend, aba...",find noth els even gold superintend abandon at...,102,0.036859,0.056661,0.043735,10404,3.75959,5.779399,4.461011


In [30]:
# попробуем не просто tf-idf посчитать, а слить все предложения для каждого писателя в один большой массив слов
# и после этого дальше считаем tf-idf для каждого слова при этом только для трех документов

In [31]:
raw_documents_authors = ['', '', '']


for index, row in df_train.iterrows():
    
    if row['author'] == 'EAP':
        raw_documents_authors[0] += row['cleaned_text_string'] + ' '
    elif row['author'] == 'HPL':
        raw_documents_authors[1] += row['cleaned_text_string'] + ' '
    else:
        raw_documents_authors[2] += row['cleaned_text_string'] + ' '


In [32]:
# удалим уникальные слова, не встречающиеся у других писателей

eap_only = set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[1].split(' ')) - set(raw_documents_authors[2].split(' '))
hpl_only = set(raw_documents_authors[1].split(' ')) - set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[2].split(' '))
msh_only = set(raw_documents_authors[2].split(' ')) - set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[1].split(' '))

unique_words = eap_only.union(hpl_only).union(msh_only)

In [33]:
len(unique_words)

8149

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word')
idf_matrix =  tf.fit_transform(raw_documents_authors)
feature_names = tf.get_feature_names()
# dictionary_word = dict(zip(feature_names, idf_matrix))



In [35]:
dense_idf = [i.todense() for i in idf_matrix]
print(dense_idf)

[matrix([[ 0.00053369,  0.00053369,  0.00106738, ...,  0.00053369,
          0.        ,  0.        ]]), matrix([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.00135189,  0.00067595]]), matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])]


In [36]:
#print(dense_idf[0].tolist())

In [36]:
max_weighted_term = []

eap_dense_list = dense_idf[0].tolist()[0]
hpl_dense_list = dense_idf[1].tolist()[0]
mws_dense_list = dense_idf[2].tolist()[0]

for inum, i in enumerate(eap_dense_list):
    max_weighted_term.append(max(hpl_dense_list[inum], mws_dense_list[inum], 
                             i))

In [37]:
print(len(max_weighted_term))
print(len(feature_names))

15120
15120


In [38]:
max_tf_dict = dict(zip(feature_names, max_weighted_term))

In [None]:
# найдем теперь топ 20 слов по tf_idf

In [39]:
# TODO: вот это причесать и отавтоматизировать

def extract_top_words(tfidfdict, numwrd):

    top_word_dict, min_value, min_key = {}, 99, ''
    

    for k, v in max_tf_dict.items():
        # print(top_word_dict.values())
        # print(v)
        if k not in unique_words:
        
            if len(top_word_dict) < numwrd:
                top_word_dict[k] = v
                if v <= min_value:
                    min_key = k
            else:
                # print(v, min(list(top_word_dict.values())))
                if v > min(list(top_word_dict.values())):

                    min_value = min(top_word_dict.values())

                    for ky, va in top_word_dict.items():
                        if va == min_value:
                            min_key = ky

                    top_word_dict.pop(min_key)
                    top_word_dict[k] = v
                
    return top_word_dict
            

In [40]:
# так не годится. нужно дополнительно почтистить стоп слова


stop_words = get_stop_words('english')

max_tf_dict_clone = max_tf_dict

for wrd in stop_words:
    try:
        max_tf_dict_clone.pop(wrd)
    except KeyError:
        pass


In [41]:
another_top_words_dict = extract_top_words(max_tf_dict_clone, 200)

In [42]:
another_top_words_dict

{'affect': 0.04468304482835414,
 'almost': 0.05070160384633806,
 'alon': 0.03824565701409973,
 'although': 0.053900383459694244,
 'among': 0.04619772431406106,
 'ancient': 0.04990315339206502,
 'anoth': 0.037127946123696375,
 'appear': 0.0706063502629913,
 'around': 0.03952329748651549,
 'away': 0.04910470293779198,
 'back': 0.05389540566343022,
 'beauti': 0.048469743542621437,
 'becam': 0.04809107367119471,
 'becom': 0.043168365342647216,
 'began': 0.042717099303607654,
 'beyond': 0.05668998225338586,
 'black': 0.0534961804362937,
 'bodi': 0.049172279647440365,
 'call': 0.04570500351845418,
 'came': 0.08623264906148835,
 'certain': 0.06427526156897975,
 'chang': 0.060208509556850066,
 'charact': 0.03908565818129875,
 'citi': 0.05629075702624935,
 'come': 0.07505434270166579,
 'continu': 0.03862432688552646,
 'countri': 0.04392570508550068,
 'cours': 0.040661692785383376,
 'dark': 0.05908533361620499,
 'day': 0.10905692297089824,
 'de': 0.04192252046865108,
 'dear': 0.0416536858569403,

In [None]:
# будем считать что эти слова влияют на то автор это или нет

In [43]:
high_tf_idf_words_columns = list(another_top_words_dict.keys())

def count_topwords(target_df):

    for word in high_tf_idf_words_columns:
        
        # TODO: костыль, нужен, когда у нас уже есть такие столбцы
        # в датасете
#         try:
#             target_df = target_df.drop(word, 1)
#         except ValueError:
#             pass
        

        def count_numwords(collist):
            value = 0

            for wd in collist:
                if wd == word:
                    value += 1
            return value


        target_df[word] = target_df.cleaned_text.apply(count_numwords)
        print(target_df.columns.values)



In [44]:
high_tf_idf_words_columns

['stone',
 'folk',
 'idea',
 'men',
 'although',
 'year',
 'know',
 'mani',
 'love',
 'express',
 'make',
 'dear',
 'moment',
 'littl',
 'room',
 'return',
 'power',
 'sun',
 'us',
 'reason',
 'visit',
 'dark',
 'father',
 'say',
 'seen',
 'felt',
 'good',
 'life',
 'though',
 'part',
 'pain',
 'began',
 'must',
 'toward',
 'raymond',
 'ancient',
 'continu',
 'evid',
 'soon',
 'made',
 'among',
 'face',
 'eye',
 'tell',
 'live',
 'happi',
 'man',
 'everi',
 'beyond',
 'hous',
 'passion',
 'human',
 'word',
 'hand',
 'back',
 'horror',
 'never',
 'observ',
 'yet',
 'hill',
 'shall',
 'strang',
 'hour',
 'de',
 'floor',
 'inde',
 'immedi',
 'manner',
 'tear',
 'time',
 'west',
 'saw',
 'pass',
 'miseri',
 'hope',
 'death',
 'hideous',
 'someth',
 'let',
 'die',
 'way',
 'howev',
 'fear',
 'spirit',
 'left',
 'new',
 'mad',
 'voic',
 'cours',
 'window',
 'two',
 'last',
 'enter',
 'remain',
 'water',
 'god',
 'sound',
 'think',
 'around',
 'even',
 'black',
 'appear',
 'matter',
 'heard',

In [45]:
# теперь добавим еще стилометрических фич 
# лексическое разнообразие - отношение числа уникальных слов к числу слов в предложении
# интересно, для этого нужен неочищенный текст

df_train['lexical_diversity'] = df_train.text.apply(lexical_diversity)
df_train['punctuation_count'] = df_train.text.apply(compute_punctuation)


In [46]:
# теперь загружвем тестовые данные

df_test = pd.read_csv("../data/test.csv")
df_test.head(n=3)



Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...


In [47]:
df_test['cleaned_text'] = df_test.text.apply(tokenize_stem)
df_test['cleaned_text_string'] = df_test.cleaned_text.apply(' '.join)


In [48]:

df_test['length']=df_test['cleaned_text_string'].apply(len)

In [49]:
df_test['mws_index']=df_test['cleaned_text'].apply(ind_val_mws)/df_test['length']
df_test['eap_index']=df_test['cleaned_text'].apply(ind_val_eap)/df_test['length']
df_test['hpl_index']=df_test['cleaned_text'].apply(ind_val_hpl)/df_test['length']
df_test.head(n=3)
df_test['mws']=df_test['mws_index']*df_test['length']
df_test['eap']=df_test['eap_index']*df_test['length']
df_test['hpl']=df_test['hpl_index']*df_test['length']
df_test.head(n=3)

Unnamed: 0,id,text,cleaned_text,cleaned_text_string,length,mws_index,eap_index,hpl_index,mws,eap,hpl
0,id02310,"Still, as I urged our leaving Ireland with suc...","[still, i, urg, leav, ireland, inquietud, impa...",still i urg leav ireland inquietud impati fath...,67,0.071473,0.036315,0.026541,4.788659,2.433094,1.778246
1,id24541,"If a fire wanted fanning, it could readily be ...","[if, fire, want, fan, could, readili, fan, new...",if fire want fan could readili fan newspap gov...,181,0.035932,0.062884,0.039306,6.503682,11.381946,7.114372
2,id00134,And when they had broken down the frail door t...,"[and, broken, frail, door, found, two, clean, ...",and broken frail door found two clean pick hum...,114,0.027486,0.055142,0.057723,3.133403,6.286216,6.580381


In [51]:
count_topwords(df_test)

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'althou

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'm

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' '

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope' 'death' 'hideous' 'someth'
 'let' 'die' 'way' 'howev' 'fear' 'spirit' 'left' 'new' 'mad' 'voic'
 'cours' 'window']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope' 'death' 'hideous' 'someth'
 'let' 'die' 'way' 'howev' 'fear' 'spirit' 'left' 'new' 'mad' 'voic'
 'cours' 'window' 'two' 'last' 'enter' 'remain' 'water' 'god' 'sound'
 'think' 'around' 'even' 'black' 'appear' 'matter' 'heard']
['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_inde

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope' 'death' 'hideous' 'someth'
 'let' 'die' 'way' 'howev' 'fear' 'spirit' 'left' 'new' 'mad' 'voic'
 'cours' 'window' 'two' 'last' 'enter' 'remain' 'water' 'god' 'sound'
 'think' 'around' 'even' 'black' 'appear' 'matter' 'heard' 'point' 'bodi'
 'general' 'look' 'soul' 'dream' 'despair' 'alon' 'found' 'far']
['

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope' 'death' 'hideous' 'someth'
 'let' 'die' 'way' 'howev' 'fear' 'spirit' 'left' 'new' 'mad' 'voic'
 'cours' 'window' 'two' 'last' 'enter' 'remain' 'water' 'god' 'sound'
 'think' 'around' 'even' 'black' 'appear' 'matter' 'heard' 'point' 'bodi'
 'general' 'look' 'soul' 'dream' 'despair' 'alon' 'found' 'far' 'ol

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope' 'death' 'hideous' 'someth'
 'let' 'die' 'way' 'howev' 'fear' 'spirit' 'left' 'new' 'mad' 'voic'
 'cours' 'window' 'two' 'last' 'enter' 'remain' 'water' 'god' 'sound'
 'think' 'around' 'even' 'black' 'appear' 'matter' 'heard' 'point' 'bodi'
 'general' 'look' 'soul' 'dream' 'despair' 'alon' 'found' 'far' 'ol

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope' 'death' 'hideous' 'someth'
 'let' 'die' 'way' 'howev' 'fear' 'spirit' 'left' 'new' 'mad' 'voic'
 'cours' 'window' 'two' 'last' 'enter' 'remain' 'water' 'god' 'sound'
 'think' 'around' 'even' 'black' 'appear' 'matter' 'heard' 'point' 'bodi'
 'general' 'look' 'soul' 'dream' 'despair' 'alon' 'found' 'far' 'ol

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope' 'death' 'hideous' 'someth'
 'let' 'die' 'way' 'howev' 'fear' 'spirit' 'left' 'new' 'mad' 'voic'
 'cours' 'window' 'two' 'last' 'enter' 'remain' 'water' 'god' 'sound'
 'think' 'around' 'even' 'black' 'appear' 'matter' 'heard' 'point' 'bodi'
 'general' 'look' 'soul' 'dream' 'despair' 'alon' 'found' 'far' 'ol

['id' 'text' 'cleaned_text' 'cleaned_text_string' 'length' 'mws_index'
 'eap_index' 'hpl_index' 'mws' 'eap' 'hpl' 'stone' 'folk' 'idea' 'men'
 'although' 'year' 'know' 'mani' 'love' 'express' 'make' 'dear' 'moment'
 'littl' 'room' 'return' 'power' 'sun' 'us' 'reason' 'visit' 'dark'
 'father' 'say' 'seen' 'felt' 'good' 'life' 'though' 'part' 'pain' 'began'
 'must' 'toward' 'raymond' 'ancient' 'continu' 'evid' 'soon' 'made' 'among'
 'face' 'eye' 'tell' 'live' 'happi' 'man' 'everi' 'beyond' 'hous' 'passion'
 'human' 'word' 'hand' 'back' 'horror' 'never' 'observ' 'yet' 'hill'
 'shall' 'strang' 'hour' 'de' 'floor' 'inde' 'immedi' 'manner' 'tear'
 'time' 'west' 'saw' 'pass' 'miseri' 'hope' 'death' 'hideous' 'someth'
 'let' 'die' 'way' 'howev' 'fear' 'spirit' 'left' 'new' 'mad' 'voic'
 'cours' 'window' 'two' 'last' 'enter' 'remain' 'water' 'god' 'sound'
 'think' 'around' 'even' 'black' 'appear' 'matter' 'heard' 'point' 'bodi'
 'general' 'look' 'soul' 'dream' 'despair' 'alon' 'found' 'far' 'ol

In [52]:
df_test['lexical_diversity'] = df_test.text.apply(lexical_diversity)
df_test['punctuation_count'] = df_test.text.apply(compute_punctuation)

In [53]:
df_test['length2']=df_test['length']**2

In [54]:
df_test.columns

Index(['id', 'text', 'cleaned_text', 'cleaned_text_string', 'length',
       'mws_index', 'eap_index', 'hpl_index', 'mws', 'eap',
       ...
       'upon', 'world', 'whose', 'moon', 'endeavour', 'within', 'present',
       'lexical_diversity', 'punctuation_count', 'length2'],
      dtype='object', length=213)

In [55]:
df_train.columns

Index(['author2', 'id', 'text', 'author', 'cleaned_text',
       'cleaned_text_string', 'length', 'mws_index', 'eap_index', 'hpl_index',
       ...
       'whole', 'thus', 'day', 'upon', 'world', 'whose', 'moon', 'endeavour',
       'within', 'present'],
      dtype='object', length=215)

In [56]:
df_train = df_train.drop(['lexical_diversity', 'punctuation_count', 'length2'], 1)

df_train['lexical_diversity'] = df_train.text.apply(lexical_diversity)
df_train['punctuation_count'] = df_train.text.apply(compute_punctuation)

df_train['length2']=df_train['length']**2


In [57]:
df_test = df_test.drop(['lexical_diversity', 'punctuation_count', 'length2'], 1)

df_test['lexical_diversity'] = df_test.text.apply(lexical_diversity)
df_test['punctuation_count'] = df_test.text.apply(compute_punctuation)

df_test['length2']=df_test['length']**2

In [59]:
# df_train = df_train.drop(['lexical_diversity', 'punctuation_count', 'length2'], 1)

In [58]:
X, Y = df_train.values[:, 6:], df_train.values[:, 3]

In [59]:
from sklearn.ensemble import GradientBoostingClassifier

In [60]:
# df_train = df_train.drop(['mwseap', 'eaphpl', 'hplmws'], 1)

# df_test = df_test.drop(['mwseap', 'eaphpl', 'hplmws'], 1)

ValueError: labels ['mwseap' 'eaphpl' 'hplmws'] not contained in axis

In [61]:
classifier = GradientBoostingClassifier()

classifier.fit(X, Y)



GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [77]:
# index = 0



# for i in X:
#     for j in i:
#         if pd.isnull(j) is True or j is None or j == np.inf:  
        
#             print(j)
#             print(i)
#             print(df_train.columns[6:])
#             index 
        
#     counter += 1
#     if counter == 5:
#         break

In [62]:
x_t = df_test.values[:, 4:]

In [63]:
probs = classifier.predict_proba(x_t)

In [64]:
probs.tolist()

[[0.01153451284831342, 0.008304767779220847, 0.9801607193724655],
 [0.9658422744103582, 0.021798146337602676, 0.012359579252039166],
 [0.05027612734482578, 0.9468380242377102, 0.002885848417464018],
 [0.7807300185485572, 0.21318500350522623, 0.006084977946216673],
 [0.956942347086469, 0.02536704503724599, 0.017690607876284802],
 [0.83345085812495, 0.15463376944620807, 0.011915372428842008],
 [0.5532673579531352, 0.42820831883909344, 0.01852432320777129],
 [0.04398094548336826, 0.12215825922452365, 0.8338607952921081],
 [0.9861194838820456, 0.011041069703689142, 0.0028394464142651116],
 [0.8426551124149351, 0.026791893316524894, 0.13055299426854006],
 [0.033850462286286465, 0.03380690494425442, 0.9323426327694592],
 [0.00647182864631432, 0.991563471583081, 0.0019646997706049773],
 [0.620609239433527, 0.27839050596399806, 0.101000254602475],
 [0.016175663244463935, 0.9807125921069283, 0.003111744648607717],
 [0.3445644952216204, 0.23558980086892065, 0.41984570390945897],
 [0.023144599194

In [117]:
probs.tolist()

[[0.009699427839926422, 0.006712903931690611, 0.9835876682283828],
 [0.9668294128134889, 0.018340974855920587, 0.014829612330590243],
 [0.04625253456814629, 0.9515654878658854, 0.0021819775659683606],
 [0.8030432618980553, 0.19176005475249863, 0.005196683349446022],
 [0.9539923431280052, 0.025884828903041206, 0.02012282796895354],
 [0.8500349101132699, 0.1381740978177463, 0.01179099206898364],
 [0.580158035266173, 0.39975988852420036, 0.020082076209626543],
 [0.03675653753535103, 0.11825004440334196, 0.8449934180613069],
 [0.9887592203341796, 0.009161156231411772, 0.002079623434408685],
 [0.8618205849534815, 0.020168482610602004, 0.11801093243591661],
 [0.03644919986789002, 0.019050270978284663, 0.9445005291538253],
 [0.0058333253980701666, 0.9927395329093721, 0.0014271416925577547],
 [0.6263002169652485, 0.27990885810474364, 0.09379092493000794],
 [0.008302802268121516, 0.9902197797471024, 0.0014774179847761447],
 [0.3418051381348325, 0.2587515194465739, 0.3994433424185937],
 [0.01872

In [65]:
classifier.classes_

array(['EAP', 'HPL', 'MWS'], dtype=object)

In [66]:
def save_prediction(prob_array):
    # сохранить предсказание при условии что вероятности
    # идут в нужном порядке
    head = 'id,EAP,HPL,MWS\n'
    
    with open("../data/export.csv", 'w') as ff:
        ff.write(head)
        
        for inum, i in enumerate(probs.tolist()):
            stttr = df_test.id.values[inum] + ','
            prob_str = ','.join([str(j) for j in i])
            stttr += prob_str + '\n'
            ff.write(stttr)
            
    
            
            

In [88]:
df_test.id.values

array(['id02310', 'id24541', 'id00134', ..., 'id13477', 'id13761',
       'id04282'], dtype=object)

In [67]:
save_prediction(probs)