In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
import string
from string import digits

from scipy.sparse import csr_matrix

In [38]:
from stop_words import get_stop_words

In [2]:
df_train = pd.read_csv("../data/train.csv")
# 3 столбца - id, text, author
df_train.head(n=3)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP


In [3]:
len(df_train)

19579

In [4]:
remove_digits = str.maketrans('', '', digits)
def tokenize_stem(file_text):
    #firstly let's apply nltk tokenization
    file_text = file_text.translate(remove_digits)
    try:
        tokens = nltk.word_tokenize(file_text)
    except:
        nltk.download('punkt')
        tokens = nltk.word_tokenize(file_text)
        

    #let's delete punctuation symbols
    tokens = [i for i in tokens if ( i not in string.punctuation )]

    #deleting stop_words
    try:
        stop_words = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stop_words = stopwords.words('english')
    tokens = [i for i in tokens if ( i not in stop_words )]

    #cleaning words
    stemmer = SnowballStemmer("english")
    
    tokens = [stemmer.stem(i) for i in tokens]

    return tokens

In [5]:
df_train['cleaned_text'] = df_train.text.apply(tokenize_stem)
df_train['cleaned_text_string'] = df_train.cleaned_text.apply(' '.join)
df_train.head(n=3)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...


In [6]:
df_train['length']=df_train['cleaned_text_string'].apply(len)
df_train.head(n=3)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string,length
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116


In [7]:
df_hpl=df_train[df_train['author']=='HPL']
df_hpl.describe()

Unnamed: 0,length
count,5635.0
mean,93.395386
std,51.075096
min,7.0
25%,58.0
50%,85.0
75%,118.0
max,561.0


In [8]:
df_eap=df_train[df_train['author']=='EAP']
df_eap.describe()

Unnamed: 0,length
count,7900.0
mean,81.543165
std,60.100183
min,5.0
25%,40.0
50%,66.0
75%,106.0
max,925.0


In [9]:
df_mws=df_train[df_train['author']=='MWS']
df_mws.describe()

Unnamed: 0,length
count,6044.0
mean,86.124586
std,71.976281
min,4.0
25%,48.0
50%,74.0
75%,108.0
max,2715.0


In [10]:
# как мы будем эту штуку правильнее делать (возможно это жуткий костыль), я хз
# сначала создаем словарь где ключ - уникальное слово, а значение - его порядковый номер
# затем создаем разреженную матрицу, которую заполняем в зависимости от порядковых номеров 
word_dict = {}



In [11]:
#делаю сет со всеми словами
# и сразу заготовку под шапку(потом увидишь зачем)
counter = 0
head = []

for wordlist in df_train['cleaned_text']:
    for word in wordlist:
        if word not in word_dict:
            head.append(word)
            word_dict[word] = counter
            counter += 1


In [12]:
len(head)

15230

In [13]:
# видоизменять колонки в pandas руками по одному значению в строке или столбце - очень плохая идея
# колонка это numpy.ndarray, а значит при каждой итерации она будет пересоздаваться
# что угробит производительность
# делаем значит так. считаем где сколько и где встречались отдельные слова, затем создаем строку за строкой для 
# каждого предложения

list_of_lists = []

for wordlist in df_train['cleaned_text']:
    row = [0 for i in range(len(word_dict))]
    for word in wordlist:
        row[word_dict[word]] += 1
    list_of_lists.append(csr_matrix(row))



In [14]:
list_of_lists[0]

<1x15230 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [15]:
# ... и для того чтобы посмотреть встречаемость того или иного слова по авторам добавим такую колонку

count_frame = pd.DataFrame(list_of_lists)
count_frame['author'] = df_train['author']



In [17]:
count_frame.head()

Unnamed: 0,0,author
0,"(0, 0)\t1\n (0, 1)\t1\n (0, 2)\t1\n (0, 3...",EAP
1,"(0, 9)\t1\n (0, 23)\t1\n (0, 24)\t1\n (0,...",HPL
2,"(0, 29)\t1\n (0, 30)\t1\n (0, 31)\t1\n (0...",EAP
3,"(0, 48)\t1\n (0, 49)\t1\n (0, 50)\t1\n (0...",MWS
4,"(0, 32)\t1\n (0, 52)\t1\n (0, 70)\t1\n (0...",HPL


In [18]:
# выглядить довольно отстойно

In [19]:

df_train.head(n=5)

Unnamed: 0,id,text,author,cleaned_text,cleaned_text_string,length
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, howev, afford, mean, ascertain...",this process howev afford mean ascertain dimen...,145
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, occur, fumbl, might, mere, mistak]",it never occur fumbl might mere mistak,38
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, left, hand, gold, snuff, box, caper, hill...",in left hand gold snuff box caper hill cut man...,116
3,id27763,How lovely is spring As we looked from Windsor...,MWS,"[how, love, spring, as, look, windsor, terrac,...",how love spring as look windsor terrac sixteen...,144
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,"[find, noth, els, even, gold, superintend, aba...",find noth els even gold superintend abandon at...,102


In [20]:
# попробуем не просто tf-idf посчитать, а слить все предложения для каждого писателя в один большой массив слов
# и после этого дальше считаем tf-idf для каждого слова при этом только для трех документов

In [21]:
raw_documents_authors = ['', '', '']


for index, row in df_train.iterrows():
    
    if row['author'] == 'EAP':
        raw_documents_authors[0] += row['cleaned_text_string'] + ' '
    elif row['author'] == 'HPL':
        raw_documents_authors[1] += row['cleaned_text_string'] + ' '
    else:
        raw_documents_authors[2] += row['cleaned_text_string'] + ' '


In [52]:
# удалим уникальные слова, не встречающиеся у других писателей

eap_only = set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[1].split(' ')) - set(raw_documents_authors[2].split(' '))
hpl_only = set(raw_documents_authors[1].split(' ')) - set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[2].split(' '))
msh_only = set(raw_documents_authors[2].split(' ')) - set(raw_documents_authors[0].split(' ')) - set(raw_documents_authors[1].split(' '))

unique_words = eap_only.union(hpl_only).union(msh_only)

In [54]:
len(unique_words)

8149

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word')
idf_matrix =  tf.fit_transform(raw_documents_authors)
feature_names = tf.get_feature_names()
# dictionary_word = dict(zip(feature_names, idf_matrix))



In [23]:
dense_idf = [i.todense() for i in idf_matrix]
print(dense_idf)

[matrix([[ 0.00053369,  0.00053369,  0.00106738, ...,  0.00053369,
          0.        ,  0.        ]]), matrix([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.00135189,  0.00067595]]), matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])]


In [24]:
#print(dense_idf[0].tolist())

In [25]:
max_weighted_term = []

eap_dense_list = dense_idf[0].tolist()[0]
hpl_dense_list = dense_idf[1].tolist()[0]
mws_dense_list = dense_idf[2].tolist()[0]

for inum, i in enumerate(eap_dense_list):
    max_weighted_term.append(max(hpl_dense_list[inum], mws_dense_list[inum], 
                             i))

In [26]:
print(len(max_weighted_term))
print(len(feature_names))

15120
15120


In [27]:
max_tf_dict = dict(zip(feature_names, max_weighted_term))

In [None]:
# найдем теперь топ 20 слов по tf_idf

In [55]:
# TODO: вот это причесать и отавтоматизировать

def extract_top_words(tfidfdict, numwrd):

    top_word_dict, min_value, min_key = {}, 99, ''
    

    for k, v in max_tf_dict.items():
        # print(top_word_dict.values())
        # print(v)
        if k not in unique_words:
        
            if len(top_word_dict) < numwrd:
                top_word_dict[k] = v
                if v <= min_value:
                    min_key = k
            else:
                # print(v, min(list(top_word_dict.values())))
                if v > min(list(top_word_dict.values())):

                    min_value = min(top_word_dict.values())

                    for ky, va in top_word_dict.items():
                        if va == min_value:
                            min_key = ky

                    top_word_dict.pop(min_key)
                    top_word_dict[k] = v
                
    return top_word_dict
            

In [56]:
# так не годится. нужно дополнительно почтистить стоп слова


stop_words = get_stop_words('english')

max_tf_dict_clone = max_tf_dict

for wrd in stop_words:
    try:
        max_tf_dict_clone.pop(wrd)
    except KeyError:
        pass


In [57]:
another_top_words_dict = extract_top_words(max_tf_dict_clone, 30)

In [58]:
another_top_words_dict

{'came': 0.08623264906148835,
 'day': 0.10905692297089824,
 'even': 0.11284362168516554,
 'everi': 0.08823008004242809,
 'eye': 0.10829958322804477,
 'feel': 0.10186219541379037,
 'heart': 0.10981426271375169,
 'hope': 0.08633673068529443,
 'hous': 0.10260088337408568,
 'life': 0.12647573705652781,
 'like': 0.11817066723240997,
 'littl': 0.08731231706628834,
 'love': 0.16131336522778697,
 'man': 0.11178306359822565,
 'might': 0.10186219541379037,
 'night': 0.1069923608725874,
 'old': 0.15649628903751592,
 'one': 0.21181905078897387,
 'raymond': 0.13165372929973168,
 'said': 0.11221366381082544,
 'saw': 0.09421715360421876,
 'say': 0.10527911155285308,
 'seem': 0.1285505231379595,
 'thing': 0.17286452335011324,
 'though': 0.09341870314994571,
 'thought': 0.08823008004242809,
 'time': 0.11218228882536217,
 'upon': 0.3230870938373485,
 'us': 0.10299820502807056,
 'yet': 0.12041701911370013}

In [None]:
# будем считать что эти слова влияют на то автор это или нет