**Создание таблицы со статистическими признаками произведений корпуса**

Импорт Lensky и необходимых библиотек

In [1]:
from Lensky.lensky import *
import pandas as pd
import seaborn as sns

Загрузка корпуса по адресу

In [2]:
corpus = Corpus('data/corpus_ru')

Loading...


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [00:06<00:00, 27.61it/s]

Done!





In [3]:
print('Авторов в корпусе:', len(corpus.get_keys()))
print('Книг в корпусе:', len(corpus.get_subkeys()))

Авторов в корпусе: 180
Книг в корпусе: 755


Создание pd.DataFrame() с элементами корпуса

In [4]:
df = pd.DataFrame({'filename' : [x + '.txt' for x in corpus.get_subkeys()], 
                   'book' : list(map(lambda x: x.split('.')[1], corpus.get_subkeys())),
                   'author' : list(map(lambda x: x.split('.')[0], corpus.get_subkeys())),
                   'year' : list(map(lambda x: int(x.split('.')[2]), corpus.get_subkeys()))})
df

Unnamed: 0,filename,book,author,year
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867
...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015


Объявим функцию и по имени файла определим век написания книги

In [5]:
def cent(year):
    if year < 1801:
        return 18
    if year >= 1801 and year < 1901:
        return 19
    if year >= 1901 and year < 2001:
        return 20
    if year >= 2001:
        return 21

In [6]:
cents = []
for year in df['year'].to_numpy():
    cents.append(cent(year))
df['cent'] = cents
df

Unnamed: 0,filename,book,author,year,cent
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19
...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21


Определим длину произведений в словах

In [7]:
%%time
df['len'] = corpus.apply(len).get_subvalues()
df

Processing...


100%|████████████████████████████████████████████████████████████████████████████████████████| 180/180 [00:00<?, ?it/s]

Done!
Wall time: 0 ns





Unnamed: 0,filename,book,author,year,cent,len
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536
...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961


Загрузим предобработанный токенезированный корпус 

In [8]:
corpus_cleared = Corpus('data/corpus_ru_tokenized')

Loading...


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [00:05<00:00, 35.07it/s]

Done!





Определим количество уникальных слов в произведениях (богатство их словаря)

In [9]:
%%time
df['len_unique'] = corpus_cleared.apply(lambda x: len(np.unique(x.split()))).get_subvalues()
df

Processing...


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [00:18<00:00,  9.75it/s]

Done!
Wall time: 18.5 s





Unnamed: 0,filename,book,author,year,cent,len,len_unique
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909
...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269


Воспользуемся новым словарем частотных слов О. Н. Ляшевская, С. А. Шаров, Частотный словарь современного русского языка (на материалах Национального корпуса русского языка). М.: Азбуковник, 2009. для определения количества слов, не попадающих в 20 000 самых популярных в языке

In [10]:
dic = pd.read_csv('D:\DDPronin\PyCharm\RU Classic Literature graphs\data\LEMMY_FREQ.csv')
dic_words = set(dic['WORD'])

In [11]:
def uniq_words_not_in_pop(book):
    book = np.array(book.split())
    uniq = np.unique(book)
    n = 0
    for word in uniq:
        if word not in dic_words and len(book[book==word]) > 3:
            n += 1
    return n

In [12]:
%%time
df['uniq_words_not_in_pop'] = corpus_cleared.apply(uniq_words_not_in_pop).get_subvalues()
df

Processing...


100%|██████████████████████████████████████████████████████████████████████████████| 180/180 [1:36:01<00:00, 32.01s/it]

Done!
Wall time: 1h 36min 1s





Unnamed: 0,filename,book,author,year,cent,len,len_unique,uniq_words_not_in_pop
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555,469
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549,494
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731,491
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694,705
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909,923
...,...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392,348
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295,375
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275,2996
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269,2767


С помощью того же словаря оценим среднюю частоту ipm слова в произведении (какова средняя частота употребляемых в книге слов?)

In [13]:
def mean_freq(book):
    book = np.array(book.split())
    top = list(pd.Series(book).value_counts().index)[:1000]
    freqs = 0
    n = 0
    for word in top:
        if word in dic_words:
            freqs += dic[dic['WORD']==word]['FREQ'].to_numpy()[0]
            n += 1
    return freqs/n

In [14]:
%%time
df['mean_freq1k'] = corpus_cleared.apply(mean_freq).get_subvalues()
df

Processing...


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [04:09<00:00,  1.38s/it]

Done!
Wall time: 4min 9s





Unnamed: 0,filename,book,author,year,cent,len,len_unique,uniq_words_not_in_pop,mean_freq1k
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555,469,759.008835
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549,494,702.438997
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731,491,744.420935
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694,705,747.801066
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909,923,748.249222
...,...,...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392,348,749.065562
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295,375,795.539469
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275,2996,737.782698
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269,2767,699.971191


Воспользуемся библиотекой Dostoyevsky для классификации предложений текстов. Вычислим среднее по тексту для каждого из возможных классов

In [15]:
from razdel import sentenize
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)

def tonalnost(book):
    substrings = list(sentenize(book))
    sents = []
    for i in range(len(substrings)):
        sents.append(substrings[i].text)
    messages = sents
    results = model.predict(messages, k=2)
    neu, pos, neg, sp, sk = 0, 0, 0, 0, 0
    n = 0
    for message, sentiment in zip(messages, results):
        if 'neutral' not in sentiment.keys():
            sentiment['neutral'] = 0
        if 'positive' not in sentiment.keys():
            sentiment['positive'] = 0
        if 'negative' not in sentiment.keys():
            sentiment['negative'] = 0
        if 'speech' not in sentiment.keys():
            sentiment['speech'] = 0
        if 'skip' not in sentiment.keys():
            sentiment['skip'] = 0
        neu += sentiment['neutral']
        pos += sentiment['positive']
        neg += sentiment['negative']
        sp += sentiment['speech']
        sk += sentiment['skip']
        n += 1
    return {'neutral' : neu/n, 'positive' : pos/n, 'negative' : neg/n, 'speech' : sp/n, 'skip' : sk/n}



In [16]:
%%time
res = corpus.apply(tonalnost).get_subvalues()

Processing...


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [06:02<00:00,  2.01s/it]

Done!
Wall time: 6min 2s





In [17]:
neu = []
pos = []
neg = []
sp = []
sk = []
for r in res:
    neu.append(r['neutral'])
    pos.append(r['positive'])
    neg.append(r['negative'])
    sp.append(r['speech'])
    sk.append(r['skip'])
df['dost_neu'] = neu
df['dost_pos'] = pos
df['dost_neg'] = neg
df['dost_sp'] = sp
df['dost_sk'] = sk
df

Unnamed: 0,filename,book,author,year,cent,len,len_unique,uniq_words_not_in_pop,mean_freq1k,dost_neu,dost_pos,dost_neg,dost_sp,dost_sk
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555,469,759.008835,0.439804,0.080240,0.127926,0.004833,0.181318
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549,494,702.438997,0.459784,0.082918,0.138329,0.004442,0.116841
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731,491,744.420935,0.415810,0.107288,0.148392,0.003905,0.129035
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694,705,747.801066,0.489591,0.067748,0.115472,0.004542,0.163701
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909,923,748.249222,0.535464,0.058955,0.102114,0.002779,0.160927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392,348,749.065562,0.445304,0.101620,0.150788,0.002689,0.125544
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295,375,795.539469,0.487984,0.083779,0.144104,0.003416,0.113553
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275,2996,737.782698,0.447696,0.052052,0.156440,0.002084,0.120678
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269,2767,699.971191,0.503221,0.045603,0.125077,0.002296,0.158750


Найдем долю слов, которая не попадает в список 20 000 самых частотных слов в языке

In [18]:
%%time
df['part_unique'] = df['len_unique']/df['len']
df['part_uniq_words_not_in_pop'] = df['uniq_words_not_in_pop']/df['len_unique']
df

Wall time: 997 µs


Unnamed: 0,filename,book,author,year,cent,len,len_unique,uniq_words_not_in_pop,mean_freq1k,dost_neu,dost_pos,dost_neg,dost_sp,dost_sk,part_unique,part_uniq_words_not_in_pop
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555,469,759.008835,0.439804,0.080240,0.127926,0.004833,0.181318,0.042936,0.071548
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549,494,702.438997,0.459784,0.082918,0.138329,0.004442,0.116841,0.044341,0.065439
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731,491,744.420935,0.415810,0.107288,0.148392,0.003905,0.129035,0.043652,0.072946
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694,705,747.801066,0.489591,0.067748,0.115472,0.004542,0.163701,0.049521,0.055538
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909,923,748.249222,0.535464,0.058955,0.102114,0.002779,0.160927,0.045523,0.066360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392,348,749.065562,0.445304,0.101620,0.150788,0.002689,0.125544,0.053431,0.054443
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295,375,795.539469,0.487984,0.083779,0.144104,0.003416,0.113553,0.051659,0.059571
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275,2996,737.782698,0.447696,0.052052,0.156440,0.002084,0.120678,0.040058,0.095795
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269,2767,699.971191,0.503221,0.045603,0.125077,0.002296,0.158750,0.038248,0.101471


Рассчитаем среднюю длину слова в произведении

In [19]:
from razdel import tokenize, sentenize

def mean_word_len(book):
    subs = list(tokenize(book))
    words = []
    for sub in subs:
        words.append(len(sub.text))
    return np.mean(words)

In [20]:
%%time
df['mean_word_len'] = corpus.apply(mean_word_len).get_subvalues()
df

Processing...


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [04:24<00:00,  1.47s/it]

Done!
Wall time: 4min 24s





Unnamed: 0,filename,book,author,year,cent,len,len_unique,uniq_words_not_in_pop,mean_freq1k,dost_neu,dost_pos,dost_neg,dost_sp,dost_sk,part_unique,part_uniq_words_not_in_pop,mean_word_len
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555,469,759.008835,0.439804,0.080240,0.127926,0.004833,0.181318,0.042936,0.071548,4.045030
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549,494,702.438997,0.459784,0.082918,0.138329,0.004442,0.116841,0.044341,0.065439,4.338396
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731,491,744.420935,0.415810,0.107288,0.148392,0.003905,0.129035,0.043652,0.072946,4.159333
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694,705,747.801066,0.489591,0.067748,0.115472,0.004542,0.163701,0.049521,0.055538,4.218230
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909,923,748.249222,0.535464,0.058955,0.102114,0.002779,0.160927,0.045523,0.066360,4.189221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392,348,749.065562,0.445304,0.101620,0.150788,0.002689,0.125544,0.053431,0.054443,4.167650
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295,375,795.539469,0.487984,0.083779,0.144104,0.003416,0.113553,0.051659,0.059571,4.302829
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275,2996,737.782698,0.447696,0.052052,0.156440,0.002084,0.120678,0.040058,0.095795,4.534071
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269,2767,699.971191,0.503221,0.045603,0.125077,0.002296,0.158750,0.038248,0.101471,4.479854


Рассчитаем среднюю длину предложения в произведении

In [21]:
def mean_sent_len(book):
    subs = list(sentenize(book))
    sents = []
    for sub in subs:
        sents.append(len(list(tokenize(sub.text)))-1) 
    return np.mean(sents)

In [22]:
%%time
df['mean_sent_len'] = corpus.apply(mean_sent_len).get_subvalues()
df

Processing...


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [03:43<00:00,  1.24s/it]

Done!
Wall time: 3min 43s





Unnamed: 0,filename,book,author,year,cent,len,len_unique,uniq_words_not_in_pop,mean_freq1k,dost_neu,dost_pos,dost_neg,dost_sp,dost_sk,part_unique,part_uniq_words_not_in_pop,mean_word_len,mean_sent_len
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555,469,759.008835,0.439804,0.080240,0.127926,0.004833,0.181318,0.042936,0.071548,4.045030,16.316079
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549,494,702.438997,0.459784,0.082918,0.138329,0.004442,0.116841,0.044341,0.065439,4.338396,18.624033
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731,491,744.420935,0.415810,0.107288,0.148392,0.003905,0.129035,0.043652,0.072946,4.159333,16.986055
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694,705,747.801066,0.489591,0.067748,0.115472,0.004542,0.163701,0.049521,0.055538,4.218230,12.612826
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909,923,748.249222,0.535464,0.058955,0.102114,0.002779,0.160927,0.045523,0.066360,4.189221,12.075693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392,348,749.065562,0.445304,0.101620,0.150788,0.002689,0.125544,0.053431,0.054443,4.167650,11.298264
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295,375,795.539469,0.487984,0.083779,0.144104,0.003416,0.113553,0.051659,0.059571,4.302829,10.775581
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275,2996,737.782698,0.447696,0.052052,0.156440,0.002084,0.120678,0.040058,0.095795,4.534071,17.942318
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269,2767,699.971191,0.503221,0.045603,0.125077,0.002296,0.158750,0.038248,0.101471,4.479854,12.679253


Определим долю каждой из частей речи в произведениях

In [23]:
from navec import Navec
from slovnet import Morph
from razdel import tokenize
navec = Navec.load('models//navec_news_v1_1B_250K_300d_100q.tar')
morph = Morph.load('models//slovnet_morph_news_v1.tar')
morph.navec(navec)

def morphing(book):
    tokens = morph([x.text for x in list(tokenize(book))]).tokens
    res = []
    for token in tokens:
        res.append(token.pos)
    len_ = len(book.split())
    return {'ADJ' : res.count('ADJ')/len_, 
           'ADP' : res.count('ADP')/len_,
           'ADV' : res.count('ADV')/len_,
           'AUX' : res.count('AUX')/len_,
           'CCONJ' : res.count('CCONJ')/len_,
           'DET' : res.count('DET')/len_,
           'INTJ' : res.count('INTJ')/len_,
           'NOUN' : res.count('NOUN')/len_,
           'NUM' : res.count('NUM')/len_,
           'PART' : res.count('PART')/len_,
           'PRON' : res.count('PRON')/len_,
           'PROPN' : res.count('PROPN')/len_,
           'PUNCT' : res.count('PUNCT')/len_,
           'SCONJ' : res.count('SCONJ')/len_,
           'PUNCT' : res.count('PUNCT')/len_,
           'SYM' : res.count('SYM')/len_,
           'VERB' : res.count('VERB')/len_,
           'X' : res.count('X')/len_}

In [24]:
%%time
res = corpus.apply(morphing).get_subvalues()

Processing...


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [28:07<00:00,  9.37s/it]

Done!
Wall time: 28min 7s





In [25]:
m = {'ADJ' : [],
'ADP' : [],
'ADV' : [],
'AUX' : [],
'CCONJ' : [],
'DET' : [],
'INTJ' :[],
'NOUN' : [],
'NUM' : [],
'PART' : [],
'PRON' : [],
'PROPN' : [],
'PUNCT' : [],
'SCONJ' : [],
'SYM' : [],
'VERB' : [],
'X' : []}

In [26]:
for r in res:
    for key in m.keys():
        m[key].append(r[key])
for key in m.keys():
        df[key] = m[key]

In [27]:
df

Unnamed: 0,filename,book,author,year,cent,len,len_unique,uniq_words_not_in_pop,mean_freq1k,dost_neu,...,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555,469,759.008835,0.439804,...,0.164474,0.006164,0.055597,0.146570,0.048569,0.264046,0.030311,0.000000,0.167576,0.003809
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549,494,702.438997,0.459784,...,0.179738,0.007481,0.047394,0.118560,0.041129,0.246628,0.028709,0.000000,0.164996,0.000663
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731,491,744.420935,0.415810,...,0.168965,0.006130,0.047310,0.149082,0.033086,0.250108,0.031514,0.000039,0.166451,0.000825
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694,705,747.801066,0.489591,...,0.211074,0.007128,0.058478,0.110938,0.035516,0.311703,0.022050,0.000000,0.157011,0.009298
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909,923,748.249222,0.535464,...,0.196245,0.007834,0.060576,0.103942,0.036025,0.319947,0.027060,0.000000,0.157280,0.019945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392,348,749.065562,0.445304,...,0.197364,0.006226,0.040210,0.136920,0.037823,0.283958,0.025112,0.000000,0.178842,0.001608
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295,375,795.539469,0.487984,...,0.198243,0.005270,0.042263,0.131646,0.057556,0.263136,0.026608,0.000000,0.179695,0.000258
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275,2996,737.782698,0.447696,...,0.275472,0.007331,0.039981,0.041392,0.037842,0.239701,0.018335,0.000000,0.156017,0.000793
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269,2767,699.971191,0.503221,...,0.267870,0.008460,0.037172,0.049176,0.042556,0.287307,0.016902,0.000000,0.164584,0.001254


In [28]:
df

Unnamed: 0,filename,book,author,year,cent,len,len_unique,uniq_words_not_in_pop,mean_freq1k,dost_neu,...,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X
0,Авдеев.Варенька.1852.txt,Варенька,Авдеев,1852,19,152668,6555,469,759.008835,0.439804,...,0.164474,0.006164,0.055597,0.146570,0.048569,0.264046,0.030311,0.000000,0.167576,0.003809
1,Авдеев.Иванов.1852.txt,Иванов,Авдеев,1852,19,170248,7549,494,702.438997,0.459784,...,0.179738,0.007481,0.047394,0.118560,0.041129,0.246628,0.028709,0.000000,0.164996,0.000663
2,Авдеев.Тетрадь_из_записок_Тамарина.1852.txt,Тетрадь_из_записок_Тамарина,Авдеев,1852,19,154196,6731,491,744.420935,0.415810,...,0.168965,0.006130,0.047310,0.149082,0.033086,0.250108,0.031514,0.000039,0.166451,0.000825
3,Авенариус.Поветрие.1867.txt,Поветрие,Авенариус,1867,19,256336,12694,705,747.801066,0.489591,...,0.211074,0.007128,0.058478,0.110938,0.035516,0.311703,0.022050,0.000000,0.157011,0.009298
4,Авенариус.Современная_идиллия.1867.txt,Современная_идиллия,Авенариус,1867,19,305536,13909,923,748.249222,0.535464,...,0.196245,0.007834,0.060576,0.103942,0.036025,0.319947,0.027060,0.000000,0.157280,0.019945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,Ясинский.Верочка.1887.txt,Верочка,Ясинский,1887,19,119630,6392,348,749.065562,0.445304,...,0.197364,0.006226,0.040210,0.136920,0.037823,0.283958,0.025112,0.000000,0.178842,0.001608
751,Ясинский.Учитель.1888.txt,Учитель,Ясинский,1888,19,121856,6295,375,795.539469,0.487984,...,0.198243,0.005270,0.042263,0.131646,0.057556,0.263136,0.026608,0.000000,0.179695,0.000258
752,Яхина.Дети_мои.2018.txt,Дети_мои,Яхина,2018,21,780736,31275,2996,737.782698,0.447696,...,0.275472,0.007331,0.039981,0.041392,0.037842,0.239701,0.018335,0.000000,0.156017,0.000793
753,Яхина.Зулейха_открывает_глаза.2015.txt,Зулейха_открывает_глаза,Яхина,2015,21,712961,27269,2767,699.971191,0.503221,...,0.267870,0.008460,0.037172,0.049176,0.042556,0.287307,0.016902,0.000000,0.164584,0.001254


Сохраним результаты

In [29]:
df.to_excel('RuLitStat.xlsx', index=False)