In [1]:
from google.colab import drive

In [2]:
drive.mount("/content/Lit2Vec/")

Mounted at /content/Lit2Vec/


In [5]:
pip install aiofiles gensim requests nltk pymorphy2 

Collecting aiofiles
  Downloading aiofiles-0.7.0-py3-none-any.whl (13 kB)
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.4 MB/s 
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 9.8 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2, aiofiles
Successfully installed aiofiles-0.7.0 dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [6]:
pip install razdel

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0


In [7]:
import asyncio
import aiofiles
import bs4
import threading
import os
import re
import gensim
import requests
import time
import random
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
import pymorphy2
from pymorphy2 import MorphAnalyzer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.decomposition import PCA
import plotly.express as px
import matplotlib.pyplot as plt
from string import punctuation

In [207]:
! mkdir /content/Lit2Vec/MyDrive/Projects/
! mkdir /content/Lit2Vec/MyDrive/Projects/Lit2Vec/

mkdir: cannot create directory ‘/content/Lit2Vec/MyDrive/Projects1/Lit2Vec2/’: No such file or directory


In [8]:
os.chdir("/content/Lit2Vec/MyDrive/Projects/Lit2Vec/")

# Preprocessing

In [9]:
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
from string import punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [27]:
from razdel import tokenize

russian_stopwords = stopwords.words("russian")

In [36]:
morph = pymorphy2.MorphAnalyzer()

def prepare_text(text) -> list:
    tokens = tokenize(text)
    res = list()
    for token in tokens:
        p = morph.parse(token.text)[0]
        word_normal_form = p.normal_form
        if word_normal_form not in russian_stopwords and word_normal_form not in punctuation:
            res.append(word_normal_form)
    return res

In [None]:
!unzip /content/Lit2Vec/MyDrive/Projects/Lit2Vec/Data.zip

In [37]:
def read_file(file_path, tags):
    with open(file_path, encoding="utf-8") as f:
        file_content = f.read() 
    return file_content, tags


def preprocess_file(file_content, tags, tokens_only=False, 
                    delete_set=frozenset()):
    pattern = f"[A-Za-z0-9]|[{punctuation}\n]"
    text = re.sub(pattern, " ", file_content)
    tokens = [word for word in prepare_text(text) if word not in delete_set]
    if tokens_only:
        return tokens
    else:
        # For training data, add tags
        return gensim.models.doc2vec.TaggedDocument(tokens, tags)

In [106]:
from concurrent.futures import ThreadPoolExecutor, wait


data_dir = f"./Data"


def get_processed_data(delete_set=frozenset()):
    tasks = []
    with ThreadPoolExecutor() as executor:
        for centery in os.listdir(data_dir):
            print(centery)
            centery_dir = f"{data_dir}/{centery}"
            for author in os.listdir(centery_dir):
                author_dir = f"{centery_dir}/{author}"

                for text in os.listdir(author_dir):
                    reading_future = \
                        executor.submit(read_file, 
                                        file_path=f"{author_dir}/{text}", 
                                        tags=[author])
                    tasks.append(reading_future)
        
        done, _ = wait(tasks)
        word_dict = {}
        data = []
        target = []

        total = len(tasks)
        prefix_list = ("\\\\", "//")
        last_string_length = 0
        print()
        for index, reading_future in enumerate(done, 1):
            log_string = f"\r{prefix_list[index % 2]}{index / total * 100: .2f}%"
            print("\r" + " " * last_string_length, end="")
            print(log_string, end="")
            last_string_length = len(log_string)

            read, tags = reading_future.result()
            preprocessed = preprocess_file(read, tags, delete_set=delete_set)
            if len(preprocessed.words) > 0:
                for word in preprocessed.words:
                    word_dict[word] = 1 if word_dict.get(word) is None else word_dict[word] + 1
                data.append(preprocessed)
                target = np.append(target, values=tags[0])
    return data, target, word_dict

In [35]:
data, target, word_dict = get_processed_data()

18
19
20

\\ 100.00%

In [107]:
name2class = {author: i for i, author in enumerate(set(target))}
class2name = {value: key for key, value in name2class.items()}

## Data trimming

In [39]:
corpus_volume = len(word_dict)
x = [0]  # "Total"
y = [1]

remaining_volume = corpus_volume
listed_dict = [(word, word_impact) for word, word_impact in word_dict.items()]
listed_dict.sort(key=lambda x: x[1])

for index, (word, word_impact) in enumerate(listed_dict, 1):
    remaining_volume -= word_impact
    x.append(index)
    y.append(remaining_volume / corpus_volume)

y = np.array(y)
delete_words_max_index = np.argmax(y < 0.95)
delete_set = frozenset(x[:delete_words_max_index+1])
print(len(word_dict), len(word_dict) - delete_words_max_index)

101280 96215


In [40]:
cleaned_data = [TaggedDocument(words=[word for word in doc.words if word not in delete_set], tags=doc.tags) for doc in data]

In [54]:
q25, q75 = np.quantile(a=[len(doc.words) for doc in cleaned_data], q=[0.25, 0.75])
lower_bound = q25 / 1.5
upper_bound = q75 * 1.5
print(q25, q75)
print(lower_bound, upper_bound)

37.0 92.0
24.666666666666668 138.0


In [110]:
trimmed_data = []
trimmed_target = []
for i in range(len(cleaned_data)):
    doc = cleaned_data[i]
    if q25 <= len(doc.words) <= q75:
        trimmed_data.append(doc)
        trimmed_target.append(doc.tags[0])

In [111]:
len(trimmed_data) / len(data)

0.5030287264484001

# Train models

In [59]:
from gensim.models.doc2vec import Doc2Vec

In [64]:
model = Doc2Vec(vector_size=50, alpha=0.025, dm=0, epochs=100, 
                negative=7, ns_exponent=0.75, workers=4)
model.build_vocab(trimmed_data)
model.train(trimmed_data, total_examples=model.corpus_count, 
            epochs=model.epochs)

In [None]:
! mkdir ./models/

In [112]:
from collections import Counter

n_lyrics_per_author = [value for value in Counter([i for i in trimmed_target]).values()]
median_n_neighbours = int(np.median(n_lyrics_per_author))
mean_n_neighbours = int(np.mean(n_lyrics_per_author))
q25_n_neighbours = int(np.quantile(n_lyrics_per_author, q=0.25))

print(median_n_neighbours, mean_n_neighbours, q25_n_neighbours)

69 141 34


In [None]:
tsne_training = [model.infer_vector(i.words) for i in trimmed_data]

In [116]:
metric_vis = {"mean": [mean_n_neighbours, None],
              "median": [median_n_neighbours, None],
              "q25": [q25_n_neighbours, None]}

In [117]:
for metric in metric_vis:
    print(metric)
    if metric_vis[metric][1] is None:
        tsne = TSNE(2, n_jobs=-1, perplexity=metric_vis[metric][0])
        metric_vis[metric][1] = tsne.fit_transform(tsne_training, 
                                                   y=[name2class[i] for i in trimmed_target])

mean
median
q25


In [118]:
import joblib

# Load data

In [None]:
model = Doc2Vec.load("./models/model.model")
visualization = joblib.load("./models/visualization.pkl")
target = joblib.load("./models/target.pkl")
name2class = joblib.load("./models/name2class.pkl")
class2name = joblib.load("./models/class2name.pkl")
# color_dict = joblib.load(f"{models_path}/color_dict.pkl")

# Save data

In [None]:
model.save("./models/model_50.model")
joblib.dump(visualization, f"./models/visualization_50.pkl")
joblib.dump(target, f"./models/target.pkl")
joblib.dump(name2class, f"./models/name2class.pkl")
joblib.dump(class2name, f"./models/class2name.pkl")
# color_dict = joblib.load(f"{models_path}/color_dict.pkl")

['./models/class2name.pkl']

# Visualization

In [None]:
pip install requests




In [119]:
eighteen_cent_authors = os.listdir("./Data/18/")
nineteen_cent_authors = os.listdir("./Data/19/")
twenty_cent_authors = os.listdir("./Data/20/")

In [91]:
pip install plotly



In [121]:
import plotly.express as px

In [124]:
num_target = np.array([name2class[i] for i in trimmed_target])

In [172]:
color_list = []

In [173]:
popular_authors = [key for key in nineteen_cent_authors]# sorted(c.items(), key=lambda x: x[1])[0:30]
popular_authors_ind = [name2class[i] for i in popular_authors if name2class.get(i) is not None]
x_axis_19 = np.array([])
y_axis_19 = np.array([])
labels_19 = []
for i in popular_authors_ind:
    x_axis_author = metric_vis['median'][1][:, 0][num_target == i]
    y_axis_author = metric_vis['median'][1][:, 1][num_target == i] 
    labels_author = [class2name[i]] * len(x_axis_author)
    
    x_axis_19 = np.append(x_axis_19, x_axis_author)
    y_axis_19 = np.append(y_axis_19, y_axis_author)
    labels_19 += labels_author
assert len(x_axis_19) == len(y_axis_19) == len(labels_19)

In [174]:
color_list += [0] * len(x_axis_19)

In [175]:
fig = px.scatter(x=x_axis_19, y=y_axis_19, color=labels_19, width=950, height=650, title="19 век")
fig.show()

In [176]:
popular_authors = [key for key in eighteen_cent_authors]# sorted(c.items(), key=lambda x: x[1])[0:30]
popular_authors_ind = [name2class[i] for i in popular_authors]
x_axis_18 = np.array([])
y_axis_18 = np.array([])
labels_18 = []
for i in popular_authors_ind:
    x_axis_author = metric_vis['median'][1][:, 0][num_target == i]
    y_axis_author = metric_vis['median'][1][:, 1][num_target == i] 
    labels_author = [class2name[i]] * len(x_axis_author)
    
    x_axis_18 = np.append(x_axis_18, x_axis_author)
    y_axis_18 = np.append(y_axis_18, y_axis_author)
    labels_18 += labels_author
assert len(x_axis_18) == len(y_axis_18)

In [177]:
color_list += [1] * len(x_axis_18)

In [178]:
fig = px.scatter(x=x_axis_18, y=y_axis_18, color=labels_18, width=950, height=650, title="18 век")
fig.show()

In [179]:
popular_authors = [key for key in twenty_cent_authors]# sorted(c.items(), key=lambda x: x[1])[0:30] | silver_cent_dict.keys()
popular_authors_ind = [name2class[i] for i in popular_authors]
x_axis_20 = np.array([])
y_axis_20 = np.array([])
labels_20 = []

for i in popular_authors_ind:
    x_axis_author = metric_vis['median'][1][:, 0][num_target == i]
    y_axis_author = metric_vis['median'][1][:, 1][num_target == i] 
    labels_author = [class2name[i]] * len(x_axis_author)
    x_axis_20 = np.append(x_axis_20, x_axis_author)
    y_axis_20 = np.append(y_axis_20, y_axis_author)
    labels_20 += labels_author

c = [0] * len(y_axis_20)
assert len(x_axis_20) == len(y_axis_20) == len(labels_20)

In [180]:
color_list += [2] * len(x_axis_20)

In [181]:
fig = px.scatter(x=x_axis_20, y=y_axis_20, color=labels_20, 
                 width=950, height=650, title="20 век")
fig.show()

In [182]:
x_all = np.append(np.append(x_axis_19, x_axis_18), x_axis_20)
y_all = np.append(np.append(y_axis_19, y_axis_18), y_axis_20)
labels_all = labels_19 + labels_18 + labels_20

In [183]:
color_dict = {0: "19 век", 1: "18 век", 2: "20 век"}

In [184]:
color_list = np.asarray(color_list)

In [185]:
fig = px.scatter(x=x_all, 
                 y=y_all, 
                 color=[color_dict[i] for i in color_list], 
                 symbol=labels_all, 
                 width=950, height=650, title="Карта по векам")
fig.show()

In [186]:
fig = px.scatter(x=x_all[::-1], 
                 y=y_all[::-1], 
                 color=labels_all[::-1], 
                 width=950, height=650, title="Все авторы", symbol=[color_dict[i] for i in color_list])
fig.show()

In [187]:
import plotly
import plotly.offline as offline
import plotly.graph_objects as go

In [188]:
color_dict = {0: '18 век', 1: '19 век', 2: '20 век'}


def define_cent(name):
    if name in eighteen_cent_authors:
        return 0 
    elif name in nineteen_cent_authors:
        return 1
    else:
        return 2

In [199]:
cords_per_author = []
cluster_target = []
cluster_size = []
clusters = []
cluster_plot = []
all_plot = []

for target_i in np.unique(num_target):
    current_author = metric_vis['median'][1][num_target==target_i]
    authors_name = class2name[target_i]
    all_plot.append(go.Scattergl(visible=False, 
                    x=current_author[:, 0], y=current_author[:, 1], 
                    mode="markers", marker=dict(color=target_i), 
#                     legendgroup=color_dict[define_cent(authors_name)],
                    name=f"{color_dict[define_cent(authors_name)]}, {authors_name}",
                    text=f"{color_dict[define_cent(authors_name)]}, {authors_name}", hoverinfo="text"))
    
    cluster_cords = current_author.mean(axis=0)
    cluster_size.append(len(current_author))
    cluster_target.append(class2name[target_i])
    cords_per_author.append(cluster_cords)
    
    cluster_plot.append(go.Scatter(visible=False, x=[cluster_cords[0]], y=[cluster_cords[1]], mode="markers",
                                   marker=dict(color=target_i, size=np.sqrt(len(current_author)*5)), 
#                                    legendgroup=f"{color_dict[define_cent(authors_name)]}",
                                   name=f"{color_dict[define_cent(authors_name)]}, {authors_name}",
                                   text=f"{color_dict[define_cent(authors_name)]}, {authors_name}", 
                                   hoverinfo="text"))
    
all_plot.sort(key=lambda x: x["name"])
cluster_size = np.asarray(cluster_size) 
cords_per_author = np.asarray(cords_per_author)

In [200]:
h = [[size, cords, t, define_cent(t)] for size, cords, t in \
     zip(cluster_size, cords_per_author, cluster_target)]
h = np.asarray(sorted(h, key=lambda x: x[-1]))[:, :-1]
cluster_size = h[:, 0] 
cords_per_author = h[:, 1]  
cluster_target = h[:, 2]  


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray



In [201]:
cluster_plot = sorted(cluster_plot, key=lambda x:x["name"])

In [202]:
fill_color = {0: "red", 1: "green", 2: "blue"}

In [205]:
fig = go.Figure()

century_plot = []

for cluster, t, size in zip(cords_per_author, cluster_target, cluster_size):
    cluster_x, cluster_y = cluster
    century_plot.append(go.Scatter(visible=False, x=[cluster_x], y=[cluster_y], mode="markers",
                                   text=f"{color_dict[define_cent(t)]}, {t}", 
                                   name=f"{color_dict[define_cent(t)]}, {t}",
                                   marker_color=fill_color[define_cent(t)],
                                   marker_size=np.sqrt(size*5), 
                                   hoverinfo="text"))


# ===========================================================
data = cluster_plot + all_plot + century_plot
updatemenus = list([
    dict(active=-1,
        buttons=list([
            dict(label = 'Карта произведений',
                 method = 'update',
                 args = [{'visible': [False for _ in range(len(cluster_plot))] + \
                                     [True for _ in range(len(all_plot))] + \
                                     [False] * len(century_plot)},
                         {'title': 'Lit2Vec: карта произведений'}]),
            
            dict(label = 'Кластеры авторов по векам',
                 method = 'update',
                 args = [{'visible': [False for _ in range(len(cluster_plot))] + \
                                     [False for _ in range(len(all_plot))] + \
                                     [True] * len(century_plot)},
                         {"title": {"text": "Lit2Vec: кластеры авторов по векам",
                                    "font": {"size": 30}}},
                         {"legend": {"title": {"text": "Авторы",
                                               "font": {"size": 18}}}}
                  ])
        ]),
    )
])

layout = dict(title={"text": "Lit2Vec",
                     "font": {"size": 30}}, 
              showlegend=True,
              updatemenus=updatemenus)

fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig)
plotly.offline.plot(fig, auto_open=True, show_link=False, 
                    filename=f"{os.getcwd()}/median.html")

'/content/Lit2Vec/My Drive/Projects/Lit2Vec/median.html'