In [None]:
import pandas as pd
import nltk
import numpy as np
import pymorphy2 as pymorphy2
import seaborn as sns
import inspect
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(style="darkgrid")

import re

nltk.download('punkt')

In [None]:
df = pd.read_excel("posts.xlsx")
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df['comment'] = df['comment'].fillna(0)

In [None]:
import re

def standardize_text(text):
    """Normalize a raw post into lowercase text made of word characters and single spaces.

    Steps, in order: lowercase; drop @mentions; drop URLs; turn underscores
    into spaces; drop dollar amounts such as ``$12.50``; drop all digits;
    drop every character that is neither a word character nor whitespace;
    turn newlines and non-breaking spaces into spaces; collapse runs of spaces.

    URLs are removed *before* underscores are replaced: otherwise a link such
    as ``http://site/foo_bar`` would be cut at the underscore and its tail
    (``bar``) would leak into the cleaned text.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a NaN cell) is
        treated as an empty document.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are not stripped.
    """
    if not isinstance(text, str):
        # Missing cells arrive as NaN/None and have no .lower(); treat them as empty.
        return ''
    text = text.lower()
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'http[^\s]+', '', text)
    text = re.sub(r'_', ' ', text)
    text = re.sub(r'\$\d+\.\d{2}', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)  # original author's note: this rule may be a poor choice
    text = re.sub(r'\n', ' ', text)
    text = re.sub('\xa0', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text



In [None]:
df['text'][8]


In [None]:
df['text'] = df['text'].apply(standardize_text)

In [None]:
df['text'][8]

# Tokenization with NLTK

In [None]:
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

from nltk.corpus import stopwords

In [None]:
df['token'] = df['text'].apply(word_tokenize)

In [None]:
# Corpus size, vocabulary and per-post token counts before stop-word removal.
text_lengths = df["token"].map(len).tolist()
all_words = [token for post_tokens in df["token"] for token in post_tokens]
VOCAB = sorted(set(all_words))

print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(text_lengths))

In [None]:
fig = plt.figure(figsize=(8, 8))
plt.xlabel('Number of words')
plt.ylabel('Count')
plt.hist(text_lengths, bins=20)
plt.show()

### Remove stop words

In [None]:
def remove_stopwords(text, vocab_set):
    """Return the tokens of ``text`` that are not stop words, in their original order.

    Parameters
    ----------
    text : iterable
        Tokens of one document.
    vocab_set : iterable
        Stop words to drop. Any iterable of hashable items is accepted
        (list, set, generator, ...). It is converted to a ``frozenset`` once
        per call, so each membership test is a hash lookup rather than a
        linear scan, and one-shot iterators are not consumed by the first check.

    Returns
    -------
    list
        The tokens that are not in ``vocab_set``.
    """
    if not isinstance(vocab_set, (set, frozenset)):
        vocab_set = frozenset(vocab_set)
    return [w for w in text if w not in vocab_set]

In [None]:
%%time
df['token'] = df['token'].apply(lambda x: remove_stopwords(x, stopwords.words('russian')))

In [None]:
df['token']

### Let's see how many words are left after removing stop words

In [None]:
# Same corpus statistics as before, recomputed on the stop-word-filtered tokens.
text_lengths_tr = df["token"].map(len).tolist()
all_words_tr = [token for post_tokens in df["token"] for token in post_tokens]
vocab_tr = sorted(set(all_words_tr))

print("%s words total, with a vocabulary size of %s" % (len(all_words_tr), len(vocab_tr)))
print("Max sentence length is %s" % max(text_lengths_tr))

In [None]:
fig = plt.figure(figsize=(8, 8))
plt.xlabel('Number of words')
plt.ylabel('Count')
plt.hist(text_lengths_tr, bins=20)
plt.show()

In [None]:
print("Word's dropped in total:", len(all_words)-len(all_words_tr))
print("Word's dropped from vocabulary:", len(VOCAB)-len(vocab_tr))

### Lemmatization

In [None]:
import pymorphy2
from pymorphy2 import MorphAnalyzer

In [None]:
morph = MorphAnalyzer()

def lemmatize(text):
    """Map every token of ``text`` to its lemma.

    Each token is passed to the module-level ``morph`` analyzer and the first
    entry returned by ``normal_forms`` is used as the lemma.

    Parameters
    ----------
    text : iterable
        Tokens of one document (presumably strings, as produced by the
        tokenization step).

    Returns
    -------
    list
        One lemma per input token, in the same order.
    """
    return [morph.normal_forms(token)[0] for token in text]

In [None]:
series_lem = df['token'].apply(lemmatize)

#### Check word frequency

In [None]:
from nltk.probability import FreqDist

In [None]:
corpus = [word for i in series_lem for word in i]
corpus[10:20]

In [None]:
most_common = FreqDist(corpus).most_common(20)
words, frequency = [], []
for word, count in most_common:
    words.append(word)
    frequency.append(count)

sns.barplot(x = frequency, y = words)

In [None]:
FreqDist(corpus).most_common(50)

We can see that words like "**который**" and "**это**" are very frequent but will have no impact on the model, so we should drop them.

In [None]:
stop_words = ["это", "который", "свой", 'такой', 'также']
series_lem = series_lem.apply(lambda x: remove_stopwords(x, stop_words))

In [None]:
corpus = [word for i in series_lem for word in i]
FreqDist(corpus).most_common(100)

In [None]:
from nltk import ngrams
n=2
ngrams_2_series = series_lem.apply(lambda x: list(map(' '.join, ngrams(x, n=n))))
FreqDist([word for i in ngrams_2_series for word in i]).most_common(100)

In [None]:
n=3
ngrams_3_series = series_lem.apply(lambda x: list(map(' '.join, ngrams(x, n=n))))
FreqDist([word for i in ngrams_3_series for word in i]).most_common(100)

In [None]:
n=4
ngrams_4_series = series_lem.apply(lambda x: list(map(' '.join, ngrams(x, n=n))))
FreqDist([word for i in ngrams_4_series for word in i]).most_common(100)

In [None]:
series_lem

In [None]:
series_lem_sentences = [" ".join(i) for i in series_lem]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000,
                                 min_df=0.01,
                                 use_idf=True, ngram_range=(1,3))
X = vectorizer.fit_transform(series_lem_sentences)
#vectorizer.get_feature_names_out()

In [None]:
from sklearn.cluster import DBSCAN, k_means
from sklearn.preprocessing import StandardScaler


# Density-based clustering of the TF-IDF matrix; `labels` holds the fitted cluster labels.
db = DBSCAN(eps=1.18, min_samples=5)
db.fit(X)
labels = db.labels_

# Label -1 marks noise, so it is left out of the cluster count and tallied separately.
n_clusters_ = len(set(labels) - {-1})
n_noise_ = sum(1 for label in labels if label == -1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

In [None]:
df_clust = df.copy()

In [None]:
df_clust['cluster'] = labels

In [None]:
df_clust[df_clust['cluster'] == 1].head()

In [None]:
df_clust['post_link'] = "https://www.tinkoff.ru/invest/social/profile/" + df_clust['publisher'] + "/" + df_clust['id']

In [None]:
df_clust[df_clust['cluster'] == 6].head(10)

In [None]:
# Export the post links of every cluster (noise, label -1, included) to its own workbook.
import os

# to_excel does not create missing directories, so make sure the target exists.
os.makedirs('./Clusters', exist_ok=True)

# Iterate over the labels that actually occur instead of a hard-coded range(-1, 8),
# which silently skipped clusters above 7 and wrote empty files for missing ones.
for i in sorted(df_clust['cluster'].unique()):
    cluster_links = df_clust.loc[df_clust['cluster'] == i, 'post_link']
    # Legacy .xls writing (xlwt) was removed in pandas 2.0; write .xlsx instead.
    cluster_links.to_excel(f'./Clusters/cluster_{i}.xlsx')