***

In [None]:
import pandas as pd
# Load the tweet dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('russian_tweets.csv')

In [None]:
# Rename the timestamp column, then keep only rows with no missing value in any column.
df = (
    df.rename(columns={'date': 'datetime'})
      .dropna(axis=0, how='any')
)

In [None]:
# Group the rows by their 'user_name' value.
groups = df.groupby('user_name')


In [None]:
# Number of rows in each group (here: rows per 'user_name').
groups_size = groups.size()

In [None]:
import matplotlib.pyplot as plt
# Keep the 20 largest groups, biggest first, and draw them as a bar chart.
result = groups_size.sort_values(ascending=False).iloc[:20]
plt.bar(result.index, result.values, width=0.4, color='red')
plt.title("Number of tweets per user")
plt.ylabel("Number of tweets")
plt.xlabel("Users")
plt.show()

In [None]:
# Bucket tweets by calendar day. The timestamp column is named 'datetime' after the
# earlier rename (grouping on 'date' raises KeyError), and Grouper(freq=...) needs a
# datetime-like key, so the string column read from the CSV is parsed first.
groups = (
    df.assign(datetime=pd.to_datetime(df['datetime']))
      .groupby(pd.Grouper(key='datetime', freq='D'))
)

In [None]:
# Number of rows in each daily bucket; this rebinds groups_size, replacing the per-user counts.
groups_size = groups.size()

In [None]:
# Daily tweet counts in chronological order, drawn as a line chart.
result = groups_size.sort_index(ascending=True)
result.plot()
plt.xlabel("Date")
plt.ylabel("Number of tweets")
plt.title("Number of tweets per day")
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords

# Lowercase before filtering: the NLTK English stop-word list is lowercase, so
# capitalised stop words ("The", "And") would otherwise slip through. The raw
# 'text' column is left untouched.
lowered_text = df['text'].str.lower()
stop = stopwords.words('english')
stop_lookup = set(stop)  # set membership is O(1); the list would be scanned for every word
df['tweet_without_stopwords'] = lowered_text.apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)
# Feed the cleaned tweets to the vectorizers so the stop-word removal actually takes effect.
texts = df['tweet_without_stopwords'].to_list()

In [None]:
# Bag-of-words (raw term count) vectorizer with scikit-learn's default settings.
document_term_vectorizer = CountVectorizer()


In [None]:
# Learn the vocabulary from the tweets and build the document-term count matrix.
document_term = document_term_vectorizer.fit_transform(texts)

In [None]:
# Matrix dimensions: (number of documents, vocabulary size).
print(document_term.shape)

In [None]:
# TF-IDF vectorizer with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary and IDF weights from the tweets and build the TF-IDF matrix.
tfidf = tfidf_vectorizer.fit_transform(texts)

In [None]:
from sklearn.decomposition import TruncatedSVD
# Fit a 100-component truncated SVD on the count matrix. The default solver is
# randomized, so the seed is pinned to make the components (and every later cell
# that depends on them) reproducible across runs.
document_term_svd = TruncatedSVD(n_components=100, random_state=42).fit(document_term)


In [None]:
# Project the count matrix onto the SVD components fitted in the previous cell.
reduced_document_term = document_term_svd.transform(document_term)

In [None]:
# Fit a 100-component truncated SVD on the TF-IDF matrix. The default solver is
# randomized, so the seed is pinned to keep the result reproducible across runs.
tfidf_svd = TruncatedSVD(n_components=100, random_state=42).fit(tfidf)


In [None]:
# Project the TF-IDF matrix onto the SVD components fitted in the previous cell.
reduced_tfidf = tfidf_svd.transform(tfidf)

In [None]:
from sklearn.preprocessing import Normalizer
# Row-wise normalizer (unit L2 norm by default) for the reduced count vectors.
# Normalizer is stateless, so fit() learns nothing from the data and just returns the estimator.
document_term_normalizer = Normalizer().fit(reduced_document_term)


In [None]:
# Rescale every reduced count vector to unit length.
normalized_document_term = document_term_normalizer.transform(reduced_document_term)

In [None]:
# Row-wise normalizer (unit L2 norm by default) for the reduced TF-IDF vectors.
# Normalizer is stateless, so fit() learns nothing from the data and just returns the estimator.
tfidf_normalizer = Normalizer().fit(reduced_tfidf)

In [None]:
# Rescale every reduced TF-IDF vector to unit length.
normalized_tfidf = tfidf_normalizer.transform(reduced_tfidf)

In [None]:
from sklearn.cluster import KMeans
# Initial two-cluster fit on the normalized count vectors, seeded so reruns give the
# same clustering. NOTE(review): this name is rebound by the 16-cluster fit in a later
# cell before anything reads it, so this fit currently has no effect on the results.
kmeans_normalized_document_term = KMeans(n_clusters=2, random_state=42).fit(normalized_document_term)

In [None]:

# Elbow method: fit K-Means for k = 1..15 and record each model's inertia (within-cluster
# sum of squared distances). The seed is pinned so the curve is identical on every run;
# unseeded fits start from different centroids and shift the SSE values.
sse = []
for k in range(1, 16):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(normalized_document_term)
    sse.append(kmeans.inertia_)
    

In [None]:
# Start from a clean figure and axes.
plt.clf()
plt.cla()

# sse[i] holds the inertia for k = i + 1 (the sweep starts at k = 1), so pass the k
# values explicitly; plotting sse alone would label the x-axis 0..14, one less than
# the real cluster counts.
cluster_counts = range(1, len(sse) + 1)
plt.plot(cluster_counts, sse)
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.title("SSE vs. Number of clusters")
plt.show()

In [None]:
# Final clustering of the normalized count vectors, seeded so the clusters (and the top
# terms printed for them) are reproducible across runs.
# NOTE(review): the elbow sweep only evaluated k = 1..15, so 16 lies outside the range
# that was inspected; confirm this choice.
kmeans_normalized_document_term = KMeans(n_clusters=16, random_state=42).fit(normalized_document_term)

In [None]:
# KMeans.transform maps each row to its distances from the 16 cluster centres
# (cluster-distance space); it does not return cluster labels, which are
# available via .labels_ or .predict().
clustered_document_term = kmeans_normalized_document_term.transform(normalized_document_term)

In [None]:
# Cluster the normalized TF-IDF vectors into 16 groups, seeded so the clusters (and the
# top terms printed for them) are reproducible across runs.
kmeans_normalized_tfidf = KMeans(n_clusters=16, random_state=42).fit(normalized_tfidf)

In [None]:
# KMeans.transform maps each row to its distances from the 16 cluster centres
# (cluster-distance space); it does not return cluster labels, which are
# available via .labels_ or .predict().
clustered_tfidf = kmeans_normalized_tfidf.transform(normalized_tfidf)

In [None]:
def get_top_terms_per_cluster(vectorizer, svd, kmeans, n_terms=20):
    """Print the highest-weighted vocabulary terms for every K-Means cluster.

    The cluster centres live in the reduced SVD space, so they are first mapped
    back to the vectorizer's term space; the terms with the largest weights in
    each mapped centroid are then printed, strongest first.

    Args:
        vectorizer: Fitted text vectorizer exposing ``get_feature_names_out()``
            (or the legacy ``get_feature_names()``).
        svd: Fitted reducer exposing ``inverse_transform``.
        kmeans: Fitted clustering model exposing ``cluster_centers_``.
        n_terms: Number of terms to print per cluster.

    Returns:
        None. The report is written to standard output.
    """
    original_space_centroids = svd.inverse_transform(kmeans.cluster_centers_)
    # argsort is ascending, so the reversed slice picks the last n_terms column
    # indices of each row, i.e. the largest weights in descending order.
    cluster_word_indices = original_space_centroids.argsort()[:, :-(n_terms + 1):-1]

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back only for older installations that lack it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    print('Top terms per cluster:')
    for cluster_num, word_indices in enumerate(cluster_word_indices):
        print(f'Cluster {cluster_num}:')
        for i in word_indices:
            print(f'\t{terms[i]}')
        print()

In [None]:
# Top 20 terms (the function's default) per cluster for the count-based pipeline.
get_top_terms_per_cluster(document_term_vectorizer, document_term_svd, kmeans_normalized_document_term)

In [None]:
# Top 20 terms (the function's default) per cluster for the TF-IDF pipeline.
get_top_terms_per_cluster(tfidf_vectorizer, tfidf_svd, kmeans_normalized_tfidf)