In [None]:
import nltk
import pandas as pd
import numpy as np
#nltk.download('stopwords')

from nltk.corpus import stopwords

# Setting Up

#### Importing data

In [None]:
#Load the tab-separated blog export and drop every row with a missing value in
#any column, so that incomplete blogs cannot make the models crash
full_df = (
    pd.read_csv('../../data/final/futurice_blog_data.csv', sep='\t', index_col='index')
    .dropna(axis=0, how='any')
)

#Pull the columns the models need out as plain arrays
texts, titles, categories = (full_df[column].values for column in ('text', 'title', 'category'))

#### Preprocessing

In [None]:
from nltk import SnowballStemmer, WordNetLemmatizer
import gensim

In [None]:
#Guting's preprocessing functions
stemmer = SnowballStemmer("english")
#Built once and shared: constructing a WordNetLemmatizer for every single token is wasted work
lemmatizer = WordNetLemmatizer()

def lemmatize_stem(text):
    """Lemmatize one token as a verb, then stem the resulting lemma.

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    str
        The output of ``stemmer.stem`` applied to the verb lemma of ``text``.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return the normalised form of every kept token.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    if it is longer than 3 characters and is not a gensim stopword; each kept
    token is passed through ``lemmatize_stem``.

    Returns a list of strings in the order the tokens appear in ``text``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stem(word)
        for word in gensim.utils.simple_preprocess(text, min_len=3)
        if len(word) > 3 and word not in stop_words
    ]

#Preprocessing texts: each document becomes one space-separated string of its kept tokens
lem_texts = [' '.join(preprocess(document)) for document in texts]

#Quick look at the first preprocessed document
print(lem_texts[0])

#### Getting Features with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#TF-IDF weights of the preprocessed texts; this matrix is what the models below use as features
tf_idf_vectorizer = TfidfVectorizer(
    norm='l2',
    use_idf=True,
)

tf_idf_matrix = tf_idf_vectorizer.fit_transform(lem_texts)

#### Plotting Points

In [None]:
#One fixed colour per blog category, so that every graph uses the same colours
color_table = {
    "Opinion":             "#000000",  #Black
    "Technology":          "#0009FF",  #Blue
    "Innovation & Design": "#27E4DD",  #Cyan
    "Ways of Working":     "#CCCC00",  #Yellow
    "Culture":             "#FF007F",  #Dark pink
    "Events":              "#FD69F3",  #Pink
    "Emerging Tech":       "#FF7700",  #Orange
    "Strategy":            "#FF0000",  #Red
    "News":                "#401E00",  #Brown
    "Learning":            "#063E40",  #Dark teal
    "Projects":            "#193300",  #Dark green
    "Product":             "#808080",  #Grey
}

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
#Dense copy of the TF-IDF features as a plain ndarray. todense() would return an
#np.matrix instead, which recent scikit-learn releases refuse as estimator input.
dense_matrix = tf_idf_matrix.toarray()
#Colour of each document's real category
color_real = [color_table[x] for x in categories]

#Fixed seed: t-SNE is stochastic and every later scatter plot reuses Y, so the
#layout has to be the same on each run for the plots to stay comparable.
embeddings = TSNE(n_components=2, random_state=42)
Y = embeddings.fit_transform(dense_matrix)

plt.scatter(Y[:, 0], Y[:, 1], c=color_real)

Culture seems to be relatively well clustered. If a classifier shows some logic there, then there is definitely some potential in clustering.

# Models

#### Checking whether vectors are classifiable with KNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix
import matplotlib

In [None]:
#Integer code for every category name (its position in this list), for easier plotting
tag_to_num = {
    name: number
    for number, name in enumerate([
        "Opinion",
        "Technology",
        "Innovation & Design",
        "Ways of Working",
        "Culture",
        "Events",
        "Emerging Tech",
        "Strategy",
        "News",
        "Learning",
        "Projects",
        "Product",
    ])
}

#Real category of every document as its integer code
tags = [tag_to_num[category] for category in categories]


#3-nearest-neighbour classifier using cosine distance
k_neigh = KNeighborsClassifier(metric='cosine', n_neighbors=3)

#Fit on the first 500 documents only (the training size was picked arbitrarily)
k_neigh.fit(tf_idf_matrix[:500], tags[:500])

In [None]:
#Confusion matrix on the remaining texts only: rows [:500] were used to fit
#k_neigh, so scoring them too would make the classifier look better than it is.
#labels still spans every category code so the axes cover all categories.
#NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
#ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
plot_confusion_matrix(k_neigh, tf_idf_matrix[500:], tags[500:], labels=np.unique(tags))

In [None]:
#Predicted category code for every document, drawn on the t-SNE layout
pred_lab = k_neigh.predict(tf_idf_matrix)

plt.scatter(*Y.T, c=pred_lab)

In [None]:
#Error rate of the classifier on the held-out documents only. Rows [:500] were
#used to fit k_neigh, so counting them as well would understate the real error.
held_out_pred = np.asarray(pred_lab[500:])
held_out_true = np.asarray(tags[500:])

#Number of held-out documents whose predicted code differs from the real one
err = int(np.count_nonzero(held_out_pred != held_out_true))
if len(held_out_true) > 0:
    print(err/len(held_out_true))
else:
    #Fewer than 501 documents: nothing is left to evaluate on
    print("No held-out documents left to evaluate")

Further testing in this same manner showed an error rate of only 4 percent for Culture. This probably means that the texts can indeed be classified, but the current tags do not properly represent the groups.

## Clustering

#### Testing 3 clustering models

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#9 comes from Guting's topic modeling.
#random_state pins the centroid initialisation so that the clustering (and
#everything derived from kmeans_l) is the same on every run; n_init=10 fixes the
#number of restarts instead of leaving it to the scikit-learn default.
kmeans_model = KMeans(n_clusters=9, n_init=10, random_state=42)

kmeans_model.fit(tf_idf_matrix)
kmeans_l = kmeans_model.labels_

plt.scatter(Y[:, 0], Y[:, 1], c=kmeans_l)

In [None]:
#Score close to 0, so neither good nor bad. We can at least see some clusters occurring
print(silhouette_score(X=tf_idf_matrix, labels=kmeans_model.labels_))

In [None]:
#Grid Search for DBSCAN
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

best_result = -2       #Sentinel below any silhouette score (scores lie in [-1, 1])
best_params = (0, 1)   #(eps, min_samples) of the best run so far

#Cosine distance never exceeds 2, so any larger eps gives every document the
#same label and can never win; eps therefore only needs to cover 0.1 .. 2.0.
for n in range(1, 21):
    eps = n/10
    for m in range(2, 10):
        temp_mod = DBSCAN(eps=eps, min_samples=m, metric='cosine').fit(tf_idf_matrix)
        if len(np.unique(temp_mod.labels_)) == 1:
            #silhouette_score is undefined for a single label, so rank this run last
            sil = -2
        else:
            #Score with the same distance that DBSCAN clustered with
            sil = silhouette_score(tf_idf_matrix, temp_mod.labels_, metric='cosine')
        if sil > best_result:
            best_result = sil
            #Store the real eps (not the loop counter) so the printout can be used directly
            best_params = (eps, m)


print(best_params)
print(best_result)

In [None]:
#DBSCAN refit with hard-coded eps=0.9 / min_samples=7 (presumably the grid-search
#winner - confirm), then the t-SNE layout coloured by its labels
best_db = DBSCAN(metric='cosine', min_samples=7, eps=0.9)
best_db.fit(tf_idf_matrix)

plt.scatter(*Y.T, c=best_db.labels_)


Ok let's avoid DBSCAN

In [None]:
#Agglo cluster
from sklearn.cluster import AgglomerativeClustering

#Euclidean is AgglomerativeClustering's default distance, so it is not passed
#explicitly: the `affinity` keyword was renamed to `metric` and later removed,
#which makes spelling it out depend on the scikit-learn version.
#np.asarray guards against dense_matrix being an np.matrix, which newer
#scikit-learn releases reject as input.
agglo_model = AgglomerativeClustering(n_clusters=9).fit(np.asarray(dense_matrix))
plt.scatter(Y[:, 0], Y[:, 1], c=agglo_model.labels_)

Both Agglo and KMeans seem usable, let's see whether they can cluster Culture properly like the classifier does

#### Comparing Models

In [None]:
#Line the KMeans cluster ids up with each post's title and real category
kmeans_comp = pd.DataFrame(
    {
        'title': np.array(titles),
        'real': np.array(categories),
        'pred': np.array(kmeans_l),
    },
    columns=['title', 'real', 'pred'],
)

#Number of posts assigned to each of the 9 clusters
for cluster_id in range(9):
    cluster_size = len(kmeans_comp[kmeans_comp['pred'] == cluster_id])
    print(f"{cluster_id}: {cluster_size}")

In [None]:
#Real-category breakdown of every KMeans cluster, least frequent category first
for cluster_id in range(9):
    real_labels = kmeans_comp.loc[kmeans_comp['pred'] == cluster_id, 'real'].values
    tally = {}
    for label in real_labels:
        tally[label] = tally.get(label, 0) + 1
    #Stable sort by count, so ties stay in order of first appearance
    ordered = dict(sorted(tally.items(), key=lambda pair: pair[1]))
    print(f"{cluster_id}: {ordered}")

In [None]:
#Same comparison table as for KMeans, built from the agglomerative labels
agglo_comp = pd.DataFrame(
    {
        'title': np.array(titles),
        'real': np.array(categories),
        'pred': np.array(agglo_model.labels_),
    },
    columns=['title', 'real', 'pred'],
)

#Number of posts in each of the 9 agglomerative clusters
for label in range(9):
    members = agglo_comp[agglo_comp['pred'] == label]
    print("{}: {}".format(label, members.shape[0]))

In [None]:
#For each agglomerative cluster, count the real categories of its posts and
#print them from least to most frequent
for i in range(9):
    in_cluster = agglo_comp['pred'] == i
    counts = {}
    for real_category in agglo_comp[in_cluster]['real'].values:
        counts[real_category] = counts.get(real_category, 0) + 1
    by_count = sorted(counts.items(), key=lambda item: item[1])
    print(str(i) + ": " + str(dict(by_count)))

Welp, both of them do well at clustering Culture, and also at clustering Technology. I'm going to run some manual tests that you can ignore; just skip to the TLDR.

TLDR:
KMeans does a great job clustering, and while its clusters are not the same from run to run, some seem to appear every time. Those are:
-FutuStories and similar style docs
-Mobility
-Energy
-Data
-Cloud
-Strategy

Some others that appear but sometimes get merged with others:
-Health
-Robotics
-Design

In our case, I saved the results of one model that seemed to do a good job clustering without overfitting some topics. These results are saved in cluster_temp_save.csv

# Visualising Results

In [None]:
#Bare expression: shows the TF-IDF feature matrix as this cell's output
tf_idf_matrix

In [None]:
#I did my original clustering with blog_text, so gonna make sure both contain the same docs

analytics = pd.read_csv("../../data/final/futurice_blog_data.csv", sep='\t')

save = pd.read_csv("../../data/interim/cluster_temp_save.csv", sep='²')
s_title = save['title'].values
s_pred = save['cluster_pred'].values
#Shorten the long cluster name to the label used by the plots below
s_pred[s_pred == "Company oriented Strategy"] = "Company"

#For every saved title, collect its total pageviews; remember the positions of
#titles that have no row in the analytics data so they can be dropped afterwards
to_remove = []
pg_views = []
for i, title in enumerate(s_title):
    title_views = analytics.loc[analytics['title'] == title, 'pageviews'].values
    if len(title_views) == 0:
        to_remove.append(i)
        continue
    #A title can match several rows; sum() also covers the single-row case
    pg_views.append(title_views.sum())

#Drop all unmatched positions in ONE call. Deleting them one at a time shifts
#the remaining elements after each deletion, so every later index in to_remove
#would point at the wrong row.
drop_positions = np.array(to_remove, dtype=int)
s_title = np.delete(s_title, drop_positions)
s_pred = np.delete(s_pred, drop_positions)

#The three sequences must stay aligned position by position
assert len(s_title) == len(s_pred) == len(pg_views)
print(len(s_title), len(s_pred), len(pg_views))


#### Pageviews per Cluster

In [None]:
#Per-cluster totals: number of posts and summed pageviews
count = {}
views = {}

for idx in range(len(s_title)):
    cluster = s_pred[idx]
    count[cluster] = count.get(cluster, 0) + 1
    views[cluster] = views.get(cluster, 0) + pg_views[idx]

#Mean pageviews per post for each cluster
avg = {cluster: views[cluster]/count[cluster] for cluster in count}

print(avg)

In [None]:
#Bar chart of the mean pageviews of each cluster, on axes filling the whole figure
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(x=avg.keys(), height=avg.values())
ax.set_title('Average pageviews per category')
plt.show()

In [None]:
#Repeating above by combining similar topics.
#The groups are the same ones the big_category mapping uses for the cluster
#plots: 'Strategy' must be counted once and 'Energy' belongs to Design/Strategy.
design_strategy = ['Company', 'Mobility', 'Strategy', 'Design', 'Energy']
tech = ['Data', 'AI', 'Cloud']

#Group average = total pageviews of the group / total number of posts in it
avg_2 = {
    'Futustories': avg['Futustories'],
    'Design/Strategy': sum(views[c] for c in design_strategy)/sum(count[c] for c in design_strategy),
    'Tech': sum(views[c] for c in tech)/sum(count[c] for c in tech),
    'Futurice': avg['Futurice']
}

print(avg_2)

In [None]:
#Bar chart of the mean pageviews of each combined group
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(x=avg_2.keys(), height=avg_2.values())
ax.set_title('Average views per category')
plt.show()

#### Visualizing Clusters

In [None]:
#Plot colour of each fine-grained cluster (despite the name, the values are
#colour hex codes, not numbers)
cluster_to_num = {
    "Futustories": "#000000",
    "Company":     "#0009FF",
    "Data":        "#27E4DD",
    "Mobility":    "#CCCC00",
    "Strategy":    "#FF007F",
    "AI":          "#FD69F3",
    "Futurice":    "#FF7700",
    "Design":      "#401E00",
    "Energy":      "#FF0000",
    "Cloud":       "#063E40",
}

#Plot colour of each combined group
big_to_color = {
    "Tech":            "#000000",
    "Design/Strategy": "#FF0000",
    "Futustories":     "#27E4DD",
    "Futurice":        "#FF7700",
}

#Collapse the fine-grained clusters into broader groups; a cluster that belongs
#to neither group keeps its own name
tech_clusters = ('Data', 'AI', 'Cloud')
design_strategy_clusters = ('Company', 'Mobility', 'Strategy', 'Design', 'Energy')

big_tag = [
    'Tech' if tag in tech_clusters
    else "Design/Strategy" if tag in design_strategy_clusters
    else tag
    for tag in s_pred
]

In [None]:
#One row per document: cluster label, combined group and t-SNE coordinates.
#NOTE(review): the 'tf' column holds the TF-IDF matrix but is not read by the
#plots in this section - confirm it is needed.
mapping = pd.DataFrame(
    {
        'tf': tf_idf_matrix,
        'category': np.array(s_pred),
        'big_category': np.array(big_tag),
        'x': Y[:, 0],
        'y': Y[:, 1],
    },
    columns=['tf', 'category', 'big_category', 'x', 'y'],
)

#One scatter series per cluster so that every cluster gets its own legend entry
fig, ax = plt.subplots()
for tag, color in cluster_to_num.items():
    subset = mapping[mapping['category'] == tag]

    ax.scatter(subset['x'].values, subset['y'].values, c=color, label=tag)

ax.legend()
plt.show()

In [None]:
#Same scatter as above, but coloured by combined group instead of fine-grained cluster
fig, ax = plt.subplots()
for group, group_color in big_to_color.items():
    members = mapping[mapping['big_category'] == group]

    ax.scatter(members['x'].values, members['y'].values, c=group_color, label=group)

ax.legend()
plt.show()