In [1]:
import re
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import pairwise_distances
from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


nltk.download('punkt')

In [2]:
### Import the data

df = pd.read_csv('../../data/final/futurice_blog_data.csv', delimiter='\t')
print(df.info())

### Drop the rows that have NaN title:
df.dropna(subset=['title'], inplace=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   url               814 non-null    object 
 1   title             814 non-null    object 
 2   time              786 non-null    object 
 3   category          782 non-null    object 
 4   text              805 non-null    object 
 5   pageviews         814 non-null    int64  
 6   unique_pageviews  814 non-null    int64  
 7   avg_time          814 non-null    float64
 8   bounce_rate       814 non-null    float64
 9   exit%             814 non-null    float64
dtypes: float64(3), int64(2), object(5)
memory usage: 63.7+ KB
None


In [3]:
data = np.array(df['title'])

In [4]:
### Use for tokenize in the tf-idf. Taken from http://brandonrose.org/clustering#Visualizing-document-clusters

# Stemmer from nltk snowball stemmer
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [5]:
## Using tfidf_Vectorizer to calculate the tfidf matrix
# tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                #  min_df=5, stop_words='english',
                                #  use_idf=True, tokenizer=tokenize_and_stem)

tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=5, stop_words="english")

tfidf_matrix = tfidf_vectorizer.fit_transform(df['title'])

In [8]:
### Function for finding the best epsilon
## The feature matrix should be in the form of row matrix, meaning that each sample is in one row of the matrix
## k is the number of nearest point that the function will consider


def best_eps_cooking(feature_matrix,k=3):
    dist_matrix = pairwise_distances(feature_matrix)   # Pairwise distance of the samples
    
    min_dist_arr = np.zeros(dist_matrix.shape[0] * k)

    ## Find the 3 nearest distance for each of the samples
    for i in range(dist_matrix.shape[0]):
        nearest_k = np.sort(dist_matrix[i,:])[1:(1+k)]
        min_dist_arr[i:(i+k)] = nearest_k


    min_dist_arr = np.sort(min_dist_arr)
    print(min_dist_arr)

    ## Finding the maximum slope of the distance, and return this value as the optimal epsilon
    eps = max([x - z for x, z in zip(min_dist_arr[:-1], min_dist_arr[1:])])

    return (eps, min_dist_arr)  # The dist_arr can be used to visualize the point,
    # return max_slope            # For compactness, use this return statement instead of the one before it


### Small test for the function
results = best_eps_cooking(tfidf_matrix,3)
eps = results[0]
dist_sorted = results[1]

# sns.lineplot(data=dist_sorted).set(title="Best epsilon: {:.3f}".format(eps))
eps


[0. 0. 0. ... 1. 1. 1.]


0.0

In [7]:
## Apply dbscan to the generated matrix
dbs = DBSCAN(eps=best_eps_cooking(tfidf_matrix)[0], min_samples=10, metric='cosine')
dbs.fit(tfidf_matrix)

[0. 0. 0. ... 1. 1. 1.]


ValueError: eps == 0.0, must be > 0.0.

In [None]:
### Create a dataframe that only contains the url, the category and the labels

df['label'] = dbs.labels_

clustered = df[['url', 'category', 'label']]
clustered.head()

clustered.groupby('label').size()

dist = pairwise_distances()

In [None]:
# # Define a pipeline combining a text feature extractor with a simple clusterer

# nCluster = 10

# ## Maybe run the k-means for some k times
# pipe = Pipeline(steps=
#     [
#         ("tfidfVec", TfidfVectorizer(max_df=0.5, min_df=5, stop_words="english")),
#         ("dbs", DBSCAN(eps=0.7, min_samples=4)),
#     ]
# )

# pipe.fit(data)
# # km = pipe['km']
# df['label'] = pipe['dbs'].labels_
# clustered = df[['title', 'label', 'category']].sort_values(['label', 'category'])
# clustered['category'] = clustered['category'].replace({'Culture':'C', 'Emerging Tech':'ET', 'Events':'E', 'Innovation & Design':'I&D', 'Learning':'L', 'News':'N', 'Opinion':'O', 'Product':'P', 'Strategy':'S', 'Technology':'T', 'Ways of Working': 'WW'})
# clustered.head()
# # silhouette_score(X=pipe.transform(clustered['title']), labels = clustered['label'])



# # clustered['category'].drop_duplicates()

# Plotting

The figure below is the histograms of the categories in each cluster label

In [None]:
# plt.rcParams['figure.figsize'] = (20, 12)
# figure, axis = plt.subplots(5, 2, sharex=True)

# for i in range(10):
#     axis[i//2, i%2].hist(clustered[clustered['label'] == i]['category'], ec='black')
#     axis[i//2, i%2].set_title('Label = {}'.format(i))

# # plt.hist(clustered[clustered['label'] == 0]['category'], ec='black')



# plt.show()

# categories = ['C', 'L', 'N', 'WW', 'ET', 'E', 'I&D', 'O', 'S', 'T', 'P']





# Correlation between cluster labels and original categories
The statistic of interest is Cramer's V. This statistic takes into account the number of times that a label and a category are observed together. Since the statistic is usually optimistic, I used a bias corrected version of the statistic from Wikipedia.

The following notations are used:

-   $n$ is the number of blogs
-   $n_{ij}$ is the number of times that a blog of categories $i$ is clustered into cluster $j$
-   $n_i$ is the number of blogs in category $i$
-   $n_j$ is the number of number of blogs in cluster $j$
-   $r$ is the number of categories
-   $k$ is the number of clusters
-   $\chi^2$ is the chi-squared statistic:
    $$\chi^2 = \sum_{i, j}{\frac{(n_{ij} - \frac{n_i \cdot n_j}{n})^2}{\frac{n_i \cdot n_j}{n}}}$$

In addition, let:

-   $\tilde{\varphi} = \min{(0, \frac{\chi^2}{n} - \frac{(k-1)(r-1)}{n-1})}$

-   $\tilde{r} = r - \frac{(r-1)^2}{n-1}$

-   $\tilde{k} = k - \frac{(k-1)^2}{n-1}$

Thus, the formula for this statistic is:

$$\tilde{V} = \sqrt{\frac{\tilde{\varphi}^2}{\min{(\tilde{k}-1, \tilde{n}-1)}}}$$


In [None]:
# def cramerV(frame, k, category_List):
#     freq_count = frame.set_index(["label", "category"]).sort_index()  
#     freq_count = freq_count.groupby(level=[0,1]).size().unstack().fillna(0).stack()
#     freq_label = freq_count.sum(level=0)      # Group by labels (i) -> Sum over categories (j)
#     freq_category = freq_count.sum(level=1)   # Group by categories (j) -> Sum over labels (i)
#     n = len(frame)
#     r = len(category_List)
#     chi_squared = 0.0

#     for label in range(nCluster):    # i
#         for category in category_List:  # j
#             n_i = freq_label[label]         # Sum over j
#             n_j = freq_category[category]   # Sum over i
#             n_ij = freq_count[label][category]
            
#             # Calculate the statistic to add
#             denom = (n_i * n_j)/n
#             statistic = ( (n_ij - denom)**2 ) / denom
#             chi_squared += statistic

#     corrected_coef = (k-1)*(r-1)/(n-1)
#     corrected_chi_squared = max(0, chi_squared/n - corrected_coef)

#     k_tilde = k - (k-1)**2/(n-1)
#     r_tilde = r - (r-1)**2/(n-1)
#     return np.sqrt(corrected_chi_squared / min(k_tilde-1, r_tilde - 1))

# cramerV(clustered, nCluster, categories)
