In [None]:
# importing libraries

import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import spacy
import networkx as nx
from pyvis.network import Network

In [None]:
# Load the dataset from CSV, using the file's first column as the row index.
# NOTE(review): "<address>" is a placeholder path — point it at the real file before running.
data = pd.read_csv(r"<address>", index_col=0)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

# Creating a corpus of keywords

In [None]:
# Flat list that will hold every keyword/phrase collected from the selected rows.
keywords = [] # store key words/phrases

In [None]:
# keep only the rows whose Result column equals 1
selected = data.loc[data["Result"] == 1]

In [None]:
# Inspect the index labels of the selected rows.
selected.index

In [None]:
# Build the keyword corpus from the `about_keywords` column of the selected rows.
# The list is rebuilt from scratch (not appended to) so re-running this cell
# cannot duplicate the corpus.
# Rows without an About hold a non-string value (e.g. NaN) and are skipped.
# Each string cell is parsed with ast.literal_eval into a list of entries and
# only element [0] of every entry is kept.
# NOTE(review): entries are presumably (keyword, score) pairs — confirm against
# the code that produced the column.
keywords = [
    entry[0]
    for raw in selected.about_keywords
    if isinstance(raw, str)
    for entry in ast.literal_eval(raw)
]

In [None]:
# Total number of keywords collected.
len(keywords)

# K-Means clustering

In [None]:
# vectorize the keywords (TF-IDF) to pass them through a K-Means
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(keywords)

# Vocabulary terms, ordered like the columns of X.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

In [None]:
# Number of entries in `words`.
len(words)

In [None]:
# Dimensions of the matrix X.
X.shape

In [None]:
# Elbow method: fit k-means for k = 1..10 and record each model's inertia (WCSS).
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(X)
    wcss.append(kmeans.inertia_)

# plot WCSS against the number of clusters and save the figure to disk
plt.plot(cluster_counts, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('The Elbow Method')
plt.savefig('elbow.png')
plt.show()

In [None]:
# Fit the final k-means model with 5 clusters.
# n_init: how many times k-means is run with different centroid seeds; the run
#         with the lowest inertia is kept.
# random_state: fixed so the cluster assignments are reproducible across runs.
# n_jobs is intentionally not passed: it was deprecated in scikit-learn 0.23 and
# removed in 1.0, where supplying it raises a TypeError.
kmeans = KMeans(n_clusters=5, n_init=20, random_state=0)
kmeans.fit(X)

In [None]:
# For every k-means centroid, list the 4 terms with the largest centroid weight
# (indices sorted ascending, then the last four taken in descending order).
common_words = kmeans.cluster_centers_.argsort()[:, -1:-5:-1]
for cluster_id, term_indices in enumerate(common_words):
    top_terms = ', '.join(words[idx] for idx in term_indices)
    print(f"{cluster_id} : {top_terms}")

In [None]:
# Keep the fitted model's label array (kmeans.labels_) in its own variable.
clusters = kmeans.labels_

In [None]:
# How many items were assigned to each cluster label.
pd.Series(clusters).value_counts()  # frequency of clusters

# Calculating cosine similarity of the keywords

In [None]:
# Load the spaCy pipeline 'en_core_web_lg'.
# NOTE(review): the model must be installed separately in the environment — confirm before running.
nlp = spacy.load('en_core_web_lg') # load spacy model

In [None]:
# Accumulator for similarity records: one dict per scored keyword pair.
network = []   # list of dicts to store the similarity scores

In [None]:
# removing common words because they lead to a high similarity score
def remove_common_words(k1, k2):
    """Drop the words that two phrases share from both of them.

    Phrases are split on whitespace and compared word by word, so only whole
    words are removed. (Plain substring replacement would also eat matching
    fragments inside longer words, e.g. a shared "in" would turn "mining"
    into "mg".)

    Parameters
    ----------
    k1, k2 : str
        The two phrases to compare.

    Returns
    -------
    tuple of (str, str)
        `k1` and `k2` with the shared words removed. The remaining words are
        joined by single spaces, so a phrase made up only of shared words
        comes back as an empty string.
    """
    shared = set(k1.split()) & set(k2.split())

    def _without_shared(phrase):
        # keep original word order; drop only exact whole-word matches
        return ' '.join(tok for tok in phrase.split() if tok not in shared)

    return _without_shared(k1), _without_shared(k2)

In [None]:
# Score every keyword against the keyword that follows it in the corpus.
# The result list is rebuilt from scratch so re-running this cell does not
# append duplicate records.
network = []
for first, second in zip(keywords, keywords[1:]):
    kw1, kw2 = remove_common_words(first.lower(), second.lower())

    # Collapse any whitespace left behind by the removal: otherwise a string of
    # blanks passes the length check below and padded text ends up as a node label.
    kw1 = ' '.join(kw1.split())
    kw2 = ' '.join(kw2.split())

    # skip the pair when either side is empty or a single character
    if len(kw1) > 1 and len(kw2) > 1:
        t1 = nlp(kw1)
        t2 = nlp(kw2)
        network.append({"word1": str(t1), "word2": str(t2), "similarity": t1.similarity(t2)})

In [None]:
# Peek at the first few records instead of dumping the whole list into the output.
network[:5]

In [None]:
# Convert the list of similarity dicts into a DataFrame (one row per dict, one column per key).
network_data = pd.DataFrame(network)

In [None]:
# Summary of the frame: columns, non-null counts and dtypes.
network_data.info()

In [None]:
# Discard the rows whose similarity score is below zero.
# NOTE(review): the scores were originally described as "garbage" when negative,
# but cosine similarity ranges over [-1, 1] — confirm that dropping them is intended.
network_data = network_data[~(network_data["similarity"] < 0)]

# Creating a Network with Networkx

In [None]:
# Build a networkx graph from the edge list: one edge per row, running from
# `word1` to `word2`, carrying the row's `similarity` as an edge attribute.
G = nx.from_pandas_edgelist(
    network_data,
    source='word1',
    target='word2',
    edge_attr='similarity',
)

# The nodes are keywords, so label the count accordingly.
node_count = len(G.nodes)
edge_count = len(G.edges)
print("No of unique keywords:", node_count)
print("No of connections:", edge_count)

# Visualization of the network with pyvis

In [None]:
net = Network(height='750px', width='100%', bgcolor='black', font_color='white')

# One node per keyword and one edge per scored pair; the similarity score is
# passed as both the edge's `value` and its `weight`.
edge_rows = zip(
    network_data['word1'],
    network_data['word2'],
    network_data['similarity'],
)
for src, dst, w in edge_rows:
    net.add_node(src, src, title=src)
    net.add_node(dst, dst, title=dst)
    net.add_edge(src, dst, value=w, weight=w)

neighbor_map = net.get_adj_list()

# set every node's `value` and `size` to the number of neighbours it has
for node in net.nodes:
    degree = len(neighbor_map[node['id']])
    node['value'] = degree
    node['size'] = degree

# write the visualization to an HTML file
net.show('network_ccd.html')