Loading the libraries

In [17]:
import json
import gzip
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import matplotlib.pyplot as plt
import itertools

Code for creating the ontology

In [56]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import itertools
import matplotlib.pyplot as plt

'''
Calculates the cosine simularity between words
'''
cosine_function = lambda a, b: np.inner(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def conditional_prob(x, y):
    '''
    Calculates the conditional probability of having x in the document given y and vice versa
    '''
    x_given_y = np.count_nonzero(pd.Series(x * y)) / np.count_nonzero(y)
    y_given_x = np.count_nonzero(pd.Series(x * y)) / np.count_nonzero(x)
    return x_given_y, y_given_x


def weighted_n(x, y):
    '''
    Calculates the weighted similarity between words
    '''
    return len(set(x) & set(y)) / ((len(x) + len(y) / 2))


def L(x, y, docs_words, cooccured):
    '''
    Calculates the L metric for discovering hierarchical links 
    '''
    P_x_y, P_y_x = conditional_prob(docs_words[x], docs_words[y])
    c_x_y = cosine_function(docs_words[x], docs_words[y])
    N_x_y = weighted_n(cooccured[x], cooccured[y])
    return (P_y_x - P_x_y) * c_x_y * (N_x_y + 1)


def find_ind_for_removal(known_edges, all_edges):
    '''
    Finds the index of the word for which we can potentially remove an edge from the graph
    '''
    for i in range(len(all_edges)):
        if all_edges[i][0] == known_edges[0] and all_edges[i][1] == known_edges[1]:
            return i


def s(x, y, docs_words, G1):
    '''
    Calculates the s metric for discovering related/equivalent words
    '''
    cos_sim = cosine_function(docs_words[x], docs_words[y])
    parents = nx.ancestors(G1, x) & nx.ancestors(G1, y)
    permutated_parents = list(itertools.combinations(parents, 2))

    av_sim = []
    sup_sim = 0
    for pair in permutated_parents:
        av_sim.append(cosine_function(docs_words[pair[0]], docs_words[pair[1]]))

    if av_sim != []:
        sup_sim = sum(av_sim) / len(av_sim)

    P_x_y, P_y_x = conditional_prob(docs_words[x], docs_words[y])
    abs_dif = np.abs(P_x_y - P_y_x)

    return cos_sim - 0.2 * sup_sim - 0.2 * abs_dif


def distance(x, y, docs_words, G1):
    '''
    Calculates the distance between words as 1 / s(x, y)
    If the distance is too small, returns a numbers obviously outside of range
    '''
    d = s(x, y, docs_words, G1)
    if d != 0:
        return 1 / d
    else:
        return 1000


def single_linkage_distance(clust_a, clust_b, docs_words, G1):
    '''
    Calculates the single linkage distance for 2 clusters
    '''
    minimum_dist = 100
    a = pd.concat([clust_a, clust_b])

    all_links = list(itertools.combinations(list(a), 2))

    for link in all_links:
        d = distance(link[0], link[1], docs_words, G1)

        if d < minimum_dist:
            minimum_dist = d

    return minimum_dist


def create_ontology(united_columns, min_df=0.01, top_ngram=1, min_cooccur=10):
    '''
    Function for creating the ontology
    '''
    #Initializing the value which checks if the ontology is ready
    check = -1

    while check != 0:
        
        #Initializing TF-IDF with inputed parameters and using it on the input data
        tfidf = TfidfVectorizer(ngram_range=(1, top_ngram), min_df=min_df)
        tf_idf_united = tfidf.fit_transform(united_columns)
        #Results of TF-IDF are saved into docs_words - dataframe which has words as columns and document indexes as rows
        docs_words = pd.DataFrame(tf_idf_united.toarray(), columns=tfidf.get_feature_names_out())
        
        cooccured = {}
        
        #For every word in docs_words we create a list of cooccured words 
        #which happen in min_occur or more documents where the word is
        for cur_word in enumerate(docs_words.columns):
            current_list = []
            for other_word in enumerate(docs_words.columns):
                if cur_word[0] != other_word[0] and np.count_nonzero(
                        pd.Series(docs_words.iloc[:, cur_word[0]] * docs_words.iloc[:, other_word[0]])) >= min_cooccur:
                    current_list.append(other_word[1])
            
            cooccured[cur_word[1]] = current_list

        work_nodes = set()
        work_edges = []
        
        #For every word in docs_words and their cooccured words
        #we calculates L metric
        #if it is above the critical value, then we infer a hierarchical link
        for word in docs_words.columns:
            for other_word in cooccured[word]:
                if L(word, other_word, docs_words, cooccured) > 0.2:
                    work_nodes.add(word)
                    work_nodes.add(other_word)
                    work_edges.append(
                        [other_word, word, {'weight': np.round(L(word, other_word, docs_words, cooccured), 4)}])

        just_edges = {}
        
        #Getting a list of all edges in the ontology
        for edge in work_edges:
            if edge[0] not in just_edges:
                just_edges[edge[0]] = [edge[1]]
            else:
                just_edges[edge[0]].append(edge[1])

        edge_for_removal = []
        
        #Finding all edges which make a triangle
        for cur_node in just_edges:
            for other_node in just_edges[cur_node]:
                if other_node in just_edges and set(just_edges[cur_node]) & set(just_edges[other_node]) != set():
                    edge_for_removal.append(
                        [cur_node, other_node, list(set(just_edges[cur_node]) & set(just_edges[other_node]))])
        
        #Removing the weakest edge (= with the least value of L metric)
        for edges in edge_for_removal:
            for member in edges[2]:
                ind1 = find_ind_for_removal([edges[0], member], work_edges)
                ind2 = find_ind_for_removal([edges[1], member], work_edges)

                if ind1 is not None and ind2 is not None:
                    if work_edges[ind1][2]['weight'] >= work_edges[ind2][2]['weight']:
                        work_edges.pop(ind2)
                    else:
                        work_edges.pop(ind1)
        
        #Saving the ontology into a graph structure
        G1 = nx.DiGraph()

        G1.add_nodes_from(work_nodes)
        G1.add_edges_from(work_edges)

        rel_equiv = set()

        visited = set()
        
        #Checking all words in the ontology for possible related/equivalent relationship
        #If s metric for two words is above the critical value
        #We assume that they are related/equivalent
        for word in list(nx.nodes(G1)):
            for other_word in list(nx.nodes(G1)):
                if word != other_word and other_word not in visited:
                    if s(word, other_word, docs_words, G1) >= 0.75:
                        rel_equiv.add(word)
                        rel_equiv.add(other_word)
            visited.add(word)
        
        #Manually conctructed hierarchical clustering algoritm using single linkage = 1/s metric
        clustering = pd.DataFrame(list(rel_equiv), columns=['word'])
        clustering['cluster'] = list(clustering.index)

        curr_clusters = clustering['cluster'].copy()
        visited_clusts = set()

        for clust in curr_clusters:
            visited_clusts.add(clust)
            curr_clust = clustering[clustering['cluster'] == clust]
            other_clust = clustering[clustering['cluster'] != clust]

            for other in other_clust['cluster']:
                if other not in visited_clusts:
                    another = other_clust[other_clust['cluster'] == other]

                    if single_linkage_distance(curr_clust['word'], another['word'], docs_words, G1) <= (1 / 0.75):
                        clustering.loc[clustering['cluster'] == other, ['cluster']] = clust
        
        #Printing the results of clustering to see what was found
        print(clustering)
        
        #Checking if ontology is ready
        #If yes, the algorithm stops and returns the final processed dataset
        check = len(clustering['cluster']) - len(clustering['cluster'].unique())
        
        #If not
        if check != 0:
            
            #For every cluster we create an aggravated word (separator between them is _)
            for cluster in clustering['cluster'].unique():
                curr = clustering[clustering['cluster'] == cluster]
                in_work = curr.word.str.replace(' ', '_')
                new_word_unique = set('_'.join(in_work).split('_'))
                new_word = '_'.join(list(new_word_unique))
                clustering.loc[clustering['cluster'] == cluster, ['replace']] = new_word

            k = united_columns.copy()

            new_united = []
            
            #Processing the original dataset to include the related/equivalent relationships
            for row in k:
                
                #For every cluster we take the list of original words and the aggrevated word
                #for this cluster
                for cluster in clustering['cluster'].unique():
                    words = clustering[clustering['cluster'] == cluster]
                    words = words['word']
                    replacement = clustering[clustering['cluster'] == cluster]
                    replacement = replacement['replace'].iloc[0]
                    
                    #Remove all original words from the document
                    count = 0
                    for w in words:
                        while row.find(w) != -1:
                            count += 1
                            row = row.replace(w, ' ')
                            
                    #If any words were removed, normalize the whitespaces
                    while row.find('  ') != -1:
                        row = row.replace('  ', ' ')
                        
                    #If any words from this cluster were removed, add the aggrevated word at the end of the document
                    if count != 0:
                        row = row + ' ' + replacement

                new_united.append(row)
            
            #Update the dataset and start Klink algorithm again
            united_columns = new_united
    
    #Return the ontology
    return united_columns


def visualise_ontology(united_columns, check_vis_list = False, min_df=0.01, top_ngram=1, min_cooccur=50):
    '''
    Function for visualising the ontology
    '''
    #Everything is the same as in create_ontology
    tfidf = TfidfVectorizer(ngram_range=(1, top_ngram), min_df=min_df)
    tf_idf_united = tfidf.fit_transform(united_columns)
    docs_words = pd.DataFrame(tf_idf_united.toarray(), columns=tfidf.get_feature_names_out())
    
    cooccured = {}

    for cur_word in enumerate(docs_words.columns):
        current_list = []
        for other_word in enumerate(docs_words.columns):
            if cur_word[0] != other_word[0] and np.count_nonzero(
                    pd.Series(docs_words.iloc[:, cur_word[0]] * docs_words.iloc[:, other_word[0]])) >= min_cooccur:
                current_list.append(other_word[1])
        cooccured[cur_word[1]] = current_list

    work_nodes = set()
    work_edges = []

    for word in docs_words.columns:
        for other_word in cooccured[word]:
            if L(word, other_word, docs_words, cooccured) > 0.2:
                work_nodes.add(word)
                work_nodes.add(other_word)
                work_edges.append(
                    [other_word, word, {'weight': np.round(L(word, other_word, docs_words, cooccured), 4)}])

    just_edges = {}

    for edge in work_edges:
        if edge[0] not in just_edges:
            just_edges[edge[0]] = [edge[1]]
        else:
            just_edges[edge[0]].append(edge[1])

    edge_for_removal = []

    for cur_node in just_edges:
        for other_node in just_edges[cur_node]:
            if other_node in just_edges and set(just_edges[cur_node]) & set(just_edges[other_node]) != set():
                edge_for_removal.append(
                    [cur_node, other_node, list(set(just_edges[cur_node]) & set(just_edges[other_node]))])

    for edges in edge_for_removal:
        for member in edges[2]:
            ind1 = find_ind_for_removal([edges[0], member], work_edges)
            ind2 = find_ind_for_removal([edges[1], member], work_edges)

            if ind1 is not None and ind2 is not None:
                if work_edges[ind1][2]['weight'] >= work_edges[ind2][2]['weight']:
                    work_edges.pop(ind2)
                else:
                    work_edges.pop(ind1)
    
    #Inputing the ontology into a graph structure
    G1 = nx.DiGraph()
    
    if check_vis_list:
        final_list = {}

        for work_edge in work_edges:
            if work_edge[0] not in final_list:
                final_list[work_edge[0]] = []
        
            final_list[work_edge[0]].append([work_edge[1], work_edge[2]])

        return final_list
    
    G1.add_nodes_from(work_nodes)
    G1.add_edges_from(work_edges)
    
    #Setting up the parameters of visualisation
    plt.rcParams["figure.figsize"] = (20, 20)

    pos = nx.spring_layout(G1, seed=47)
    node_sizes = [i for i in range(len(G1))]

    nodes = nx.draw_networkx_nodes(G1, pos, node_size=node_sizes, node_color="indigo", alpha=0.75)

    M = G1.number_of_edges()
    edge_colors = range(2, M + 2)
    edge_alphas = [(5 + i) / (M + 4) for i in range(M)]
    cmap = plt.cm.plasma

    edges = nx.draw_networkx_edges(
            G1,
            pos,
            node_size=node_sizes,
            arrowstyle="->",
            arrowsize=10,
            edge_color=edge_colors,
            edge_cmap=cmap,
            width=1,
        )

    for i in range(M):
        edges[i].set_alpha(edge_alphas[i])
    
    #Show the visualisation
    nx.draw_networkx_labels(G1, pos, font_size=14, verticalalignment='top')
    plt.show()

Function to save it into csv (to be later uploaded into Neo4j)

In [57]:
def graph_to_csv(vocab, node_file, edge_file):
    
    all_edges = []
    all_nodes = set()
    
    for word in vocab.keys():
        for i in vocab[word]:
            all_edges.append([i[0], word, i[1]['weight']])
            all_nodes.add(i[0])
        all_nodes.add(word)
    
    nodes = pd.DataFrame(all_nodes, columns = ['Word'])
    edges = pd.DataFrame(all_edges, columns = ['Word', 'Subtopic_of', 'Weight'])
    
    nodes.to_csv(node_file, index = False)
    edges.to_csv(edge_file, index = False)

Loading libraries

In [37]:
import pandas as pd
import numpy as np
import re
import html
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anutk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anutk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Preprocessing for the dataset

In [49]:
def check_for_remove_short(x):
    '''
    Removes tokens which have the length of 1
    '''
    line = ''
    for i in str(x).split():
        if len(i) > 1:
            line += i + ' '

    return line.strip()
    
def check_for_remove_stopwords(x):
    '''
    Removes stopwords from the line
    '''
    stop_words = set(stopwords.words('english'))
    line = ''

    for i in str(x).split():
        if i not in stop_words:
            line += i + ' '

    return line.strip()

class TextPreprocessor:
    '''
    Class for general text preprocessing of a dataset
    Removes
        - punctuation
        - normalizes whitespaces after removal of punctuation
        - tokens which have the length of 1
        - stopwords
        - numbers
    Normalizes 
        - whitespaces after removal of punctuation
    Lemmatizes all words in the dataset
    Shortens the sentences if they are above word limit
    '''

    def __init__(self):
        '''
        Initilizes an object of the class
        '''
        self.data = None

    def fit(self, data: pd.DataFrame) -> None:
        '''
        Fits the input data into an object of the class
        '''
        self.data = data

    def fit_transform(self, data: pd.DataFrame, columns) -> pd.DataFrame:
        '''
        Fits and transforms the input data into an object of the class
        '''
        self.fit(data)
        return self.transform(columns)

    def transform(self, columns) -> pd.DataFrame:
        '''
        Tranforms the input data
        Removes
            - punctuation
            - normalizes whitespaces after removal of punctuation
            - tokens which have the length of 1
            - stopwords
            - numbers
        Normalizes 
            - whitespaces after removal of punctuation
        Lemmatizes all words in the dataset
        Shortens the sentences if they are above word limit
        '''
        self.__remove_punc(columns)
        self.__normalize_whitespace(columns)
        self.__remove_short(columns)
        self.__lemmatize(columns)
        self.__remove_stopwords(columns)
        self.__remove_numbers(columns)
        return self.data

    def __remove_punc(self, columns):
        '''
        Removes punctuation from the data
        '''
        pattern = re.compile(r'[^\w\s]+')
        for column in columns:
            self.data[column] = self.data[column].apply(lambda x: re.sub(pattern, ' ', str(x)))

    def __normalize_whitespace(self, columns):
        '''
        Normalizes whitespaces
        '''
        for column in columns:
            self.data[column] = self.data[column].str.replace('  ', ' ')

    def __remove_short(self, columns):
        '''
        Removes short words which have the length of 1
        '''
        for column in columns:
            self.data[column] = self.data[column].apply(check_for_remove_short)
            
    def __lemmatize(self, columns):
        '''
        Lemmatizes the words in the dataset
        '''
        morph = WordNetLemmatizer()
        
        for column in columns:
            self.data[column] = self.data[column].apply(lambda x: ' '.join([morph.lemmatize(i.lower()) for i in x.split()]))

    def __remove_stopwords(self, columns):
        '''
        Removes stopwords
        '''
        for column in columns:
            self.data[column] = self.data[column].apply(check_for_remove_stopwords)
        
    def __shorten(self, columns):
        '''
        Shortens the lines which are above the word limit
        '''
        for column in columns:
            self.data[column] = self.data[column].apply(lambda x: check_len(x, 256))
    
    def __remove_numbers(self, columns):
        '''
        Removes numbers
        '''
        for column in columns:
            self.data[column] = self.data[column].apply(lambda x: re.sub('(\d)', '', x))

Preprocessing and saving the dataset

In [53]:
#prep_df = pd.read_csv('bbc_data.csv')
#TP = TextPreprocessor()
#prep_df = TP.fit_transform(prep_df, ['data'])
#prep_df.to_csv('bbc_processed_data.csv', index = False)

In [59]:
df = pd.read_csv('bbc_processed_data.csv')
df['data']

0       musician tackle u red tape musician group tack...
1       us desire number one u three prestigious gramm...
2       rocker doherty stage fight rock singer pete do...
3       snicket top u box office chart film adaptation...
4       ocean twelve raid box office ocean twelve crim...
                              ...                        
2221    fast lift rise record book two high speed lift...
2222    nintendo add medium playing nintendo releasing...
2223    fast moving phone virus appear security firm w...
2224    hacker threat apple itunes user apple music ju...
Name: data, Length: 2225, dtype: object

Creating the ontology via Klink

In [60]:
created = create_ontology(df['data'], 0.015, 1)
written = visualise_ontology(created, True, 0.015, 1)
    
node_name = 'bbc' + '_nodes.csv'
edge_name = 'bbc' + '_edges.csv'

graph_to_csv(written, node_name, edge_name)

       word  cluster
0  democrat        0
1       lib        1
2      dems        1
3   liberal        0
        word  cluster
0       eral        0
1  democrat_        0
Empty DataFrame
Columns: [word, cluster]
Index: []
