In [None]:
pip install Unidecode

Collecting Unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 19.5 MB/s eta 0:00:01[K     |██▉                             | 20 kB 9.7 MB/s eta 0:00:01[K     |████▏                           | 30 kB 7.7 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 3.5 MB/s eta 0:00:01[K     |███████                         | 51 kB 3.5 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 4.2 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 4.4 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 4.4 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 4.8 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 4.1 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 4.1 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 4.1 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 4.1 MB/s eta 0:00:01

In [None]:
import pandas as pd
import glob
import os
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import Birch
from sklearn.decomposition import PCA
from scipy.sparse import hstack
import nltk
nltk.download("stopwords")
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import unidecode

def tokenize_and_stem(text, porter_stemmer, stop_words):
    """Tokenize ``text`` and return the stem of every token that contains a letter.

    Args:
        text: Raw text to process.
        porter_stemmer: Object exposing ``stem(token)``; called once per kept token.
        stop_words: Accepted for interface compatibility; this function does not
            use it (the caller filters stop words afterwards).

    Returns:
        list: Stems in the order their tokens appear in ``text``.
    """
    contains_letter = re.compile('[a-zA-Z]')

    # Sentence-split first, then word-split, so punctuation becomes its own token.
    kept_tokens = []
    for sentence in nltk.sent_tokenize(text):
        for candidate in nltk.word_tokenize(sentence):
            # Purely numeric tokens and raw punctuation carry no letters and are dropped.
            if contains_letter.search(candidate):
                kept_tokens.append(candidate)

    return [porter_stemmer.stem(candidate) for candidate in kept_tokens]


def one_hot_encode(df):
    """Binarize an iterable of label collections into a sparse indicator matrix.

    Args:
        df: Iterable whose items are themselves iterables of labels.

    Returns:
        The result of ``MultiLabelBinarizer(sparse_output=True).fit_transform(df)``.
    """
    return MultiLabelBinarizer(sparse_output=True).fit_transform(df)


def remove_stop_words(df, stop_words):
    """Replace every entry of ``df['heading']`` with its stemmed, stop-word-free form.

    Each heading is tokenized and stemmed with ``tokenize_and_stem``; stems whose
    lower-case form is in ``stop_words`` are dropped and the rest are joined with
    single spaces. Missing headings (float NaN) become the placeholder ``'#'``.

    Args:
        df: DataFrame with a ``heading`` column. The column is overwritten in place.
        stop_words: Container of lower-case words to discard.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # One stemmer serves every row; constructing it per row was loop-invariant work.
    porter_stemmer = PorterStemmer()

    cleaned_headings = []
    for heading in df['heading']:
        if isinstance(heading, float):
            # NaN marks a missing heading; store a scalar placeholder explicitly
            # instead of relying on how pandas broadcasts a one-element list.
            cleaned_headings.append('#')
            continue

        stems = tokenize_and_stem(heading, porter_stemmer, stop_words)
        cleaned_headings.append(
            ' '.join(stem for stem in stems if stem.lower() not in stop_words))

    # Single column assignment instead of one .loc write per row while iterating.
    df['heading'] = cleaned_headings
    return df

def _rows_as_lists(frame):
    """Return the rows of ``frame`` as plain lists, in order, with NaN replaced by ''."""
    filled = frame.fillna('')
    filled.index = range(len(filled))
    return [list(record.values()) for record in filled.to_dict('index').values()]


def _adm1_codes(location_entries):
    """Reduce each '#'-separated location entry to its 4th field (the ADM1 code).

    Entries with fewer than four fields are kept unchanged (best effort).
    """
    if not isinstance(location_entries, list):
        return []
    codes = []
    for entry in location_entries:
        fields = entry.split('#')
        codes.append(fields[3] if len(fields) > 3 else entry)
    return codes


def _normalise_counts(count_entries):
    """Turn one record's split ``counts`` field into the label set used for clustering."""
    if not isinstance(count_entries, list):
        return ['#']

    labels = []
    for entry in count_entries:
        fields = entry.split('#')
        if len(fields) > 5:
            # retain only COUNT_TYPE, QUANTITY and LOCATION ADM1 code
            entry = fields[0] + '#' + fields[1] + '#' + fields[5]
        labels.append(entry)

    # Removing CRISISLEX entries due to elevated false positive rate
    labels = [label for label in labels if not label.startswith('CRISISLEX')]

    if not labels or labels == ['']:
        # News with no usable counts share the '#' label so they cluster together.
        # An empty list (everything was CRISISLEX) used to raise IndexError below.
        return ['#']
    if labels[-1] == '':
        # a trailing ';' leaves an empty last element
        labels.pop()
    return labels


def _group_by_label(labels, rows):
    """Group ``rows`` by the cluster label at the same position, preserving order."""
    groups = {}
    for label, row in zip(labels, rows):
        groups.setdefault(label, []).append(row)
    return groups


def remove_redundancy(input_dir, output_dir):
    """Write a de-duplicated copy of every ``*.csv`` in ``input_dir`` to ``output_dir``.

    Each input file is read without a header and its ten columns are named
    record_id, date, url, counts, themes, locations, persons, organizations,
    tone, heading. Per file:

    1. Rows without locations or heading, and rows whose heading is shorter
       than 20 characters, are dropped.
    2. The heading is stemmed and stripped of stop words; the untouched text is
       kept in an extra ``heading_original`` column.
    3. Rows are clustered with Birch on one-hot encoded heading and location
       ADM1 codes, then each cluster is clustered again on its count labels.
    4. The first row of every sub-cluster is kept, the survivors are sorted by
       record id and written to ``<output_dir>/<input file name>``.

    Files that cannot be processed at one of the best-effort steps are skipped
    with a printed message.

    Args:
        input_dir: Directory holding the input CSV files.
        output_dir: Directory receiving the reduced CSV files; created if missing.
    """
    stop_words = set(stopwords.words('english'))

    # parameters, determined through experiments
    birch_thresh = 2.4
    count_thresh = 0.1
    perform_pca = False

    column_names = ['record_id', 'date', 'url', 'counts', 'themes', 'locations',
                    'persons', 'organizations', 'tone', 'heading']

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for csv_path in glob.glob(os.path.join(input_dir, '*.csv')):
        # splitting on the first '.' / on '/' broke for dotted or OS-specific paths
        file_prefix = os.path.splitext(os.path.basename(csv_path))[0]

        df = pd.read_csv(csv_path, header=None, encoding='latin-1')
        df.columns = column_names

        # Retaining only those news which have non-null locations and heading
        df = df[pd.notnull(df['locations'])]
        df = df[pd.notnull(df['heading'])]

        # removing news with wrong scraped title e.g. bloomberg instead of article title
        try:
            df = df.loc[df['heading'].str.len() >= 20].copy()
        except Exception as err:
            print('Skipping {}: cannot filter headings ({})'.format(csv_path, err))
            continue

        # retaining original heading for analysis afterwards
        df['heading_original'] = df['heading']

        # stop-word removal and stemming
        df = remove_stop_words(df, stop_words)

        # row position -> row values; used to rebuild clusters from cluster labels
        rows = _rows_as_lists(df)

        try:
            split_locations = df['locations'].str.split(';')  # splitting locations
        except Exception as err:
            print('Skipping {}: cannot split locations ({})'.format(csv_path, err))
            continue
        location_codes = [_adm1_codes(entries) for entries in split_locations]

        # NOTE(review): headings are strings here, so the binarizer iterates them
        # character by character rather than word by word. Left unchanged because
        # birch_thresh was tuned against this encoding - confirm intent.
        sparse_heading = one_hot_encode(df['heading'])
        sparse_locations = one_hot_encode(location_codes)
        features = hstack([sparse_heading, sparse_locations])

        # Reducing dimensions through principal component analysis
        if perform_pca:
            pca = PCA(n_components=None)
            features = pd.DataFrame(pca.fit_transform(features))

        brc = Birch(branching_factor=50, n_clusters=None,
                    threshold=birch_thresh, compute_labels=True)
        try:
            predicted_labels = brc.fit_predict(features)
        except Exception as err:
            print('Skipping {}: clustering failed ({})'.format(csv_path, err))
            continue

        clusters = _group_by_label(predicted_labels, rows)

        # clustering within each cluster, on counts; keep one row per sub-cluster
        representatives = []
        for cluster_rows in clusters.values():
            cluster_df = pd.DataFrame(cluster_rows)
            member_rows = _rows_as_lists(cluster_df)

            # column 3 is 'counts'
            split_counts = cluster_df.iloc[:, 3].str.split(';')  # splitting counts
            count_labels = [_normalise_counts(entries) for entries in split_counts]

            mlb4 = MultiLabelBinarizer()
            count_features = pd.DataFrame(mlb4.fit_transform(count_labels),
                                          columns=mlb4.classes_, index=cluster_df.index)

            brc2 = Birch(branching_factor=50, n_clusters=None,
                         threshold=count_thresh, compute_labels=True)
            predicted_labels2 = brc2.fit_predict(count_features)

            sub_clusters = _group_by_label(predicted_labels2, member_rows)
            representatives.extend(group[0] for group in sub_clusters.values())

        reduced = pd.DataFrame(representatives)
        reduced.sort_values(by=[0], inplace=True)

        reduced.to_csv(os.path.join(output_dir, file_prefix + '.csv'),
                       sep=',', index=False, header=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
input_dir = "drive/MyDrive/Final Year Project/ground_truth_chains/"
output_dir = "drive/MyDrive/Final Year Project/redundancy_removed_chains/"

print("Removing redundancy")

remove_redundancy(input_dir, output_dir)

# The de-duplicated chains are the input of the per-day split.
input_dir = output_dir
output_dir = "drive/MyDrive/Final Year Project/per_day_data/"

print("Preparing per day files")

path = input_dir

file_name = '*.csv'
all_files = glob.glob(os.path.join(path, file_name))

# day (first 8 characters of the record id) -> rows of that day
per_day_data = {}

for f in all_files:
    df = pd.read_csv(f, header=None, encoding='latin-1')

    for row in df.values.tolist():
        record_id = row[0]
        # Rows whose id is not text (e.g. NaN) cannot be sliced into a day key;
        # skip them explicitly instead of hiding every error behind a bare except.
        if not isinstance(record_id, str):
            continue
        per_day_data.setdefault(record_id[0:8], []).append(row)

os.makedirs(output_dir, exist_ok=True)

for key in per_day_data:
    df = pd.DataFrame(per_day_data[key])
    df.sort_values(by=[0], inplace=True)
    df.to_csv(os.path.join(output_dir, key + '.csv'),
              sep=',', index=False, header=False)

# sorted() already returns a sorted list; the extra .sort() was redundant.
days = sorted(per_day_data.keys())

# One day per line; read back by run_algorithm.
with open('days.txt', 'w') as f:
    for item in days:
        f.write("%s\n" % item)

Removing redundancy
Preparing per day files


In [None]:
pip install scipy



In [None]:
pip install sklearn



In [None]:
pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 4.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (PEP 517) ... [?25l[?25hdone
  Created wheel for hdbscan: filename=hdbscan-0.8.28-cp37-cp37m-linux_x86_64.whl size=2330808 sha256=680c009c5ac12c4fc6cbae4cf829847de04424d9f495779f2e5319debf493cc7
  Stored in directory: /root/.cache/pip/wheels/6e/7a/5e/259ccc841c085fc41b99ef4a71e896b62f5161f2bc8a14c97a
Successfully built hdbscan
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.28


In [None]:
from scipy import sparse
from operator import itemgetter

import nltk
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.cluster import Birch
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import jaccard_score

def tokenize(text):
    """Split ``text`` into word tokens.

    The text is split into sentences first and each sentence into words, so
    punctuation ends up as its own token.

    Args:
        text: Text to tokenize.

    Returns:
        list: Tokens in reading order.
    """
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    return words

def _heading_tokens(heading):
    """Lower-case and tokenize a heading; a missing (float NaN) heading becomes ['#']."""
    if isinstance(heading, float):
        return ['#']
    return tokenize(heading.lower())


def _first_adm1(location_field):
    """Label list for window clustering: the ADM1 code of the FIRST location only.

    If the first entry has fewer than four '#'-separated fields, all raw entries
    are returned unchanged (best effort).
    """
    if not isinstance(location_field, str):
        return []
    entries = location_field.split(';')
    fields = entries[0].split('#')
    return [fields[3]] if len(fields) > 3 else entries


def _all_adm1(location_field):
    """Label list for chain matching: the ADM1 code of EVERY location entry.

    Entries with fewer than four '#'-separated fields are kept unchanged.
    """
    if not isinstance(location_field, str):
        return []
    codes = []
    for entry in location_field.split(';'):
        fields = entry.split('#')
        codes.append(fields[3] if len(fields) > 3 else entry)
    return codes


def _jaccard(first, second):
    """Jaccard similarity of two label collections (0.0 when both are empty)."""
    first_set, second_set = set(first), set(second)
    union = first_set | second_set
    return len(first_set & second_set) / len(union) if union else 0.0


def run_algorithm(input_dir, output_dir, birch_thresh, window_size):
    """Build event chains from per-day CSV files and write one CSV per chain.

    Days are read from ``days.txt`` (one per line, current working directory)
    and processed in consecutive windows of ``window_size`` days. The records
    of a window are clustered with Birch on one-hot encoded heading tokens
    (column 9) and first-location ADM1 code (column 5). Each cluster is sorted
    by column 1 and either starts a new chain file ``f<N>.csv`` or, when its
    earliest record is similar enough to the latest record of the most
    recently written chain, is appended to that chain's file.

    Args:
        input_dir: Directory holding ``<day>.csv`` files (no header).
        output_dir: Directory receiving the chain files; created if missing.
        birch_thresh: ``threshold`` passed to Birch.
        window_size: Number of days clustered together.
    """
    file_index = {}  # record id of a chain's latest record -> that chain's file
    next_file_id = 0

    with open('days.txt') as days_file:
        # blank lines would otherwise turn into a bogus '<input_dir>.csv' path
        days = [line.strip() for line in days_file if line.strip()]

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Latest record of the chain written most recently. Only this single row is
    # compared against new clusters.
    # NOTE(review): earlier chains are never revisited - confirm this is intended.
    latest_record = pd.DataFrame()

    for start in range(0, len(days), window_size):
        window_days = days[start: start + window_size]

        window = pd.concat(
            [pd.read_csv(os.path.join(input_dir, day + '.csv'),
                         header=None, encoding="latin-1")
             for day in window_days],
            ignore_index=True)

        # one hot approach: build the label lists directly instead of writing
        # lists into DataFrame cells (fragile with .loc)
        heading_labels = [_heading_tokens(value) for value in window[9]]
        location_labels = [_first_adm1(value) for value in window[5]]

        # row position -> row values, NaN replaced by ''
        rows = [list(record.values())
                for record in window.fillna('').to_dict('index').values()]

        heading_matrix = MultiLabelBinarizer(
            sparse_output=False).fit_transform(heading_labels)
        location_matrix = MultiLabelBinarizer(
            sparse_output=False).fit_transform(location_labels)
        features = pd.DataFrame(np.hstack([heading_matrix, location_matrix]))

        brc = Birch(branching_factor=50, n_clusters=None,
                    threshold=birch_thresh, compute_labels=True)
        predicted_labels = brc.fit_predict(features)

        clusters = {}
        for label, row in zip(predicted_labels, rows):
            clusters.setdefault(label, []).append(row)

        for members in clusters.values():
            members.sort(key=itemgetter(1))
            chain_path = os.path.join(
                output_dir, "f" + str(next_file_id) + ".csv")
            next_file_id += 1
            chain = pd.DataFrame(members)

            earliest = chain.iloc[0]  # earliest representative of the cluster

            for _, previous in latest_record.iterrows():
                # Set-based Jaccard equals jaccard_score on the two binarized
                # rows, without DataFrame.append (removed in pandas 2.0).
                heading_similarity = _jaccard(
                    _heading_tokens(earliest[9]), _heading_tokens(previous[9]))
                loc_similarity = _jaccard(
                    _all_adm1(earliest[5]), _all_adm1(previous[5]))

                if heading_similarity > 0.1 and loc_similarity > 0.1:
                    # continue the previous chain: reuse its file and prepend its rows
                    chain_path = file_index[previous[0]]
                    earlier_rows = pd.read_csv(
                        chain_path, header=None, encoding="latin-1")
                    chain = pd.concat([earlier_rows, chain], ignore_index=True)
                    break

            # latest representative, taken before the id sort below
            latest_record = chain.tail(1)
            file_index[latest_record[0].iloc[0]] = chain_path

            chain = chain.drop_duplicates(
                subset=0, keep="first").sort_values(by=[0])
            chain.to_csv(chain_path, sep=',', index=False, header=False)

In [None]:
import pandas as pd
from sklearn import metrics
import numpy as np
from scipy.special import comb
import glob
import os


'''
This script is for evaluation of event chain algorithm
'''


def myComb(a, b):
    """Return the exact (integer) binomial coefficient "a choose b"."""
    exact_value = comb(a, b, exact=True)
    return exact_value


# Element-wise version of myComb for use on numpy arrays.
vComb = np.vectorize(myComb)


def get_tp_fp_tn_fn(cooccurrence_matrix):
    """Pair-counting confusion values derived from a co-occurrence matrix.

    Pairs are counted with "n choose 2": over column totals (tp + fp), over
    row totals (tp + fn), over the individual cells (tp) and over the grand
    total (all pairs).

    Args:
        cooccurrence_matrix: 2-D numpy array of counts.

    Returns:
        list: ``[tp, fp, tn, fn]``.
    """
    column_totals = cooccurrence_matrix.sum(0, dtype=int)
    row_totals = cooccurrence_matrix.sum(1, dtype=int)

    tp = vComb(cooccurrence_matrix.astype(int), 2).sum()
    fp = vComb(column_totals, 2).sum() - tp
    fn = vComb(row_totals, 2).sum() - tp

    all_pairs = comb(cooccurrence_matrix.sum(), 2)
    tn = all_pairs - tp - fp - fn

    return [tp, fp, tn, fn]


def precision_recall_fmeasure(cooccurrence_matrix):
    """Pair-counting Rand index, precision, recall and F1 for a clustering.

    The tp/fp/tn/fn values come from ``get_tp_fp_tn_fn``. Any ratio whose
    denominator is zero (for example no pair was ever placed in the same
    cluster) is reported as 0.0 instead of producing NaN or raising
    ZeroDivisionError.

    Args:
        cooccurrence_matrix: Matrix accepted by ``get_tp_fp_tn_fn``.

    Returns:
        tuple: ``(rand_index, precision, recall, f1)`` as floats.
    """
    tp, fp, tn, fn = get_tp_fp_tn_fn(cooccurrence_matrix)

    def _ratio(numerator, denominator):
        # an undefined ratio is scored as 0.0
        return float(numerator) / float(denominator) if denominator else 0.0

    rand_index = _ratio(tp + tn, tp + fp + fn + tn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2.0 * precision * recall, precision + recall)

    return rand_index, precision, recall, f1


def evaluate_algorithm(input_dir, output_dir):
    """Score the formed chains in ``output_dir`` against the chains in ``input_dir``.

    Every ``*.csv`` file in a directory is one cluster; column 0 of each row is
    the record id. Ids are compared after stripping surrounding whitespace.

    * Ground-truth rows whose id is not a string are skipped.
    * An id that occurs several times in the ground truth keeps a single
      position; the label of the last file containing it wins.
    * Formed-cluster ids that are unknown to the ground truth are ignored.
    * Ground-truth records that appear in no formed cluster keep label 0.

    Args:
        input_dir: Directory with the ground-truth cluster files.
        output_dir: Directory with the cluster files produced by the algorithm.

    Returns:
        list: ``[precision, recall, f1, ari, nmi]``.
    """
    truth_files = glob.glob(os.path.join(input_dir, '*.csv'))

    gkg_id_to_index = {}  # record id -> position in the label vectors
    class_labels = []

    for label, truth_file in enumerate(truth_files, start=1):
        df = pd.read_csv(truth_file, header=None, encoding='latin-1')

        for row in df.values.tolist():
            try:
                gkg_id = row[0].strip()
            except AttributeError:
                continue

            if gkg_id in gkg_id_to_index:
                # Duplicate id: reuse its position. Handing out a fresh index
                # here made positions run past the end of the label vector.
                class_labels[gkg_id_to_index[gkg_id]] = label
            else:
                gkg_id_to_index[gkg_id] = len(class_labels)
                class_labels.append(label)

    formed_files = glob.glob(os.path.join(output_dir, '*.csv'))

    # Sized by the ground truth so both label vectors always have equal length.
    cluster_labels = [0] * len(class_labels)

    for label, formed_file in enumerate(formed_files, start=1):
        df = pd.read_csv(formed_file, header=None, encoding='latin-1')

        for row in df.values.tolist():
            gkg_id = row[0]
            if isinstance(gkg_id, str):
                gkg_id = gkg_id.strip()  # same normalisation as the ground truth

            position = gkg_id_to_index.get(gkg_id)
            if position is None:
                # id not part of the ground truth; nothing to compare against
                continue
            cluster_labels[position] = label

    matrix = metrics.cluster.contingency_matrix(class_labels, cluster_labels)
    rand_index, precision, recall, f1 = precision_recall_fmeasure(matrix)

    ari = metrics.cluster.adjusted_rand_score(class_labels, cluster_labels)
    nmi = metrics.normalized_mutual_info_score(class_labels, cluster_labels)

    result = [precision, recall, f1, ari, nmi]
    return result

In [None]:
import matplotlib.pyplot as plt

input_directory = "drive/MyDrive/Final Year Project/redundancy_removed_chains/"
per_day_data = "drive/MyDrive/Final Year Project/per_day_data/"
f_scores = []

# evaluate_algorithm returns [precision, recall, f1, ari, nmi]: ARI is result[3]
# and NMI is result[4]. The previous print statement had the two labels swapped.
report_template = ('Window Size: {}, Birch Threshold: {}, Precision: {:.2f}, Recall: {:.2f}, '
                   'F1-Score: {:.2f}, NMI: {:.2f}, ARI: {:.2f}')

# Window-size sweep (disabled):
# for k in range (2, 21, 2):
#     output_directory = "drive/MyDrive/Final Year Project/output/window_size_"+'{}'.format(k)
#     run_algorithm(per_day_data, output_directory, 2.3, k)
#     result = evaluate_algorithm(input_directory, output_directory)
#     f_scores.append(result[2])
#     print(report_template.format(
#             k, 2.3, result[0], result[1], result[2], result[4], result[3]))

window_size = 8
birch_threshold = 2.25

output_directory = "drive/MyDrive/Final Year Project/output_dbscan"
run_algorithm(per_day_data, output_directory, birch_threshold, window_size)
result = evaluate_algorithm(input_directory, output_directory)
print(report_template.format(
            window_size, birch_threshold, result[0], result[1], result[2], result[4], result[3]))

# Birch-threshold sweep (disabled):
# l = 1
# k = 0.25
# while k <= 3.0:
#     output_directory = "drive/MyDrive/Final Year Project/output/threshold_"+'{}'.format(l)
#     run_algorithm(per_day_data, output_directory, k, 8)
#     result = evaluate_algorithm(input_directory, output_directory)
#     f_scores.append(result[2])
#     print(report_template.format(
#             8, k, result[0], result[1], result[2], result[4], result[3]))
#     l += 1
#     k += 0.25

# Plotting of the sweep results (disabled):
# window_sizes = [2,4,6,8,10,12,14,16,18,20]
# threshold = [0.25,0.50,0.75,1.0,1.25,1.50,1.75,2.0,2.25,2.50,2.75,3.0]
# print(f_scores)
# plt.scatter(window_sizes, f_scores)
# plt.scatter(threshold, f_scores)
# plt.show()

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# NOTE(review): the other cells read and write "drive/MyDrive/..." paths, so this
# cell presumably has to run before them on a fresh runtime even though it is
# the last cell of the notebook - confirm and consider moving it to the top.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
