In [None]:
# Ignore every FutureWarning for the rest of the notebook so that such
# notices do not clutter the cell outputs.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# import the dataset loader
from src.data.dataset import load_dataset, DATA_PATHS

# load the articles stored under the "processed" data path
dataset = load_dataset(DATA_PATHS["processed"])

In [None]:
# pandas is imported here because this cell runs before the notebook's main
# import cell; without it a fresh "Restart & Run All" stops with a NameError.
import pandas as pd

# Tabular view of the loaded articles, displayed for a quick visual check.
data_frame = pd.DataFrame(dataset)
data_frame

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import spacy

# spaCy pipeline shared by the notebook; its `.ents` output feeds the NER step
nlp = spacy.load('en_core_web_sm')

In [None]:
# Keep only the articles whose date lies between x and y (both bounds inclusive)
import datetime

x = datetime.datetime(2020, 10, 1)   # lower bound
y = datetime.datetime(2020, 10, 31)  # upper bound
dataNBA = [a for a in dataset if a["date"] >= x and a["date"] <= y]

In [None]:
dataNBA = pd.DataFrame(dataNBA)

# Keep an article when at least one of the preselected concepts
# (here: basketball OR nba) occurs in its `concepts` value.
selection = ["basketball", "nba"]
mask = dataNBA["concepts"].apply(
    lambda concepts: any(item in concepts for item in selection)
)
dataNBA = dataNBA.loc[mask]
len(dataNBA)

In [None]:
# Keep only the articles whose `lang` value contains the English code
selection = ["eng"]
mask = dataNBA["lang"].apply(
    lambda lang: any(item in lang for item in selection)
)
dataNBA = dataNBA.loc[mask]
len(dataNBA)

In [None]:
# Show which language codes are still present in the filtered articles
dataNBA.lang.unique()

# Finding and deleting duplicate articles

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Removes near-duplicate articles from a dataframe
def del_duplicates(dataframe, threshold=0.8):
    """Drop articles whose body is a near-duplicate of an earlier article.

    Bodies are vectorised with TF-IDF and compared pairwise by cosine
    similarity. Walking the rows in order, every later row whose similarity
    to the current row exceeds ``threshold`` is marked as its duplicate; rows
    already marked as duplicates do not claim duplicates of their own.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Articles; must contain a ``body`` column of strings.
    threshold : float, default 0.8
        Cosine similarity above which two bodies count as duplicates.

    Returns
    -------
    pandas.DataFrame
        An independent frame with a fresh 0..n-1 index (labels of removed
        rows are missing), restricted to the columns title, body,
        duplicate_of, dateTime, date and eventUri that exist in the input.
        ``duplicate_of`` is NaN for every returned row. The input frame is
        left untouched.
    """
    keep_cols = ['title', 'body', 'duplicate_of', 'dateTime', 'date', 'eventUri']

    # reset_index returns a new frame, so adding the helper column here does
    # not leak into the caller's dataframe; the fresh index also makes row
    # labels equal to positions in the similarity matrix.
    deduped = dataframe.reset_index(drop=True)
    deduped['duplicate_of'] = np.nan
    deduped = deduped[deduped.columns.intersection(keep_cols)].copy()

    corpus = deduped["body"].tolist()

    # With fewer than two articles nothing can be a duplicate, and fitting
    # TF-IDF on an empty corpus would raise.
    if len(corpus) >= 2:
        tfidf = TfidfVectorizer().fit_transform(corpus)
        cos_sim = cosine_similarity(tfidf)

        already_duplicate = set()
        for i in range(len(corpus)):
            if i in already_duplicate:
                continue
            for j in range(i + 1, len(corpus)):
                if cos_sim[i, j] > threshold:
                    deduped.at[j, "duplicate_of"] = i
                    already_duplicate.add(j)

    # .copy() so later column assignments on the result do not raise
    # SettingWithCopyWarning.
    return deduped[deduped['duplicate_of'].isna()].copy()

# Binary division with K-means

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def k_means(dataframe):
    """Assign every article to a cluster by recursive 2-means splitting.

    If the frame has no ``embeddings`` column, the ``body`` texts are encoded
    with the ``sentence-transformers/all-mpnet-base-v2`` model first. Each row
    then collects the sequence of 0/1 labels it received during the recursive
    splits, and that sequence is converted into a single integer cluster id.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Articles with a ``body`` column and, optionally, precomputed
        ``embeddings``.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with ``embeddings`` and an integer ``cluster``
        column. The input frame itself is not modified.
    """
    # Work on a copy so the caller's frame (often a filtered slice) is not
    # mutated and no SettingWithCopyWarning is triggered.
    dataframe = dataframe.copy()

    if 'embeddings' not in dataframe.columns:
        bodies = dataframe["body"].tolist()
        model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        embeddings = model.encode(bodies)
        dataframe['embeddings'] = embeddings.tolist()

    # A two-way KMeans split needs at least two samples; smaller inputs are
    # placed in a single cluster instead of crashing.
    if len(dataframe) < 2:
        dataframe['cluster'] = 0
        return dataframe

    # Every row starts with a [-1] placeholder; the split labels are appended
    # to these lists in place by the recursive clustering.
    dataframe['cluster'] = [[-1] for _ in range(len(dataframe))]
    clusterMYDataframe(dataframe)
    clusterNumb(dataframe)
    # clusterNumb appends the final id as the last list element.
    dataframe['cluster'] = dataframe['cluster'].apply(lambda path: path[-1])
    return dataframe


def KmeansCluster(Dataframe):
    """Split the rows of ``Dataframe`` in two with KMeans on their embeddings.

    The 0/1 label of each row is appended in place to that row's list in the
    ``cluster`` column.

    Returns
    -------
    tuple
        ``(rows labelled 0, rows labelled 1, flag)`` where ``flag`` is the
        result of ``cosine`` evaluated on the embeddings of the whole input
        (i.e. before the split).
    """
    embeddings = Dataframe["embeddings"].tolist()

    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto', so relying on the default makes the split version-dependent.
    model = KMeans(n_clusters=2, n_init=10, random_state=42).fit(embeddings)
    labels = model.labels_.tolist()

    addToClusterColumn(Dataframe, labels)

    # The most recent label is the last element of each row's list.
    latest_label = Dataframe.cluster.str[-1]
    dataOne = Dataframe[latest_label == 0]
    dataTwo = Dataframe[latest_label == 1]

    return dataOne, dataTwo, cosine(embeddings)

# Silhouette-based check of a clustering
def silhouette(clusters, embeddings):
    """Return True when the cosine silhouette score of the labelling is at most 0.5."""
    return bool(silhouette_score(embeddings, clusters, metric="cosine") <= 0.5)

# Checks how far the embeddings spread around their centroid
def cosine(embeddings):
    """Return True when the average cosine distance to the centroid exceeds 0.1."""
    return bool(find_average_distance_to_centroid(embeddings) > 0.1)

# True for a cluster that holds 5 or more articles
def condition(Dataframe):
    """Return True when ``Dataframe`` has at least five rows, False otherwise."""
    return Dataframe.shape[0] >= 5

def addToClusterColumn(Dataframe, clusters):
    """Append, in place, the i-th entry of ``clusters`` to the i-th row's ``cluster`` list."""
    for position, path in enumerate(Dataframe.cluster):
        path.append(clusters[position])
        
def clusterMYDataframe(Dataframe):
    """Recursively split ``Dataframe`` in two, recording the labels in place.

    A half is split again only when it passes ``condition`` and the flag
    returned by ``KmeansCluster`` for the current frame is true.
    """
    first_half, second_half, keep_splitting = KmeansCluster(Dataframe)
    for part in (first_half, second_half):
        if condition(part) and keep_splitting:
            clusterMYDataframe(part)

        
#CLUSTER NUMBER
def MaxListLength(Dataframe):
    """Return the length of the longest list in the ``cluster`` column (0 when there are no rows)."""
    return max((len(path) for path in Dataframe.cluster), default=0)

# Pads a binary label list with trailing zeros up to the maximum length, so
# that every list can be turned into a unique decimal number.
def ZerosToList(mylist, maxLength):
    """Return ``mylist`` right-padded with zeros to ``maxLength`` elements."""
    missing = maxLength - len(mylist)
    return list(np.pad(np.asarray(mylist), (0, missing), 'constant'))

def clusterNumb(Dataframe):
    """Append a decimal cluster id to every row's ``cluster`` list, in place.

    For each row the first list element is removed, the remaining 0/1 labels
    are zero-padded on the right and read as a binary number, and that number
    is appended to the same list.
    """
    # Measured before the first element is removed, so the padded width is
    # one more than the longest remaining label sequence.
    width = MaxListLength(Dataframe)
    for path in Dataframe.cluster:
        del path[0]  # drop the leading element (k_means seeds each list with -1)
        bits = "".join(map(str, ZerosToList(path, width)))
        path.append(int(bits, 2))

# TIME window

In [None]:
from datetime import timedelta

def TimeWindow(dataframe, IndexDateLIst, NumOfDays):
    """Keep only the rows that fall inside the densest ``NumOfDays`` window.

    Every date in ``IndexDateLIst`` is tried as a window start; the window is
    the half-open interval ``[start, start + NumOfDays days)``. The window
    containing the most entries wins (the earliest one on ties), and the rows
    of every entry outside it are dropped from ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to drop rows from.
    IndexDateLIst : list of (index label, date) tuples
        Candidate rows with their dates; rows not listed here are never dropped.
    NumOfDays : int or float
        Window length in days.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` without the listed rows that lie outside the window.
    """
    ordered = sorted(IndexDateLIst, key=lambda pair: pair[1])
    span = timedelta(days=NumOfDays)

    best_start = None
    best_end = None
    best_count = 0

    for i in range(len(ordered)):
        window_start = ordered[i][1]
        window_end = window_start + span
        count = 1
        for j in range(i + 1, len(ordered)):
            if ordered[j][1] >= window_end:
                break
            count += 1

        if count > best_count:
            best_count = count
            best_start = window_start
            best_end = window_end

    # The end bound is exclusive here as well, matching the counting loop
    # above; previously a row dated exactly at the window end was kept even
    # though it had not been counted as part of the window.
    # (With an empty list nothing is compared and nothing is dropped.)
    outside = [
        label for label, date in ordered
        if date < best_start or date >= best_end
    ]

    return dataframe.drop(index=outside)


def Time(dataframe, NumOfDays):
    """Apply ``TimeWindow`` to every cluster in turn and return the reduced frame."""
    for cluster_id in dataframe.cluster.unique():
        members = dataframe[dataframe.cluster == cluster_id]
        index_dates = list(zip(members.index, members.date))
        dataframe = TimeWindow(dataframe, index_dates, NumOfDays)
    return dataframe

# Cosine similarity

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_centroid(embeddings):
    """Return the element-wise mean of the embeddings (mean along axis 0)."""
    return np.mean(embeddings, axis=0)

def find_average_distance_to_centroid(embeddings):
    """Return the mean cosine distance (1 - cosine similarity) between each embedding and the centroid."""
    # cosine_similarity expects 2-D input, hence the centroid as a single row
    center = find_centroid(embeddings).reshape(1, -1)
    similarity_to_center = cosine_similarity(embeddings, center).ravel()
    return np.mean(1 - similarity_to_center)

def inside_cluster_sim(dataframe):
    """Drop clusters whose articles are not similar enough to each other.

    A cluster is removed when it has a single article, when it has two
    articles whose cosine similarity is below 0.85, or when the average
    cosine distance of its embeddings to their centroid is above 0.2.
    Returns the rows of the remaining clusters.
    """
    rejected = []
    for cluster in dataframe.cluster.unique():
        members = dataframe[dataframe.cluster == cluster]
        vectors = members["embeddings"].tolist()
        size = len(members)

        if size == 1:
            rejected.append(cluster)
        elif size == 2 and cosine_similarity(vectors, vectors)[0][1] < 0.85:
            rejected.append(cluster)
        # pairs that pass the 0.85 check are still subject to the centroid test
        elif find_average_distance_to_centroid(vectors) > 0.2:
            rejected.append(cluster)

    return dataframe[~dataframe.cluster.isin(rejected)]

# NER (Named entity recognition)

In [None]:
# Jaccard similarity of two collections, treated as sets
def jaccard_similarity(list1, list2):
    """Return |A ∩ B| / |A ∪ B| for the sets built from the two inputs.

    Duplicated items within an input are counted once. When both inputs are
    empty the union is empty as well; 0.0 is returned in that case (no shared
    items) instead of raising ZeroDivisionError.
    """
    first, second = set(list1), set(list2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size


def cluster_ents(cluster, dataframe):
    """Return ``[row label, [entity texts]]`` for every article of ``cluster``.

    The entity texts are the stripped ``.text`` of each item in ``doc.ents``
    produced by the notebook-level ``nlp`` pipeline for the article body.
    """
    bodies = dataframe[dataframe.cluster == cluster]["body"]
    return [
        [row_label, [entity.text.strip() for entity in nlp(text).ents]]
        for row_label, text in bodies.items()
    ]


def cluster_jaccard_similarity(entitete, min_similarity=0.1):
    """Return the row labels of articles whose entities barely overlap with the rest.

    Parameters
    ----------
    entitete : list of [row label, list of entity strings]
        One entry per article of a single cluster.
    min_similarity : float, default 0.1
        An article is flagged when its mean Jaccard similarity to all other
        articles of the cluster is below this value.

    Returns
    -------
    list
        Row labels of the flagged articles. With fewer than two articles
        there is nothing to compare against, so the list is empty (this
        previously raised ZeroDivisionError for a single article).
    """
    others = len(entitete) - 1
    if others < 1:
        return []

    delRows = []
    for i in range(len(entitete)):
        total = 0
        for j in range(len(entitete)):
            if i != j:
                total = total + jaccard_similarity(entitete[i][1], entitete[j][1])

        if total / others < min_similarity:
            delRows.append(entitete[i][0])
    return delRows


def NERjaccard(dataframe):
    """Drop the articles that ``cluster_jaccard_similarity`` flags in any cluster.

    For every cluster the named entities of its articles are extracted with
    ``cluster_ents``; the flagged row labels of all clusters are collected and
    the rows with those labels are removed from the returned frame.
    """
    rows_to_drop = []
    for cluster in dataframe.cluster.unique():
        rows_to_drop += cluster_jaccard_similarity(cluster_ents(cluster, dataframe))
    return dataframe[~dataframe.index.isin(rows_to_drop)]

# Reading

In [None]:
import re

class TextFormat:
    """Stateless helpers for tidying up whitespace in text."""

    @staticmethod
    def normalize_whitespaces(text: str):
        """Replace every run of whitespace characters with a single space."""
        return re.compile(r"\s+").sub(" ", text)

    @staticmethod
    def strip_trailing_whitespaces(text: str):
        """Remove whitespace from both ends of ``text``."""
        return text.strip()

    @staticmethod
    def clean_text(text: str):
        """Collapse whitespace runs, then strip both ends."""
        collapsed = TextFormat.normalize_whitespaces(text)
        return TextFormat.strip_trailing_whitespaces(collapsed)
    
def clean_body(dataframe):
    """Clean the ``body`` and ``title`` columns in place with ``TextFormat.clean_text`` and return the frame."""
    for column in ("body", "title"):
        dataframe[column] = dataframe[column].apply(TextFormat.clean_text)
    return dataframe

# ENTIRE pipeline

In [None]:
def _cluster_round(dataframe, days):
    """Run one clustering round: k-means, time window, cohesion and NER filters."""
    dataframe = k_means(dataframe)
    dataframe = Time(dataframe, days)
    dataframe = inside_cluster_sim(dataframe)
    return NERjaccard(dataframe)


def cluster_by_event(dataframe, days):
    """Run the whole pipeline: de-duplicate, cluster, then filter the clusters.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Articles to cluster.
    days : int or float
        Length of the time window (in days) passed on to ``Time``.

    Returns
    -------
    pandas.DataFrame
        The articles that survive every filter, with their ``cluster`` id.
    """
    dataframe = del_duplicates(dataframe)
    orig = dataframe.copy(deep=True)
    clustered = pd.DataFrame()

    # Only one round is run. The branch for later rounds re-clusters the
    # articles that no earlier round kept.
    # NOTE(review): cluster ids of a later round are not offset, so they could
    # repeat ids from an earlier round - confirm before raising the count.
    n_rounds = 1
    for round_no in tqdm(range(n_rounds)):
        if round_no == 0:
            clustered = _cluster_round(dataframe, days).copy(deep=True)
        else:
            leftovers = orig.drop(clustered.index)
            clustered = pd.concat(
                [clustered, _cluster_round(leftovers, days)]
            ).sort_index()
    return clustered

In [None]:
# Run the full pipeline on the filtered articles; the second argument (2) is the time-window length in days
testing = cluster_by_event(dataNBA, 2)

In [None]:
# Display the clustered articles returned by the pipeline
testing