# Clustering papers based on titles, keywords and topics

In [1]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import string
import itertools
from tqdm import tqdm_notebook as tqdm

In [2]:
# Reading data from a csv file
df_train = pd.read_csv("AAAI.csv")

In [3]:
df_train

Unnamed: 0,Title,Keywords,Topics,High-Level Keyword(s),Abstract
0,The cascade auction – a mechanism for deterrin...,Mediators\nAuctions\nCollusion\nAd Exchanges,Auctions and Market-Based Systems\nE-Commerce\...,Multiagent Systems,We introduce a sealed bid auction of a single ...
1,Basis Adaptation for Sparse Nonlinear Reinforc...,Reinforcement learning\nSparsity\nMirror desce...,Dimension Reduction/Feature Selection\nOnline ...,Machine Learning,This paper presents a new approach to basis ad...
2,Optimal Coalition Structures in Cooperative Gr...,Cooperative Game Theory\nCoalition Structure G...,Coordination and Collaboration\nGame Theory,Multiagent Systems,Representation languages for coalitional game...
3,External Memory Best-First Search for Multiple...,External-Memory Search\nParallel Search\nMulti...,Heuristic Search\nEvaluation and Analysis (Sea...,Heuristic Search and Optimization,Multiple sequence alignment (MSA) is a central...
4,Posted Prices Exchange for Display Advertising...,Display Advertising\nDynamic Pricing\nMarket E...,Auctions and Market-Based Systems\nE-Commerce\...,Multiagent Systems,We propose a new market design for display adv...
...,...,...,...,...,...
145,Probabilistic Sense Sentiment Similarity throu...,Sentiment Similarity\nIndirect yse/no Question...,Information Extraction\nQuestion Answering\nNa...,Natural Language Processing,Sentiment Similarity of word pairs reflects th...
146,Strategic Behavior when Allocating Indivisible...,Fair division\nElicition free protocol\nBackwa...,Game Theory\nMechanism Design,Multiagent Systems,We study a simple sequential allocation mechan...
147,A Pattern Matching Based Graphical Model for Q...,Opinion Question\nSubjectivity Detection\nOpin...,Natural Language Processing (General/Other),Natural Language Processing,This paper presents the results of developing ...
148,Grounding Natural Language References to Unvis...,Human-robot interaction\nIntegrated perception...,Natural Language Processing (General/Other)\nR...,Natural Language Processing,While much research exists on resolving spatia...


In [4]:
all_possible_predictions = list(set(df_train["High-Level Keyword(s)"]))
print(f"All possible prediction classes are {all_possible_predictions}")

All possible prediction classes are ['Heuristic Search and Optimization', 'Multidisciplinary Topics', 'Constraints and Satisfiability', 'Reasoning about Plans, Processes, and Actions', 'Natural Language Processing', 'Multiagent Systems', 'Knowledge Representation and Reasoning', 'Robotics', 'Machine Learning']


In [5]:
print(f"Printing length of clusters:\n")
for i in all_possible_predictions:
    num = df_train[df_train["High-Level Keyword(s)"] == i].shape[0]
    print(f"{i}: {num}")

Printing length of clusters:

Heuristic Search and Optimization: 9
Multidisciplinary Topics: 7
Constraints and Satisfiability: 10
Reasoning about Plans, Processes, and Actions: 12
Natural Language Processing: 13
Multiagent Systems: 32
Knowledge Representation and Reasoning: 17
Robotics: 5
Machine Learning: 45


In [6]:
# sorting all_possible_predictions list on number of predictions in dataset 
all_possible_predictions = sorted(all_possible_predictions, key = lambda x: df_train[df_train["High-Level Keyword(s)"] == x].shape[0])
all_possible_predictions = all_possible_predictions[::-1]
all_possible_predictions

['Machine Learning',
 'Multiagent Systems',
 'Knowledge Representation and Reasoning',
 'Natural Language Processing',
 'Reasoning about Plans, Processes, and Actions',
 'Constraints and Satisfiability',
 'Heuristic Search and Optimization',
 'Multidisciplinary Topics',
 'Robotics']

In [7]:
# Removing the High-Level Keyword(s) column, X_train is the feature vector
X_train = df_train.drop("High-Level Keyword(s)", axis=1)

In [8]:
X_train

Unnamed: 0,Title,Keywords,Topics,Abstract
0,The cascade auction – a mechanism for deterrin...,Mediators\nAuctions\nCollusion\nAd Exchanges,Auctions and Market-Based Systems\nE-Commerce\...,We introduce a sealed bid auction of a single ...
1,Basis Adaptation for Sparse Nonlinear Reinforc...,Reinforcement learning\nSparsity\nMirror desce...,Dimension Reduction/Feature Selection\nOnline ...,This paper presents a new approach to basis ad...
2,Optimal Coalition Structures in Cooperative Gr...,Cooperative Game Theory\nCoalition Structure G...,Coordination and Collaboration\nGame Theory,Representation languages for coalitional game...
3,External Memory Best-First Search for Multiple...,External-Memory Search\nParallel Search\nMulti...,Heuristic Search\nEvaluation and Analysis (Sea...,Multiple sequence alignment (MSA) is a central...
4,Posted Prices Exchange for Display Advertising...,Display Advertising\nDynamic Pricing\nMarket E...,Auctions and Market-Based Systems\nE-Commerce\...,We propose a new market design for display adv...
...,...,...,...,...
145,Probabilistic Sense Sentiment Similarity throu...,Sentiment Similarity\nIndirect yse/no Question...,Information Extraction\nQuestion Answering\nNa...,Sentiment Similarity of word pairs reflects th...
146,Strategic Behavior when Allocating Indivisible...,Fair division\nElicition free protocol\nBackwa...,Game Theory\nMechanism Design,We study a simple sequential allocation mechan...
147,A Pattern Matching Based Graphical Model for Q...,Opinion Question\nSubjectivity Detection\nOpin...,Natural Language Processing (General/Other),This paper presents the results of developing ...
148,Grounding Natural Language References to Unvis...,Human-robot interaction\nIntegrated perception...,Natural Language Processing (General/Other)\nR...,While much research exists on resolving spatia...


# Cleaning Text 

In [9]:
def remove_punctuation(s):
    '''
    s: string 
    String will be returned by removing punctuations and newline character '\n' from s
    '''
    s = s.replace('\n', ' ')
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
    s = s.translate(translator)
    
    # remove redundant spaces after replacing puncutations and newline with spaces
    s = re.sub(' +', ' ', s)
    return s

def clean_df(df):
    '''
    df: Pandas DataFrame
    Inplace changes dataframe
    '''
    cols = df.columns
    for col in cols:
        for i in range(len(df[col])):
            df[col][i] = remove_punctuation(df[col][i])

In [10]:
# Cleaning X_train
clean_df(X_train)

In [11]:
ord(df_train["Title"][0][20])

8211

In [12]:
ord('-')

45

In [13]:
ord(df_train["Title"][28][31])

8212

In [14]:
X_train

Unnamed: 0,Title,Keywords,Topics,Abstract
0,The cascade auction – a mechanism for deterrin...,Mediators Auctions Collusion Ad Exchanges,Auctions and Market Based Systems E Commerce G...,We introduce a sealed bid auction of a single ...
1,Basis Adaptation for Sparse Nonlinear Reinforc...,Reinforcement learning Sparsity Mirror descent...,Dimension Reduction Feature Selection Online L...,This paper presents a new approach to basis ad...
2,Optimal Coalition Structures in Cooperative Gr...,Cooperative Game Theory Coalition Structure Ge...,Coordination and Collaboration Game Theory,Representation languages for coalitional games...
3,External Memory Best First Search for Multiple...,External Memory Search Parallel Search Multipl...,Heuristic Search Evaluation and Analysis Searc...,Multiple sequence alignment MSA is a central p...
4,Posted Prices Exchange for Display Advertising...,Display Advertising Dynamic Pricing Market Equ...,Auctions and Market Based Systems E Commerce M...,We propose a new market design for display adv...
...,...,...,...,...
145,Probabilistic Sense Sentiment Similarity throu...,Sentiment Similarity Indirect yse no Question ...,Information Extraction Question Answering Natu...,Sentiment Similarity of word pairs reflects th...
146,Strategic Behavior when Allocating Indivisible...,Fair division Elicition free protocol Backward...,Game Theory Mechanism Design,We study a simple sequential allocation mechan...
147,A Pattern Matching Based Graphical Model for Q...,Opinion Question Subjectivity Detection Opinio...,Natural Language Processing General Other,This paper presents the results of developing ...
148,Grounding Natural Language References to Unvis...,Human robot interaction Integrated perception ...,Natural Language Processing General Other Robo...,While much research exists on resolving spatia...


# Utility Funcitons ( Jacardian Similarity, getting thresholds etc.)

JC = $\dfrac{H \cap S}{H \cup S}$

In [15]:
def Intersection(H, S):
    '''
    Returns a list Intersection of H and S set
    '''
    return set(H) - (set(H) - set(S))

def Union(H, S):
    '''
    Returns a list of union of H and S set
    '''
    return set(H + S)

def Jacardian_score(H, S):
    '''
    Returns jacardian similarity
    '''
    if len(Union(H, S)) == 0:
        print(f"Empty H and S Passed")
        return 0
    return len(Intersection(H, S))/ len(Union(H, S))

def dist(H, S):
    '''
        Returns Jacardian Score
    '''
    return Jacardian_score(H, S)

def calc_dist(a, b, f, X):
    '''
    a: list of indices of cluster from X which belongs to one segment
    b: list of indices of cluster from X which belongs to one segment
    f: funtion, for single use min, multi use max
    Uses function dist to calculate jacardian distance between two clusters a and b
    '''
    distances = []
    for i in a:
        for j in b:
            distances.append(dist(X[i], X[j]))
    return f(distances)

# Agglomerative Clustering

In [18]:
def agglomerative_clustering_single(X_train, n_cluster):
    '''
    X_train: Training dataset on which it will calculate similarity
    n_cluster: Number of clusters required
    Returns a cluster list which will contain informations of clusters
    '''
    #Every points in datset is a single cluster initially
    clusters = []
    for i in range(len(X_train)):
        clusters.append([i])
    # curr cluster is number of current clusters, dist_store will store distances
    curr_clusters = len(X_train)
    dist_store = []
    while curr_clusters > n_cluster:
        dist_store.clear()
        for i in range(curr_clusters):
            for j in range(i+1, curr_clusters):
                dist_store.append([calc_dist(clusters[i], clusters[j], min, X_train), j, i])
        # sorting distances in ascending order and merging two lists having min distances
        dist_store = sorted(dist_store, key = lambda x:x[0])
        # now merge first one
        to_merge = dist_store[0]
        clusters[to_merge[1]].extend(clusters[to_merge[2]])
        del clusters[to_merge[2]]
        curr_clusters -= 1
    clusters = sorted(clusters, key = lambda x: len(x))
    clusters = clusters[::-1]
    return clusters
    

In [19]:
def agglomerative_clustering_multi(X_train, n_cluster):
    '''
    X_train: Training dataset on which it will calculate similarity
    n_cluster: Number of clusters required
    Returns a cluster list which will contain informations of clusters
    '''
    #Every points in datset is a single cluster initially
    clusters = []
    for i in range(len(X_train)):
        clusters.append([i])
    # curr cluster is number of current clusters, dist_store will store distances
    curr_clusters = len(X_train)
    dist_store = []
    while curr_clusters > n_cluster:
        dist_store.clear()
        for i in range(curr_clusters):
            for j in range(i+1, curr_clusters):
                dist_store.append([calc_dist(clusters[i], clusters[j], max, X_train), j, i])
        # sorting distances in ascending order and merging two lists having min distances
        dist_store = sorted(dist_store, key = lambda x:x[0])
        # now merge first one
        to_merge = dist_store[0]
        clusters[to_merge[1]].extend(clusters[to_merge[2]])
        del clusters[to_merge[2]]
        curr_clusters -= 1
    clusters = sorted(clusters, key = lambda x: len(x))
    clusters = clusters[::-1]
    return clusters
    

# Clustering based on Title

In [20]:
X_title = X_train["Title"].str.split(" ")

In [21]:
title_pred_single = agglomerative_clustering_single(X_title, 9)

In [64]:
print(f"Printing clusters for single linkage hierarchical clustering, considering only title")
for i, cluster in enumerate(title_pred_single):
    print(f"{i}: {cluster}")

Printing clusters for single linkage hierarchical clustering, considering only title
0: [141, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
1: [149]
2: [148]
3: [147]
4: [146]
5: [145]
6: [144]
7: [143]
8: [142]


In [23]:
title_pred_multi = agglomerative_clustering_multi(X_title, 9)

In [66]:
print(f"Printing clusters for multi linkage hierarchical clustering, considering only title")
for i, cluster in enumerate(title_pred_multi):
    print(f"{i}: {cluster}")

Printing clusters for multi linkage hierarchical clustering, considering only title
0: [148, 127, 111, 94, 84, 71, 62, 54, 49, 46, 36, 31, 28, 23, 21, 18, 15, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0]
1: [149, 129, 113, 95, 85, 73, 63, 56, 55, 50, 45, 41, 40, 39, 38, 34, 30, 142, 125, 109, 104, 131, 128]
2: [144, 123, 105, 93, 83, 79, 72, 143, 136, 119, 97, 86, 74, 64, 57, 51, 47, 43, 134, 116, 112, 106]
3: [146, 126, 110, 92, 81, 69, 60, 53, 44, 35, 32, 29, 26, 24, 20, 16, 14, 139, 130]
4: [140, 118, 96, 87, 75, 61, 52, 48, 42, 37, 33, 27, 25, 22, 19, 17, 13, 11, 9]
5: [141, 122, 107, 103, 135, 117, 115, 133, 114, 100, 98, 88, 76, 67, 59]
6: [147, 132, 137, 120, 99, 90, 77, 68, 65]
7: [145, 124, 108, 89, 82, 78, 70, 66, 58]
8: [138, 121, 102, 101, 91, 80]


# Clustering based on Keywords

In [25]:
X_keywords = X_train["Keywords"].str.split(" ")

In [26]:
keywords_pred_single = agglomerative_clustering_single(X_keywords, 9)

In [68]:
print(f"Printing clusters for single linkage hierarchical clustering, considering only keywords")
for i, cluster in enumerate(keywords_pred_single):
    print(f"{i}: {cluster}")

Printing clusters for single linkage hierarchical clustering, considering only keywords
0: [141, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
1: [149]
2: [148]
3: [147]
4: [146]
5: [145]
6: [144]
7: [143]
8: [142]


In [28]:
keywords_pred_multi = agglomerative_clustering_multi(X_keywords, 9)

In [67]:
print(f"Printing clusters for multi linkage hierarchical clustering, considering only keywords")
for i, cluster in enumerate(keywords_pred_multi):
    print(f"{i}: {cluster}")

Printing clusters for multi linkage hierarchical clustering, considering only keywords
0: [139, 119, 101, 87, 77, 67, 56, 46, 132, 121, 103, 128, 109, 93, 83, 71, 62, 54, 39, 33, 29, 23, 19, 15, 126, 107, 91, 81, 70, 60, 50, 43]
1: [149, 127, 108, 98, 134, 117, 147, 118, 99, 86, 73, 65, 57, 47, 37, 31, 26, 25, 21, 18, 13, 10, 8, 6, 4]
2: [141, 112, 95, 84, 74, 64, 53, 42, 35, 30, 24, 20, 16, 12, 133, 138, 123, 105, 90, 79, 69, 59, 49, 41]
3: [145, 113, 96, 76, 63, 51, 40, 34, 28, 22, 17, 14, 11, 9, 7, 5, 3, 2, 1, 0]
4: [148, 124, 106, 144, 129, 114, 130, 140, 116, 97, 85, 75, 66, 55, 45, 38, 32, 27]
5: [136, 111, 92, 82, 72, 61, 52, 44, 36, 131, 115, 100]
6: [143, 135, 110, 94, 88, 78, 68, 58, 48]
7: [146, 120, 102, 89, 80]
8: [142, 125, 137, 122, 104]


# Clustering on Topics

In [30]:
X_topics = X_train["Topics"].str.split(" ")

In [31]:
topics_pred_single = agglomerative_clustering_single(X_topics, 9)

In [69]:
print(f"Printing clusters for single linkage hierarchical clustering, considering only topics")
for i, cluster in enumerate(topics_pred_single):
    print(f"{i}: {cluster}")

Printing clusters for single linkage hierarchical clustering, considering only topics
0: [141, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
1: [149]
2: [148]
3: [147]
4: [146]
5: [145]
6: [144]
7: [143]
8: [142]


In [33]:
topics_pred_multi = agglomerative_clustering_multi(X_topics, 9)

In [71]:
print(f"Printing clusters for multi linkage hierarchical clustering, considering only topics")
for i, cluster in enumerate(topics_pred_multi):
    print(f"{i}: {cluster}")

Printing clusters for multi linkage hierarchical clustering, considering only topics
0: [146, 71, 39, 37, 27, 26, 109, 117, 90, 124, 67, 48, 119, 116, 107, 89, 58, 115, 91, 56, 35, 18, 16, 88, 55, 36, 30]
1: [145, 136, 73, 44, 33, 32, 106, 77, 126, 92, 144, 108, 137, 85, 70, 74, 46, 21, 20, 128, 93]
2: [133, 96, 61, 127, 97, 62, 103, 104, 82, 130, 69, 31, 25, 15, 11, 5, 3, 1, 0, 110, 63]
3: [149, 138, 99, 118, 113, 131, 75, 60, 45, 87, 120, 111, 65, 141, 129, 72, 49, 34, 125, 123]
4: [148, 143, 105, 86, 98, 83, 81, 52, 42, 139, 134, 100, 53, 41, 19, 14, 114]
5: [135, 79, 54, 122, 112, 66, 43, 24, 12, 10, 94, 76, 101, 64]
6: [140, 80, 57, 121, 68, 47, 40, 132, 95, 59, 38]
7: [142, 102, 78, 50, 29, 23, 22, 13, 8, 6]
8: [147, 84, 51, 28, 17, 9, 7, 4, 2]


# Clustering on Abstract

In [35]:
X_abstract = X_train["Abstract"].str.split(" ")

In [36]:
abstract_pred_single = agglomerative_clustering_single(X_abstract, 9)

In [73]:
print(f"Printing clusters for single linkage hierarchical clustering, considering only abstract")
for i, cluster in enumerate(abstract_pred_single):
    print(f"{i}: {cluster}")

Printing clusters for single linkage hierarchical clustering, considering only abstract
0: [149, 136, 138, 70, 144, 117, 55, 73, 88, 0, 93, 8, 11, 124, 25, 20, 9, 78, 4, 41, 63, 75, 36, 54, 126, 98, 68, 137, 109, 39, 5, 97, 129, 12, 120, 108, 79, 34, 72, 60, 100, 59, 106, 19, 51, 17, 22, 118, 85, 52, 30, 35, 119, 116, 6, 146, 42, 33, 111, 113, 135, 90, 62, 87, 128, 145, 50, 127, 141, 31, 53, 71, 47, 15, 82, 56, 107, 121, 18, 66, 57, 2, 28, 49, 16, 94, 83, 95, 134, 76, 23, 58, 104, 61, 114, 96, 115, 105, 148, 131, 29, 48, 7, 99, 46, 122, 86, 89, 1, 37, 101, 80, 64, 103, 110, 67, 123, 133, 65, 69, 74, 132, 140, 38, 147, 139, 13, 26, 43, 130, 24, 92, 3, 45, 40, 21, 77, 102, 143, 91, 27, 142]
1: [125]
2: [112]
3: [84]
4: [81]
5: [44]
6: [32]
7: [14]
8: [10]


In [38]:
abstract_pred_multi = agglomerative_clustering_multi(X_abstract, 9)

In [72]:
print(f"Printing clusters for multi linkage hierarchical clustering, considering only abstract")
for i, cluster in enumerate(abstract_pred_multi):
    print(f"{i}: {cluster}")

Printing clusters for multi linkage hierarchical clustering, considering only abstract
0: [146, 42, 132, 47, 125, 122, 138, 31, 72, 60, 88, 18, 126, 28, 23, 102, 145, 106, 134, 76, 51, 81, 21, 11, 0, 139, 65, 133, 97, 104, 61, 66, 24, 112, 43, 96, 13, 94, 83]
1: [147, 137, 5, 111, 105, 90, 62, 3, 79, 34, 57, 141, 30, 110, 119, 50, 7, 118, 75, 36, 92, 10]
2: [149, 136, 69, 127, 59, 44, 131, 107, 115, 35, 129, 12, 70, 38, 1]
3: [142, 87, 6, 2, 135, 33, 80, 32, 73, 25, 49, 48, 67, 40]
4: [130, 74, 120, 20, 9, 93, 8, 53, 63, 17, 101, 37, 58, 14]
5: [143, 123, 113, 77, 91, 140, 100, 124, 103, 95, 89, 82, 56]
6: [128, 78, 4, 27, 116, 54, 109, 39, 99, 46, 98, 68, 22]
7: [148, 121, 144, 71, 114, 15, 86, 29, 26, 64, 45]
8: [117, 55, 84, 108, 41, 19, 85, 52, 16]


# Removing stopwords, and again repeating above works

In [40]:
from nltk.corpus import stopwords

In [41]:
stop_words = set(stopwords.words('english'))

## Clustering based on Title

In [42]:
X_title = X_train["Title"].str.split(" ")
X_title = X_title.apply(lambda x: [item for item in x if item not in stop_words])

In [43]:
title_pred_single = agglomerative_clustering_single(X_title, 9)

In [44]:
print(title_pred_single)

[[141, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], [149], [148], [147], [146], [145], [144], [143], [142]]


In [45]:
title_pred_multi = agglomerative_clustering_multi(X_title, 9)

In [76]:
print(f"Printing clusters for multi linkage hierarchical clustering, considering only topics")
for i, cluster in enumerate(topics_pred_multi):
    print(f"{i}: {cluster}")

Printing clusters for multi linkage hierarchical clustering, considering only topics
0: [146, 71, 39, 37, 27, 26, 109, 117, 90, 124, 67, 48, 119, 116, 107, 89, 58, 115, 91, 56, 35, 18, 16, 88, 55, 36, 30]
1: [145, 136, 73, 44, 33, 32, 106, 77, 126, 92, 144, 108, 137, 85, 70, 74, 46, 21, 20, 128, 93]
2: [133, 96, 61, 127, 97, 62, 103, 104, 82, 130, 69, 31, 25, 15, 11, 5, 3, 1, 0, 110, 63]
3: [149, 138, 99, 118, 113, 131, 75, 60, 45, 87, 120, 111, 65, 141, 129, 72, 49, 34, 125, 123]
4: [148, 143, 105, 86, 98, 83, 81, 52, 42, 139, 134, 100, 53, 41, 19, 14, 114]
5: [135, 79, 54, 122, 112, 66, 43, 24, 12, 10, 94, 76, 101, 64]
6: [140, 80, 57, 121, 68, 47, 40, 132, 95, 59, 38]
7: [142, 102, 78, 50, 29, 23, 22, 13, 8, 6]
8: [147, 84, 51, 28, 17, 9, 7, 4, 2]


## Clustering based on Keywords

In [47]:
X_keywords = X_train["Keywords"].str.split(" ")
X_keywords = X_keywords.apply(lambda x: [item for item in x if item not in stop_words])

In [48]:
keywords_pred_single = agglomerative_clustering_single(X_keywords, 9)

In [79]:
print(f"Printing clusters for single linkage hierarchical clustering, considering only keywords")
for i, cluster in enumerate(keywords_pred_single):
    print(f"{i}: {cluster}")

Printing clusters for single linkage hierarchical clustering, considering only keywords
0: [141, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
1: [149]
2: [148]
3: [147]
4: [146]
5: [145]
6: [144]
7: [143]
8: [142]


In [50]:
keywords_pred_multi = agglomerative_clustering_multi(X_keywords, 9)

In [78]:
print(f"Printing clusters for multi linkage hierarchical clustering, considering only keywords")
for i, cluster in enumerate(keywords_pred_multi):
    print(f"{i}: {cluster}")

Printing clusters for multi linkage hierarchical clustering, considering only keywords
0: [139, 119, 101, 87, 77, 67, 56, 46, 132, 121, 103, 128, 109, 93, 83, 71, 62, 54, 39, 33, 29, 23, 19, 15, 126, 107, 91, 81, 70, 60, 50, 43]
1: [149, 127, 108, 98, 134, 117, 147, 118, 99, 86, 73, 65, 57, 47, 37, 31, 26, 25, 21, 18, 13, 10, 8, 6, 4]
2: [141, 112, 95, 84, 74, 64, 53, 42, 35, 30, 24, 20, 16, 12, 133, 138, 123, 105, 90, 79, 69, 59, 49, 41]
3: [145, 113, 96, 76, 63, 51, 40, 34, 28, 22, 17, 14, 11, 9, 7, 5, 3, 2, 1, 0]
4: [148, 124, 106, 144, 129, 114, 130, 140, 116, 97, 85, 75, 66, 55, 45, 38, 32, 27]
5: [136, 111, 92, 82, 72, 61, 52, 44, 36, 131, 115, 100]
6: [143, 135, 110, 94, 88, 78, 68, 58, 48]
7: [146, 120, 102, 89, 80]
8: [142, 125, 137, 122, 104]


## Clustering on Topics

In [52]:
X_topics = X_train["Topics"].str.split(" ")
X_topics = X_topics.apply(lambda x: [item for item in x if item not in stop_words])

In [53]:
topics_pred_single = agglomerative_clustering_single(X_topics, 9)

In [77]:
print(f"Printing clusters for single linkage hierarchical clustering, considering only topics")
for i, cluster in enumerate(topics_pred_single):
    print(f"{i}: {cluster}")

Printing clusters for single linkage hierarchical clustering, considering only topics
0: [141, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
1: [149]
2: [148]
3: [147]
4: [146]
5: [145]
6: [144]
7: [143]
8: [142]


In [55]:
topics_pred_multi = agglomerative_clustering_multi(X_topics, 9)

In [56]:
print(topics_pred_multi)

[[146, 71, 39, 37, 27, 26, 109, 117, 90, 124, 67, 48, 119, 116, 107, 89, 58, 115, 91, 56, 35, 18, 16, 88, 55, 36, 30], [145, 136, 73, 44, 33, 32, 106, 77, 126, 92, 144, 108, 137, 85, 70, 74, 46, 21, 20, 128, 93], [133, 96, 61, 127, 97, 62, 103, 104, 82, 130, 69, 31, 25, 15, 11, 5, 3, 1, 0, 110, 63], [149, 138, 99, 118, 113, 131, 75, 60, 45, 87, 120, 111, 65, 141, 129, 72, 49, 34, 125, 123], [148, 143, 105, 86, 98, 83, 81, 52, 42, 139, 134, 100, 53, 41, 19, 14, 114], [135, 79, 54, 122, 112, 66, 43, 24, 12, 10, 94, 76, 101, 64], [140, 80, 57, 121, 68, 47, 40, 132, 95, 59, 38], [142, 102, 78, 50, 29, 23, 22, 13, 8, 6], [147, 84, 51, 28, 17, 9, 7, 4, 2]]


## Clustering on Abstract

In [57]:
X_abstract = X_train["Abstract"].str.split(" ")
X_abstract = X_abstract.apply(lambda x: [item for item in x if item not in stop_words])

In [58]:
abstract_pred_single = agglomerative_clustering_single(X_abstract, 9)

In [74]:
print(f"Printing clusters for single linkage hierarchical clustering, considering only abstract")
for i, cluster in enumerate(abstract_pred_single):
    print(f"{i}: {cluster}")

Printing clusters for single linkage hierarchical clustering, considering only abstract
0: [149, 136, 138, 70, 144, 117, 55, 73, 88, 0, 93, 8, 11, 124, 25, 20, 9, 78, 4, 41, 63, 75, 36, 54, 126, 98, 68, 137, 109, 39, 5, 97, 129, 12, 120, 108, 79, 34, 72, 60, 100, 59, 106, 19, 51, 17, 22, 118, 85, 52, 30, 35, 119, 116, 6, 146, 42, 33, 111, 113, 135, 90, 62, 87, 128, 145, 50, 127, 141, 31, 53, 71, 47, 15, 82, 56, 107, 121, 18, 66, 57, 2, 28, 49, 16, 94, 83, 95, 134, 76, 23, 58, 104, 61, 114, 96, 115, 105, 148, 131, 29, 48, 7, 99, 46, 122, 86, 89, 1, 37, 101, 80, 64, 103, 110, 67, 123, 133, 65, 69, 74, 132, 140, 38, 147, 139, 13, 26, 43, 130, 24, 92, 3, 45, 40, 21, 77, 102, 143, 91, 27, 142]
1: [125]
2: [112]
3: [84]
4: [81]
5: [44]
6: [32]
7: [14]
8: [10]


In [60]:
abstract_pred_multi = agglomerative_clustering_multi(X_abstract, 9)

In [75]:
print(f"Printing clusters for multi linkage hierarchical clustering, considering only abstract")
for i, cluster in enumerate(abstract_pred_multi):
    print(f"{i}: {cluster}")

Printing clusters for multi linkage hierarchical clustering, considering only abstract
0: [146, 42, 132, 47, 125, 122, 138, 31, 72, 60, 88, 18, 126, 28, 23, 102, 145, 106, 134, 76, 51, 81, 21, 11, 0, 139, 65, 133, 97, 104, 61, 66, 24, 112, 43, 96, 13, 94, 83]
1: [147, 137, 5, 111, 105, 90, 62, 3, 79, 34, 57, 141, 30, 110, 119, 50, 7, 118, 75, 36, 92, 10]
2: [149, 136, 69, 127, 59, 44, 131, 107, 115, 35, 129, 12, 70, 38, 1]
3: [142, 87, 6, 2, 135, 33, 80, 32, 73, 25, 49, 48, 67, 40]
4: [130, 74, 120, 20, 9, 93, 8, 53, 63, 17, 101, 37, 58, 14]
5: [143, 123, 113, 77, 91, 140, 100, 124, 103, 95, 89, 82, 56]
6: [128, 78, 4, 27, 116, 54, 109, 39, 99, 46, 98, 68, 22]
7: [148, 121, 144, 71, 114, 15, 86, 29, 26, 64, 45]
8: [117, 55, 84, 108, 41, 19, 85, 52, 16]


Clealry multi linkage algorithm is producing better result than single linkage algorithm