# Clustering papers based on titles, keywords and topics

In [1]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import string
import itertools

In [2]:
# Reading data from a csv file
df_train = pd.read_csv("AAAI.csv")

In [3]:
df_train

Unnamed: 0,Title,Keywords,Topics,High-Level Keyword(s),Abstract
0,The cascade auction – a mechanism for deterrin...,Mediators\nAuctions\nCollusion\nAd Exchanges,Auctions and Market-Based Systems\nE-Commerce\...,Multiagent Systems,We introduce a sealed bid auction of a single ...
1,Basis Adaptation for Sparse Nonlinear Reinforc...,Reinforcement learning\nSparsity\nMirror desce...,Dimension Reduction/Feature Selection\nOnline ...,Machine Learning,This paper presents a new approach to basis ad...
2,Optimal Coalition Structures in Cooperative Gr...,Cooperative Game Theory\nCoalition Structure G...,Coordination and Collaboration\nGame Theory,Multiagent Systems,Representation languages for coalitional game...
3,External Memory Best-First Search for Multiple...,External-Memory Search\nParallel Search\nMulti...,Heuristic Search\nEvaluation and Analysis (Sea...,Heuristic Search and Optimization,Multiple sequence alignment (MSA) is a central...
4,Posted Prices Exchange for Display Advertising...,Display Advertising\nDynamic Pricing\nMarket E...,Auctions and Market-Based Systems\nE-Commerce\...,Multiagent Systems,We propose a new market design for display adv...
...,...,...,...,...,...
145,Probabilistic Sense Sentiment Similarity throu...,Sentiment Similarity\nIndirect yse/no Question...,Information Extraction\nQuestion Answering\nNa...,Natural Language Processing,Sentiment Similarity of word pairs reflects th...
146,Strategic Behavior when Allocating Indivisible...,Fair division\nElicition free protocol\nBackwa...,Game Theory\nMechanism Design,Multiagent Systems,We study a simple sequential allocation mechan...
147,A Pattern Matching Based Graphical Model for Q...,Opinion Question\nSubjectivity Detection\nOpin...,Natural Language Processing (General/Other),Natural Language Processing,This paper presents the results of developing ...
148,Grounding Natural Language References to Unvis...,Human-robot interaction\nIntegrated perception...,Natural Language Processing (General/Other)\nR...,Natural Language Processing,While much research exists on resolving spatia...


In [4]:
all_possible_predictions = list(set(df_train["High-Level Keyword(s)"]))
print(f"All possible prediction classes are {all_possible_predictions}")

All possible prediction classes are ['Machine Learning', 'Heuristic Search and Optimization', 'Robotics', 'Constraints and Satisfiability', 'Natural Language Processing', 'Reasoning about Plans, Processes, and Actions', 'Multidisciplinary Topics', 'Multiagent Systems', 'Knowledge Representation and Reasoning']


In [5]:
print(f"Printing length of clusters:\n")
for i in all_possible_predictions:
    num = df_train[df_train["High-Level Keyword(s)"] == i].shape[0]
    print(f"{i}: {num}")

Printing length of clusters:

Machine Learning: 45
Heuristic Search and Optimization: 9
Robotics: 5
Constraints and Satisfiability: 10
Natural Language Processing: 13
Reasoning about Plans, Processes, and Actions: 12
Multidisciplinary Topics: 7
Multiagent Systems: 32
Knowledge Representation and Reasoning: 17


In [6]:
# sorting all_possible_predictions list on number of predictions in dataset 
all_possible_predictions = sorted(all_possible_predictions, key = lambda x: df_train[df_train["High-Level Keyword(s)"] == x].shape[0])
all_possible_predictions = all_possible_predictions[::-1]
all_possible_predictions

['Machine Learning',
 'Multiagent Systems',
 'Knowledge Representation and Reasoning',
 'Natural Language Processing',
 'Reasoning about Plans, Processes, and Actions',
 'Constraints and Satisfiability',
 'Heuristic Search and Optimization',
 'Multidisciplinary Topics',
 'Robotics']

In [7]:
# Removing the High-Level Keyword(s) column, X_train is the feature vector
X_train = df_train.drop("High-Level Keyword(s)", axis=1)

In [8]:
X_train

Unnamed: 0,Title,Keywords,Topics,Abstract
0,The cascade auction – a mechanism for deterrin...,Mediators\nAuctions\nCollusion\nAd Exchanges,Auctions and Market-Based Systems\nE-Commerce\...,We introduce a sealed bid auction of a single ...
1,Basis Adaptation for Sparse Nonlinear Reinforc...,Reinforcement learning\nSparsity\nMirror desce...,Dimension Reduction/Feature Selection\nOnline ...,This paper presents a new approach to basis ad...
2,Optimal Coalition Structures in Cooperative Gr...,Cooperative Game Theory\nCoalition Structure G...,Coordination and Collaboration\nGame Theory,Representation languages for coalitional game...
3,External Memory Best-First Search for Multiple...,External-Memory Search\nParallel Search\nMulti...,Heuristic Search\nEvaluation and Analysis (Sea...,Multiple sequence alignment (MSA) is a central...
4,Posted Prices Exchange for Display Advertising...,Display Advertising\nDynamic Pricing\nMarket E...,Auctions and Market-Based Systems\nE-Commerce\...,We propose a new market design for display adv...
...,...,...,...,...
145,Probabilistic Sense Sentiment Similarity throu...,Sentiment Similarity\nIndirect yse/no Question...,Information Extraction\nQuestion Answering\nNa...,Sentiment Similarity of word pairs reflects th...
146,Strategic Behavior when Allocating Indivisible...,Fair division\nElicition free protocol\nBackwa...,Game Theory\nMechanism Design,We study a simple sequential allocation mechan...
147,A Pattern Matching Based Graphical Model for Q...,Opinion Question\nSubjectivity Detection\nOpin...,Natural Language Processing (General/Other),This paper presents the results of developing ...
148,Grounding Natural Language References to Unvis...,Human-robot interaction\nIntegrated perception...,Natural Language Processing (General/Other)\nR...,While much research exists on resolving spatia...


# Cleaning Text 

In [9]:
def remove_punctuation(s):
    '''
    s: string 
    String will be returned by removing punctuations and newline character '\n' from s
    '''
    s = s.replace('\n', ' ')
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
    s = s.translate(translator)
    
    # remove redundant spaces after replacing puncutations and newline with spaces
    s = re.sub(' +', ' ', s)
    return s

def clean_df(df):
    '''
    df: Pandas DataFrame
    Inplace changes dataframe
    '''
    cols = df.columns
    for col in cols:
        for i in range(len(df[col])):
            df[col][i] = remove_punctuation(df[col][i])

In [10]:
# Cleaning X_train
clean_df(X_train)

In [11]:
ord(df_train["Title"][0][20])

8211

In [12]:
ord('-')

45

In [13]:
ord(df_train["Title"][28][31])

8212

In [14]:
X_train

Unnamed: 0,Title,Keywords,Topics,Abstract
0,The cascade auction – a mechanism for deterrin...,Mediators Auctions Collusion Ad Exchanges,Auctions and Market Based Systems E Commerce G...,We introduce a sealed bid auction of a single ...
1,Basis Adaptation for Sparse Nonlinear Reinforc...,Reinforcement learning Sparsity Mirror descent...,Dimension Reduction Feature Selection Online L...,This paper presents a new approach to basis ad...
2,Optimal Coalition Structures in Cooperative Gr...,Cooperative Game Theory Coalition Structure Ge...,Coordination and Collaboration Game Theory,Representation languages for coalitional games...
3,External Memory Best First Search for Multiple...,External Memory Search Parallel Search Multipl...,Heuristic Search Evaluation and Analysis Searc...,Multiple sequence alignment MSA is a central p...
4,Posted Prices Exchange for Display Advertising...,Display Advertising Dynamic Pricing Market Equ...,Auctions and Market Based Systems E Commerce M...,We propose a new market design for display adv...
...,...,...,...,...
145,Probabilistic Sense Sentiment Similarity throu...,Sentiment Similarity Indirect yse no Question ...,Information Extraction Question Answering Natu...,Sentiment Similarity of word pairs reflects th...
146,Strategic Behavior when Allocating Indivisible...,Fair division Elicition free protocol Backward...,Game Theory Mechanism Design,We study a simple sequential allocation mechan...
147,A Pattern Matching Based Graphical Model for Q...,Opinion Question Subjectivity Detection Opinio...,Natural Language Processing General Other,This paper presents the results of developing ...
148,Grounding Natural Language References to Unvis...,Human robot interaction Integrated perception ...,Natural Language Processing General Other Robo...,While much research exists on resolving spatia...


# Utility Funcitons ( Jacardian Similarity, getting thresholds etc.)

JC = $\dfrac{H \cap S}{H \cup S}$

In [17]:
def Intersection(H, S):
    '''
    Returns a list Intersection of H and S set
    '''
    return set(H) - (set(H) - set(S))

def Union(H, S):
    '''
    Returns a list of union of H and S set
    '''
    return set(H + S)

def Jacardian_score(H, S):
    '''
    Returns jacardian similarity
    '''
    if len(Union(H, S)) == 0:
        print(f"Empty H and S Passed")
        return 0
    return len(Intersection(H, S))/ len(Union(H, S))

def dist(H, S):
    '''
        Returns 1- Jacardian Score
    '''
    return 1 - Jacardian_score(H, S)

def calc_dist(a, b, f, X):
    '''
    a: list of indices of cluster from X which belongs to one segment
    b: list of indices of cluster from X which belongs to one segment
    f: funtion, for single use min, multi use max
    Uses function dist to calculate jacardian distance between two clusters a and b
    '''
    distances = []
    for i in a:
        for j in b:
            distances.append(dist(X[i], X[j]))
    return f(distances)

In [18]:
def get_accuracy(df_train, clusters, all_possible_predictions):
    '''
    cluster: List of list of indices in clusters
    labels: df_train["High-Level Keyword(s)"] to calculate accuracy
    Returns accuracy
    '''
    mp = dict()
    used_col = []
    i = 1
    for cluster in clusters:
        score = 0
        col = ""
        for x in all_possible_predictions:
            if x not in used_col:
                col = x
                break
        temp = dict()
        for z in cluster:
            key = df_train["High-Level Keyword(s)"][i]
            if key in used_col:
                continue
            if key not in temp:
                temp[key] = 1
            else:
                temp[key] += 1
            if temp[key] > score:
                score = temp[key]
                col = key
        mp[i] = col
        used_col.append(col)
        i += 1
    correct_pred = 0
    for j, cluster in enumerate(clusters):
        for z in cluster:
            if df_train["High-Level Keyword(s)"][z] == mp[j+1]:
                correct_pred += 1
    acc = correct_pred / df_train.shape[0]
    print(mp)
#     perm = list(range(1, 10))
#     perms = list(itertools.permutations(perm))
#     max_acc = 0
#     for perm in perms:
#         curr_score = 0
#         for i, cluster in enumerate(clusters):
#             for z in cluster:
#                 if df_train["High-Level Keyword(s)"][z] == perm[i]:
#                     curr_score += 1
#         max_acc = max(curr_score, max_acc)
#     max_acc = max_acc/df_train.shape[0]
    return acc

# Agglomerative Clustering

In [19]:
def agglomerative_clustering_single(X_train, n_cluster):
    '''
    X_train: Training dataset on which it will calculate similarity
    n_cluster: Number of clusters required
    Returns a cluster list which will contain informations of clusters
    '''
    #Every points in datset is a single cluster initially
    clusters = []
    for i in range(len(X_train)):
        clusters.append([i])
    # curr cluster is number of current clusters, dist_store will store distances
    curr_clusters = len(X_train)
    dist_store = []
    while curr_clusters > n_cluster:
        dist_store.clear()
        for i in range(curr_clusters):
            for j in range(i+1, curr_clusters):
                dist_store.append([calc_dist(clusters[i], clusters[j], min, X_train), j, i])
        # sorting distances in ascending order and merging two lists having min distances
        dist_store = sorted(dist_store, key = lambda x:x[0])
        # now merge first one
        to_merge = dist_store[0]
        clusters[to_merge[1]].extend(clusters[to_merge[2]])
        del clusters[to_merge[2]]
        curr_clusters -= 1
    clusters = sorted(clusters, key = lambda x: len(x))
    clusters = clusters[::-1]
    return clusters
    

In [20]:
def agglomerative_clustering_multi(X_train, n_cluster):
    '''
    X_train: Training dataset on which it will calculate similarity
    n_cluster: Number of clusters required
    Returns a cluster list which will contain informations of clusters
    '''
    #Every points in datset is a single cluster initially
    clusters = []
    for i in range(len(X_train)):
        clusters.append([i])
    # curr cluster is number of current clusters, dist_store will store distances
    curr_clusters = len(X_train)
    dist_store = []
    while curr_clusters > n_cluster:
        dist_store.clear()
        for i in range(curr_clusters):
            for j in range(i+1, curr_clusters):
                dist_store.append([calc_dist(clusters[i], clusters[j], max, X_train), j, i])
        # sorting distances in ascending order and merging two lists having min distances
        dist_store = sorted(dist_store, key = lambda x:x[0])
        # now merge first one
        to_merge = dist_store[0]
        clusters[to_merge[1]].extend(clusters[to_merge[2]])
        del clusters[to_merge[2]]
        curr_clusters -= 1
    clusters = sorted(clusters, key = lambda x: len(x))
    clusters = clusters[::-1]
    return clusters
    

# Clustering based on Title

In [21]:
X_title = X_train["Title"].str.split(" ")

In [22]:
X_title[2]

['Optimal', 'Coalition', 'Structures', 'in', 'Cooperative', 'Graph', 'Games']

In [23]:
title_pred_single = agglomerative_clustering_single(X_title, 9)

In [24]:
get_accuracy(df_train, title_pred_single, all_possible_predictions)

{1: 'Machine Learning', 2: 'Multiagent Systems', 3: 'Heuristic Search and Optimization', 4: 'Knowledge Representation and Reasoning', 5: 'Robotics', 6: 'Natural Language Processing', 7: 'Reasoning about Plans, Processes, and Actions', 8: 'Constraints and Satisfiability', 9: 'Multidisciplinary Topics'}


0.2866666666666667

In [25]:
title_pred_multi = agglomerative_clustering_multi(X_title, 9)

In [26]:
get_accuracy(df_train, title_pred_multi, all_possible_predictions)

{1: 'Machine Learning', 2: 'Multiagent Systems', 3: 'Heuristic Search and Optimization', 4: 'Knowledge Representation and Reasoning', 5: 'Robotics', 6: 'Natural Language Processing', 7: 'Reasoning about Plans, Processes, and Actions', 8: 'Constraints and Satisfiability', 9: 'Multidisciplinary Topics'}


0.22

# Clustering based on Keywords

In [27]:
X_keywords = X_train["Keywords"].str.split(" ")

In [28]:
keywords_pred_single = agglomerative_clustering_single(X_keywords, 9)

In [29]:
get_accuracy(df_train, keywords_pred_single, all_possible_predictions)

{1: 'Machine Learning', 2: 'Multiagent Systems', 3: 'Heuristic Search and Optimization', 4: 'Knowledge Representation and Reasoning', 5: 'Robotics', 6: 'Natural Language Processing', 7: 'Reasoning about Plans, Processes, and Actions', 8: 'Constraints and Satisfiability', 9: 'Multidisciplinary Topics'}


0.29333333333333333

In [30]:
keywords_pred_multi = agglomerative_clustering_multi(X_keywords, 9)

In [31]:
get_accuracy(df_train, keywords_pred_multi, all_possible_predictions)

{1: 'Machine Learning', 2: 'Multiagent Systems', 3: 'Heuristic Search and Optimization', 4: 'Knowledge Representation and Reasoning', 5: 'Robotics', 6: 'Natural Language Processing', 7: 'Reasoning about Plans, Processes, and Actions', 8: 'Constraints and Satisfiability', 9: 'Multidisciplinary Topics'}


0.22

# Clustering on Topics

In [32]:
X_topics = X_train["Topics"].str.split(" ")

In [33]:
topics_pred_single = agglomerative_clustering_single(X_topics, 9)

In [34]:
get_accuracy(df_train, topics_pred_single, all_possible_predictions)

{1: 'Machine Learning', 2: 'Multiagent Systems', 3: 'Heuristic Search and Optimization', 4: 'Knowledge Representation and Reasoning', 5: 'Robotics', 6: 'Natural Language Processing', 7: 'Reasoning about Plans, Processes, and Actions', 8: 'Constraints and Satisfiability', 9: 'Multidisciplinary Topics'}


0.29333333333333333

In [35]:
topics_pred_multi = agglomerative_clustering_multi(X_topics, 9)

In [36]:
get_accuracy(df_train, topics_pred_multi, all_possible_predictions)

{1: 'Machine Learning', 2: 'Multiagent Systems', 3: 'Heuristic Search and Optimization', 4: 'Knowledge Representation and Reasoning', 5: 'Robotics', 6: 'Natural Language Processing', 7: 'Reasoning about Plans, Processes, and Actions', 8: 'Constraints and Satisfiability', 9: 'Multidisciplinary Topics'}


0.12666666666666668

In [37]:
all_possible_predictions

['Machine Learning',
 'Multiagent Systems',
 'Knowledge Representation and Reasoning',
 'Natural Language Processing',
 'Reasoning about Plans, Processes, and Actions',
 'Constraints and Satisfiability',
 'Heuristic Search and Optimization',
 'Multidisciplinary Topics',
 'Robotics']

In [38]:
topics_pred_multi

[[123,
  76,
  116,
  86,
  70,
  50,
  39,
  121,
  24,
  87,
  4,
  0,
  111,
  43,
  103,
  102,
  42,
  32,
  109,
  72,
  69,
  51,
  106,
  1,
  79,
  100,
  91,
  94,
  11,
  89,
  41,
  25,
  88,
  83,
  47],
 [148,
  147,
  64,
  145,
  90,
  61,
  141,
  115,
  57,
  75,
  142,
  38,
  108,
  82,
  58,
  122,
  120,
  139,
  34,
  107,
  134,
  113,
  48,
  92,
  62,
  118,
  105,
  77,
  85,
  23],
 [132,
  126,
  96,
  2,
  20,
  124,
  110,
  22,
  84,
  60,
  46,
  104,
  78,
  59,
  35,
  7,
  15,
  119,
  114,
  93,
  56,
  33,
  5,
  127,
  13,
  66,
  26,
  99,
  30,
  3],
 [149,
  125,
  21,
  65,
  63,
  54,
  9,
  98,
  45,
  40,
  14,
  144,
  36,
  12,
  143,
  138,
  137,
  133,
  117,
  80,
  16,
  27,
  101,
  6,
  97],
 [140, 37, 10, 135, 49, 17, 68, 31, 55, 19, 8],
 [131, 81, 74, 112, 28, 128, 73, 53],
 [146, 18, 136, 95, 29, 67],
 [129, 71, 52],
 [130, 44]]

In [39]:
df_train["High-Level Keyword(s)"][1]

'Machine Learning'

# Clustering on Abstract

In [40]:
X_abstract = X_train["Abstract"].str.split(" ")

In [41]:
abstract_pred_single = agglomerative_clustering_single(X_abstract, 9)

KeyboardInterrupt: 

In [None]:
get_accuracy(df_train, abstract_pred_single, all_possible_predictions)

In [None]:
abstract_pred_multi = agglomerative_clustering_multi(X_abstract, 9)

In [None]:
get_accuracy(df_train, abstract_pred_multi, all_possible_predictions)

In [None]:
abstract_pred_multi

In [None]:
df_train["High-Level Keyword(s)"][4]

# Removing stopwords, and again repeating above works

In [106]:
from nltk.corpus import stopwords

In [107]:
stop_words = set(stopwords.words('english'))

In [284]:
X_train = X_train.apply(lambda x: [item for item in x if item not in stop_words])

## Clustering based on Title

In [21]:
X_title = X_train["Title"].str.split(" ")

In [22]:
X_title[2]

['Optimal', 'Coalition', 'Structures', 'in', 'Cooperative', 'Graph', 'Games']

In [None]:
title_pred_single = agglomerative_clustering_single(X_title, 9)

In [None]:
get_accuracy(df_train, title_pred_single, all_possible_predictions)

In [None]:
title_pred_multi = agglomerative_clustering_multi(X_title, 9)

In [None]:
get_accuracy(df_train, title_pred_multi, all_possible_predictions)

## Clustering based on Keywords

In [None]:
X_keywords = X_train["Keywords"].str.split(" ")

In [None]:
keywords_pred_single = agglomerative_clustering_single(X_keywords, 9)

In [None]:
get_accuracy(df_train, keywords_pred_single, all_possible_predictions)

In [None]:
keywords_pred_multi = agglomerative_clustering_multi(X_keywords, 9)

In [None]:
get_accuracy(df_train, keywords_pred_multi, all_possible_predictions)

## Clustering on Topics

In [None]:
X_topics = X_train["Topics"].str.split(" ")

In [None]:
topics_pred_single = agglomerative_clustering_single(X_topics, 9)

In [None]:
get_accuracy(df_train, topics_pred_single, all_possible_predictions)

In [None]:
topics_pred_multi = agglomerative_clustering_multi(X_topics, 9)

In [None]:
get_accuracy(df_train, topics_pred_multi, all_possible_predictions)

In [None]:
all_possible_predictions

In [None]:
topics_pred_multi

In [None]:
df_train["High-Level Keyword(s)"][1]

## Clustering on Abstract

In [None]:
X_abstract = X_train["Abstract"].str.split(" ")

In [None]:
abstract_pred_single = agglomerative_clustering_single(X_abstract, 9)

In [None]:
get_accuracy(df_train, abstract_pred_single, all_possible_predictions)

In [None]:
abstract_pred_multi = agglomerative_clustering_multi(X_abstract, 9)

In [None]:
get_accuracy(df_train, abstract_pred_multi, all_possible_predictions)

In [None]:
abstract_pred_multi

In [None]:
df_train["High-Level Keyword(s)"][4]

# Stemming and again repeating above works

In [None]:
# Stemming

## Clustering based on Title

In [21]:
X_title = X_train["Title"].str.split(" ")

In [22]:
X_title[2]

['Optimal', 'Coalition', 'Structures', 'in', 'Cooperative', 'Graph', 'Games']

In [None]:
title_pred_single = agglomerative_clustering_single(X_title, 9)

In [None]:
get_accuracy(df_train, title_pred_single, all_possible_predictions)

In [None]:
title_pred_multi = agglomerative_clustering_multi(X_title, 9)

In [None]:
get_accuracy(df_train, title_pred_multi, all_possible_predictions)

## Clustering based on Keywords

In [None]:
X_keywords = X_train["Keywords"].str.split(" ")

In [None]:
keywords_pred_single = agglomerative_clustering_single(X_keywords, 9)

In [None]:
get_accuracy(df_train, keywords_pred_single, all_possible_predictions)

In [None]:
keywords_pred_multi = agglomerative_clustering_multi(X_keywords, 9)

In [None]:
get_accuracy(df_train, keywords_pred_multi, all_possible_predictions)

## Clustering on Topics

In [None]:
X_topics = X_train["Topics"].str.split(" ")

In [None]:
topics_pred_single = agglomerative_clustering_single(X_topics, 9)

In [None]:
get_accuracy(df_train, topics_pred_single, all_possible_predictions)

In [None]:
topics_pred_multi = agglomerative_clustering_multi(X_topics, 9)

In [None]:
get_accuracy(df_train, topics_pred_multi, all_possible_predictions)

In [None]:
all_possible_predictions

In [None]:
topics_pred_multi

In [None]:
df_train["High-Level Keyword(s)"][1]

## Clustering on Abstract

In [None]:
X_abstract = X_train["Abstract"].str.split(" ")

In [None]:
abstract_pred_single = agglomerative_clustering_single(X_abstract, 9)

In [None]:
get_accuracy(df_train, abstract_pred_single, all_possible_predictions)

In [None]:
abstract_pred_multi = agglomerative_clustering_multi(X_abstract, 9)

In [None]:
get_accuracy(df_train, abstract_pred_multi, all_possible_predictions)

In [None]:
abstract_pred_multi

In [None]:
df_train["High-Level Keyword(s)"][4]

# Removing Stopwords and Stemming

## Clustering based on Title

In [21]:
X_title = X_train["Title"].str.split(" ")

In [22]:
X_title[2]

['Optimal', 'Coalition', 'Structures', 'in', 'Cooperative', 'Graph', 'Games']

In [None]:
title_pred_single = agglomerative_clustering_single(X_title, 9)

In [None]:
get_accuracy(df_train, title_pred_single, all_possible_predictions)

In [None]:
title_pred_multi = agglomerative_clustering_multi(X_title, 9)

In [None]:
get_accuracy(df_train, title_pred_multi, all_possible_predictions)

## Clustering based on Keywords

In [None]:
X_keywords = X_train["Keywords"].str.split(" ")

In [None]:
keywords_pred_single = agglomerative_clustering_single(X_keywords, 9)

In [None]:
get_accuracy(df_train, keywords_pred_single, all_possible_predictions)

In [None]:
keywords_pred_multi = agglomerative_clustering_multi(X_keywords, 9)

In [None]:
get_accuracy(df_train, keywords_pred_multi, all_possible_predictions)

## Clustering on Topics

In [None]:
X_topics = X_train["Topics"].str.split(" ")

In [None]:
topics_pred_single = agglomerative_clustering_single(X_topics, 9)

In [None]:
get_accuracy(df_train, topics_pred_single, all_possible_predictions)

In [None]:
topics_pred_multi = agglomerative_clustering_multi(X_topics, 9)

In [None]:
get_accuracy(df_train, topics_pred_multi, all_possible_predictions)

In [None]:
all_possible_predictions

In [None]:
topics_pred_multi

In [None]:
df_train["High-Level Keyword(s)"][1]

## Clustering on Abstract

In [None]:
X_abstract = X_train["Abstract"].str.split(" ")

In [None]:
abstract_pred_single = agglomerative_clustering_single(X_abstract, 9)

In [None]:
get_accuracy(df_train, abstract_pred_single, all_possible_predictions)

In [None]:
abstract_pred_multi = agglomerative_clustering_multi(X_abstract, 9)

In [None]:
get_accuracy(df_train, abstract_pred_multi, all_possible_predictions)

In [None]:
abstract_pred_multi