# *AIR Project - Paper Implementation*

***Paper Title - Fast and Effective Cluster-based Information Retrieval using
Frequent Closed Itemsets***

Team Details : 
* Abhay D A - PES1UG19CS011
* Akash S - PES1UG19CS042

In [None]:
import os
import numpy as np 
import pandas as pd 
import re
import time

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

from itertools import combinations
from functools import reduce

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the corpus from a CSV in the current working directory (relative path).
df = pd.read_csv("bbc-data.csv")

df.head()

Unnamed: 0,text
0,Ad sales boost Time Warner profit Quarterly p...
1,Dollar gains on Greenspan speech The dollar h...
2,Yukos unit buyer faces loan claim The owners ...
3,High fuel prices hit BA's profits British Air...
4,Pernod takeover talk lifts Domecq Shares in U...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
dtypes: object(1)
memory usage: 17.5+ KB


# Preprocessing

In [None]:
def preprocess(text):
    """Normalise one raw document into a set of lemmatised content tokens.

    Steps, in order: strip e-mail addresses and URLs, replace every
    non-alphanumeric character and every digit run with a space, drop
    one-letter words, collapse whitespace, lower-case, tokenise with NLTK,
    remove English stop words and lemmatise with WordNet.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    set of str
        Unique tokens of the document. Duplicates and word order are
        discarded because the result is a set.
    """
    #----------Pre-processing----------#
    url_pattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    # Unanchored on purpose: with ^...$ the pattern only matched when the
    # whole document was a single e-mail address.
    email_pattern = r"[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+"
    non_alnum_pattern = r"[^a-zA-Z0-9]"

    # Remove all e-mail addresses (must happen before '@' and '.' are blanked).
    text = re.sub(email_pattern, '', text)
    # Remove all URLs
    text = re.sub(url_pattern, '', text)
    # Replace everything that is not a letter or digit.
    text = re.sub(non_alnum_pattern, " ", text)
    # Remove numbers
    text = re.sub(r"\d+", " ", text)
    # Remove single characters. The previous pattern used [.] (a literal
    # dot), which can no longer occur at this point, so nothing was removed.
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)
    # Collapse whitespace last so the substitutions above leave no gaps.
    text = re.sub(r"\s+", " ", text)
    # NOTE: the former "remove consecutive characters" substitution was
    # wrapped in JavaScript-style slashes (r'/(.)\1+/') and never matched.
    # Applied literally it would delete legitimate double letters (the "oo"
    # in "google"), so it is intentionally left out.

    #----------Case folding---------------#
    text = text.strip().lower()

    #---------Tokenization----------------#
    tokens = word_tokenize(text)

    #-----Stop word removal + Lemmatization---------#
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)

    # The lemmatiser can itself emit one-letter tokens (the mined itemsets in
    # this notebook contain 'u'), so enforce the single-character rule again.
    return {lemma for lemma in lemmas if len(lemma) > 1}


# One token set per article; preprocess is passed directly, no wrapper needed.
df['processed'] = df['text'].apply(preprocess)

print("Preprocessing Complete.")

Preprocessing Complete.


In [None]:
# Space-joined form of each token set, used as input for the TF-IDF vectorizers.
# Tokens come from a set, so their order inside the string is arbitrary.
df['clean_text'] = [' '.join(tokens).strip() for tokens in df['processed']]
df.head()

Unnamed: 0,text,processed,clean_text
0,Ad sales boost Time Warner profit Quarterly p...,"{connection, google, month, service, analyst, ...",connection google month service analyst offeri...
1,Dollar gains on Greenspan speech The dollar h...,"{view, call, month, february, yawning, dollar,...",view call month february yawning dollar buying...
2,Yukos unit buyer faces loan claim The owners ...,"{clause, filed, arbitration, shell, mikhail, r...",clause filed arbitration shell mikhail russian...
3,High fuel prices hit BA's profits British Air...,"{extra, month, analyst, full, continues, expec...",extra month analyst full continues expects exa...
4,Pernod takeover talk lifts Domecq Shares in U...,"{cream, seagram, wsj, wine, comment, chain, in...",cream seagram wsj wine comment chain including...


# K-means clustering

In [None]:
# Cleaned documents as a NumPy unicode-string array (one entry per article).
documents = df['clean_text'].values.astype("U")
documents[0]

'connection google month service analyst offering jumped advertising full parson expects amount slump alexander around exceeding december estimate stronger fortune trilogy enhancing wider restate hope year contrast ad result better reported aol market boosted sale quarter loss offset profit value try financial executive strong final purchase firm online speed preceding pay quarterly existing sign flexibility however exceptional item time saw revenue review free probe increase concluding bertelsmann third user german security aside previously chairman biggest meeting three sharp back bn dip lower owns charge intends chief box high inquiry fourth warner offered giant slightly search regulator margin office grew lost expectation already resolve way investor division music set ring company said publisher higher mixed also settle one lord earlier legal part following greatly performance effort underlying customer account engine internet buoyed le commission close objective projecting book p

In [None]:
# TF-IDF representation of the whole corpus. scikit-learn's English stop-word
# list is applied here in addition to the NLTK list already used in preprocess().
vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)

In [None]:
# Number of clusters; also used further down when listing each cluster's terms.
k = 5
# Single k-means++ initialisation (n_init=1) with a fixed seed for repeatable labels.
model = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=1, random_state = 42)
model.fit(features)

KMeans(max_iter=1000, n_clusters=5, n_init=1, random_state=42)

In [None]:
# Attach each article's cluster id (row order matches the rows used to fit the model).
df['cluster'] = model.labels_

In [None]:
df.head()

Unnamed: 0,text,processed,clean_text,cluster
0,Ad sales boost Time Warner profit Quarterly p...,"{connection, google, month, service, analyst, ...",connection google month service analyst offeri...,1
1,Dollar gains on Greenspan speech The dollar h...,"{view, call, month, february, yawning, dollar,...",view call month february yawning dollar buying...,1
2,Yukos unit buyer faces loan claim The owners ...,"{clause, filed, arbitration, shell, mikhail, r...",clause filed arbitration shell mikhail russian...,1
3,High fuel prices hit BA's profits British Air...,"{extra, month, analyst, full, continues, expec...",extra month analyst full continues expects exa...,1
4,Pernod takeover talk lifts Domecq Shares in U...,"{cream, seagram, wsj, wine, comment, chain, in...",cream seagram wsj wine comment chain including...,1


In [None]:
# Column indices of every centroid sorted by descending weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

# cluster id -> the full vocabulary, ordered from heaviest to lightest term
# in that cluster's centroid.
cluster_features = {
    cluster_id: [terms[term_idx] for term_idx in order_centroids[cluster_id, :]]
    for cluster_id in range(k)
}


In [None]:
# Show the 15 heaviest centroid terms of every cluster.
for cluster_id, ranked_terms in cluster_features.items():
    print(cluster_id, ':', ranked_terms[:15])

0 : ['film', 'star', 'award', 'year', 'music', 'best', 'actor', 'including', 'movie', 'singer', 'new', 'tv', 'said', 'hit', 'oscar']
1 : ['firm', 'company', 'market', 'said', 'year', 'bn', 'analyst', 'month', 'new', 'growth', 'share', 'business', 'mr', 'chief', 'price']
2 : ['minister', 'government', 'labour', 'party', 'mr', 'election', 'tory', 'blair', 'leader', 'secretary', 'tony', 'conservative', 'say', 'prime', 'public']
3 : ['player', 'game', 'match', 'club', 'cup', 'win', 'play', 'season', 'final', 'champion', 'league', 'team', 'chelsea', 'bos', 'played']
4 : ['coach', 'nation', 'game', 'england', 'championship', 'ireland', 'win', 'rugby', 'team', 'olympic', 'france', 'injury', 'champion', 'international', 'world']


In [None]:
df['cluster'].value_counts()

1    824
0    450
2    442
3    277
4    232
Name: cluster, dtype: int64

In [None]:
df.head()

Unnamed: 0,text,processed,clean_text,cluster
0,Ad sales boost Time Warner profit Quarterly p...,"{connection, google, month, service, analyst, ...",connection google month service analyst offeri...,1
1,Dollar gains on Greenspan speech The dollar h...,"{view, call, month, february, yawning, dollar,...",view call month february yawning dollar buying...,1
2,Yukos unit buyer faces loan claim The owners ...,"{clause, filed, arbitration, shell, mikhail, r...",clause filed arbitration shell mikhail russian...,1
3,High fuel prices hit BA's profits British Air...,"{extra, month, analyst, full, continues, expec...",extra month analyst full continues expects exa...,1
4,Pernod takeover talk lifts Domecq Shares in U...,"{cream, seagram, wsj, wine, comment, chain, in...",cream seagram wsj wine comment chain including...,1


# Finding frequent itemsets

In [None]:

def generate_itemsets(features, df, min_support=0.15):
    """Mine frequent itemsets separately for every cluster with Apriori.

    Parameters
    ----------
    features : iterable
        Cluster ids to process. Only the ids are used: passing a dict
        iterates over its keys and the values are ignored.
    df : pandas.DataFrame
        Must provide a 'cluster' column (cluster id per document) and a
        'processed' column (collection of tokens per document).
    min_support : float, default 0.15
        Minimum fraction of a cluster's documents that must contain an
        itemset for it to be kept.

    Returns
    -------
    dict
        cluster id -> list of itemsets, each itemset a list of tokens.
        A cluster without documents maps to an empty list.

    Notes
    -----
    NOTE(review): the notebook title refers to frequent *closed* itemsets,
    but no closedness filter is applied here; every frequent itemset that
    Apriori returns is kept. Confirm whether that is intended.
    """
    frequent_itemsets = {}

    for cluster in features:
        # Select this cluster's token collections once and reuse them.
        transactions = df.loc[df['cluster'] == cluster, 'processed']

        if transactions.empty:
            frequent_itemsets[cluster] = []
            continue

        # One-hot encode: a row per document, a boolean column per token.
        te = TransactionEncoder()
        te_ary = te.fit(transactions).transform(transactions)
        encoded = pd.DataFrame(te_ary, columns=te.columns_)

        result = apriori(encoded, min_support=min_support, use_colnames=True)

        # Apriori yields frozensets; expose plain lists to the caller.
        frequent_itemsets[cluster] = [list(itemset) for itemset in result['itemsets']]

    return frequent_itemsets

frequent_itemsets = generate_itemsets(cluster_features, df)

# Show the last 15 itemsets mined for each cluster. Looping over the result
# keeps this cell valid when the number of clusters changes.
for cluster_id in frequent_itemsets:
    print(frequent_itemsets[cluster_id][-15:])


[['year', 'said', 'u'], ['year', 'uk', 'said'], ['year', 'well', 'said'], ['year', 'said', 'would'], ['year', 'star', 'u'], ['said', 'award', 'year', 'also'], ['said', 'year', 'also', 'film'], ['year', 'first', 'also', 'said'], ['year', 'also', 'one', 'new'], ['year', 'also', 'said', 'new'], ['year', 'also', 'said', 'one'], ['said', 'star', 'year', 'also'], ['year', 'also', 'said', 'time'], ['year', 'also', 'said', 'u'], ['year', 'said', 'one', 'new']]
[['said', 'year', 'u', 'market'], ['year', 'said', 'would', 'market'], ['said', 'year', 'u', 'month'], ['year', 'said', 'would', 'month'], ['mr', 'year', 'said', 'new'], ['mr', 'said', 'one', 'would'], ['year', 'said', 'mr', 'would'], ['said', 'one', 'would', 'new'], ['said', 'year', 'one', 'new'], ['said', 'year', 'u', 'new'], ['year', 'said', 'would', 'new'], ['said', 'year', 'one', 'people'], ['said', 'year', 'one', 'u'], ['year', 'said', 'one', 'would'], ['year', 'said', 'u', 'would']]
[['said', 'would', 'prime', 'labour', 'blair', '

# Querying

In [None]:

query = 'sports game'
# The itemsets are built from preprocess() output (lower-cased, stop words
# removed, lemmatised), so the query has to go through the same pipeline;
# splitting the raw string leaves surface forms that cannot match any itemset.
# sorted() only makes the token order deterministic.
q1 = sorted(preprocess(query))

score = {}

def intersection(list1, list2):
    """Return the elements present in both sequences (order not guaranteed)."""
    return list(set(list1) & set(list2))

# A cluster's score is the number of its frequent itemsets that share at
# least one token with the query. Clusters without any hit get no entry.
for key in frequent_itemsets:
    for item in frequent_itemsets[key]:
        if intersection(item, q1):
            score[key] = score.get(key, 0) + 1

score


{0: 1, 3: 201, 4: 380}

In [None]:
# Cluster with the highest itemset-overlap score (first one wins on ties).
max_cluster = max(score, key=score.get)
print(max_cluster)

4


In [None]:
# Cleaned text of the documents in the best-scoring cluster (a Series that
# still carries the row labels of df).
chosen_df = df.loc[df['cluster'] == max_cluster, 'clean_text']

In [None]:
# Row labels of the selected documents in df; captured here because the
# index of chosen_df is reset in the following cell.
chosen_indices = chosen_df.index

In [None]:
# Rebuild from df rather than resetting chosen_df in place: the in-place form
# is not re-runnable (a second run adds a 'level_0' column, a third raises).
# Result: columns 'index' (row label in df) and 'clean_text', numbered 0..n-1.
chosen_df = df.loc[df['cluster'] == max_cluster, 'clean_text'].reset_index()

In [None]:
chosen_df['clean_text']

0      chance month russian leap world year domestic ...
1      runner participation run australia dublin mont...
2      justin know another catch say currently world ...
3      heptathlon champion back woman record miscount...
4      champion failed claim record union know broke ...
                             ...                        
227    australia month circuit problem know lot achie...
228    absent runner withdraw champion woman injury j...
229    absent runner withdraw champion woman injury j...
230    september failed belief injury rusedski month ...
231    bad month problem russian upset say scandal ev...
Name: clean_text, Length: 232, dtype: object

In [None]:
# Fit a separate TF-IDF model on the selected cluster only, so the vocabulary
# and IDF weights reflect just these documents.
chosen_cluster_vectorizer = TfidfVectorizer()
chosen_cluster_features = chosen_cluster_vectorizer.fit_transform(chosen_df['clean_text'].to_list())

In [None]:
chosen_cluster_vectorizer.get_feature_names_out()

array(['aaa', 'aac', 'aaron', ..., 'zoe', 'zornotza', 'zurich'],
      dtype=object)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the documents of the selected cluster
# (row i / column j compares documents i and j). Printed for inspection;
# it is not referenced again in the cells visible below.
cosine_sim = cosine_similarity(chosen_cluster_features, chosen_cluster_features)
print(cosine_sim)

[[1.         0.10280124 0.09126118 ... 0.03765559 0.02955946 0.04943838]
 [0.10280124 1.         0.03902822 ... 0.05920766 0.03422419 0.03576942]
 [0.09126118 0.03902822 1.         ... 0.04775845 0.03415096 0.05438196]
 ...
 [0.03765559 0.05920766 0.04775845 ... 1.         0.04568597 0.09740029]
 [0.02955946 0.03422419 0.03415096 ... 0.04568597 1.         0.07500846]
 [0.04943838 0.03576942 0.05438196 ... 0.09740029 0.07500846 1.        ]]


In [None]:
# Transpose and densify: rows become vocabulary terms, columns become documents.
# NOTE(review): this rebinds the same name to a dense array, so running the
# cell a second time fails (the array has no .toarray()); a new variable name
# would make it re-runnable.
chosen_cluster_features = chosen_cluster_features.T.toarray()

In [None]:
# Term-by-document TF-IDF table: one row per vocabulary term, one column per
# document position. Despite the name, it holds weights, not similarities.
sim_df = pd.DataFrame(chosen_cluster_features , index=chosen_cluster_vectorizer.get_feature_names_out())

In [None]:
sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,222,223,224,225,226,227,228,229,230,231
aaa,0.109907,0.0,0.0,0.0,0.0,0.0,0.0,0.077092,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abbott,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def get_similar_articles(q, df, vectorizer=None, documents=None, top_n=10):
    """Print the documents most similar to a query, best match first.

    Every column of ``df`` is compared with the query by cosine similarity.
    Earlier revisions scored only the first ten columns (and raised KeyError
    when fewer than ten existed) and divided by one norm while multiplying by
    the other; both are corrected here.

    Parameters
    ----------
    q : str
        Query text. It is vectorised as given, so it should already be
        normalised the same way as the indexed documents.
    df : pandas.DataFrame
        Term-by-document weight table: one row per vocabulary term of
        ``vectorizer``, one column per document.
    vectorizer : optional
        Fitted object whose ``transform([text])`` result has ``toarray()``.
        Defaults to the notebook-level ``chosen_cluster_vectorizer``.
    documents : optional
        Mapping from a column label of ``df`` to the text that is printed.
        Defaults to the notebook-level ``chosen_df['clean_text']``.
    top_n : int, default 10
        Maximum number of matches to print.

    Returns
    -------
    None
        Results are printed. Documents with zero similarity are never
        shown, so a query without any known term prints only the query.
    """
    if vectorizer is None:
        vectorizer = chosen_cluster_vectorizer
    if documents is None:
        documents = chosen_df['clean_text']

    print("query:", q)

    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)
    if q_norm == 0:
        # No query term is in the vocabulary: nothing can match.
        return

    doc_matrix = df.to_numpy(dtype=float)            # terms x documents
    doc_norms = np.linalg.norm(doc_matrix, axis=0)   # one norm per document
    denom = doc_norms * q_norm
    dots = q_vec @ doc_matrix

    # cosine = dot / (|doc| * |query|); all-zero documents keep similarity 0.
    sims = np.divide(dots, denom, out=np.zeros(dots.shape, dtype=float), where=denom > 0)

    # Stable sort keeps the original document order among equal scores.
    ranked = np.argsort(-sims, kind="stable")[:top_n]

    for pos in ranked:
        if sims[pos] <= 0:
            break  # sorted descending: everything after this is zero too
        print("Similarity:", float(sims[pos]))
        print(documents[df.columns[pos]])
        print()

# Rank the chosen cluster's documents against the query. The indexed text is
# the output of preprocess(), so the query goes through the same pipeline
# first; sorted() only makes the token order deterministic.

get_similar_articles(' '.join(sorted(preprocess(query))), sim_df)

query: sports game
Similarity: 0.05033552490801252
heptathlon champion back woman record miscounted performance staged break matter race pace another mark lap game minute easily dibaba world ethiopia plan kept hope slovenia sprint year bekele get took kicked dashed tirunesh cragg second ceplak erase want jump boston stuttgart battled ethiopian indoor set kenenisa ireland carolina jolanda new olympic go said compatriot long men winning mine berhane soon alistair mistake made sit last kluft previous sweden finish adera

Similarity: 0.045794658121293345
fazed bad month problem rehabilitation knee aston prospect sprinting introduction fit year push aim coach two moore ruled june added british wound get according next though hansen arena determination may healing confident term unclear time level poland return jump competitive hoped report jumping long sidelined european champion back injury frustrating game delay since march jumper recovery told start short set said athlete event comeback 