In [7]:
import math
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
import pandas as pd
from collections import Counter


# Expected number of documents in the corpus, i.e. the N of
# idf = log10(N / df).  NOTE(review): hard-coded -- confirm it matches the
# number of files that are actually loaded.
N = 20
# Stopword list; starts empty and is filled by load_stopwords().
stopwords = list()
# Single Porter stemmer instance shared by every preprocessing() call.
ps = PorterStemmer()

In [8]:
# Stopwords
def load_stopwords(path='Stopword-List.txt'):
    """Load the stopword file at ``path`` into the global ``stopwords`` list.

    The file is read one entry per line: surrounding whitespace is stripped
    and blank lines are skipped.  The contents of the global ``stopwords``
    list are *replaced* in place instead of being appended to, so calling the
    function again (for example by re-running the notebook cell) no longer
    accumulates duplicate entries.

    Args:
        path: Location of the stopword file.  Defaults to the previously
            hard-coded ``'Stopword-List.txt'``.

    Returns:
        None.  The words are stored in the global ``stopwords`` list.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, 'r') as f:
        words = [line.strip() for line in f if line.strip()]
    # Slice assignment keeps the same list object alive, so any existing
    # reference to ``stopwords`` sees the refreshed contents.
    stopwords[:] = words
# Fill the global stopword list from disk, then print it to verify the load.
load_stopwords()
print(stopwords)

['a', 'is', 'the', 'of', 'all', 'and', 'to', 'can', 'be', 'as', 'once', 'for', 'at', 'am', 'are', 'has', 'have', 'had', 'up', 'his', 'her', 'in', 'on', 'no', 'we', 'do']


In [9]:
# Preprocess

def preprocessing(corpus):
    """Normalise raw text and return its filtered, stemmed tokens.

    Steps, in order: lowercase; delete every character that is neither a word
    character nor whitespace; delete digit runs; collapse whitespace; trim;
    delete non-ASCII characters; tokenize with ``word_tokenize``; drop tokens
    present in the global ``stopwords``; stem the survivors with ``ps``; and
    finally discard every stem rejected by the noise filters below.

    Args:
        corpus: The text to process (a ``str``).

    Returns:
        A list of stemmed tokens in their original order.
    """
    text = corpus.lower()
    # Each substitution works on the output of the previous one.
    for pattern, replacement in (
        (r'[^\w\s]', ''),   # punctuation
        (r'\d+', ''),       # numbers
        (r'\s+', ' '),      # whitespace runs -> single space
    ):
        text = re.sub(pattern, replacement, text)
    text = text.strip()
    # Drop whatever non-ASCII characters survived the steps above.
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Stopwords are compared against the raw token, i.e. before stemming.
    stems = [ps.stem(word) for word in word_tokenize(text)
             if word not in stopwords]

    def is_noise(token):
        """Return True when a stemmed token should be thrown away."""
        # Keep only lengths 2..19.
        if len(token) <= 1 or len(token) >= 20:
            return True
        # The same character three or more times in a row.
        if re.match(r".*(.)\1{2,}.*", token):
            return True
        # URL remnants: scheme at the start ...
        if token.startswith(('http', 'https')):
            return True
        # ... or somewhere after the first character.
        if re.match(r"[a-zA-Z0-9\./]+http[a-zA-Z0-9\./]+", token):
            return True
        # NOTE(review): punctuation was already stripped above, so '/' and
        # '@' should not occur in a token any more and the two patterns below
        # look unreachable; they are kept so behaviour stays identical.
        if re.match(r"github/[a-zA-Z0-9\./]+", token):
            return True
        if re.match(r"[^@]+@[^@]+\.[^@]+", token):
            return True
        return False

    return [stem for stem in stems if not is_noise(stem)]

In [10]:
# Integer document ids taken from the file names; filled by load_data() in the
# same order as the texts it returns.
doc_ids = []

def load_data(directory=r'../ResearchPapers'):
    """Read every numbered document in ``directory``, ordered by document id.

    A file's id is ``int(name[:-4])`` -- the file name without its last four
    characters (e.g. ``'12.txt'`` -> ``12``).  Entries whose name does not
    parse that way (``.DS_Store``, ``.ipynb_checkpoints`` ...) and entries
    that are not regular files are skipped instead of aborting the load.

    The global ``doc_ids`` list is replaced in place with the ids of the
    documents that were read, so ``doc_ids[i]`` always belongs to the i-th
    returned text and calling the function twice does not duplicate ids.

    Args:
        directory: Folder holding the documents.  Defaults to the previously
            hard-coded ``'../ResearchPapers'``.

    Returns:
        list[str]: The document texts, sorted by ascending document id.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    entries = []
    for filename in os.listdir(directory):
        try:
            doc_id = int(filename[:-4])
        except ValueError:
            # Not a "<number><4-char suffix>" name: not a corpus document.
            continue
        path = os.path.join(directory, filename)
        if os.path.isfile(path):
            entries.append((doc_id, path))
    # Numeric (not lexicographic) order, so that 2 sorts before 10.
    entries.sort(key=lambda entry: entry[0])

    ids = []
    data = []
    for doc_id, path in entries:
        with open(path, 'r') as f:
            text = f.read()
        # Record id and text together, only after the read succeeded.
        ids.append(doc_id)
        data.append(text)

    # In-place replacement keeps existing references to ``doc_ids`` valid.
    doc_ids[:] = ids
    return data

# Raw text of every document; data[i] belongs to doc_ids[i].
data = load_data()

In [11]:
# Column layout of the term-document table: the term itself, one column per
# document id, then the document frequency and the inverse document frequency.
index = ["terms"] + list(doc_ids) + ['df', 'idf']
# Empty frame with that layout; the global df is populated by compute_tf_idf().
df = pd.DataFrame(columns=index)
df

Unnamed: 0,terms,1,2,3,7,8,9,11,12,13,...,17,18,21,22,23,24,25,26,df,idf


In [12]:
def compute_tf_idf(data):
    """Build the term-document tf-idf table and store it in the global ``df``.

    Every document is run through ``preprocessing`` and its terms are counted.
    The resulting frame has one row per distinct term (in first-seen order)
    and the columns ``['terms', *doc_ids, 'df', 'idf']`` where

    * ``df``  is the number of documents containing the term,
    * ``idf`` is ``log10(n_docs / df)`` with ``n_docs = len(data)``,
    * each document column holds ``raw term count * idf`` (raw counts are
      used as tf; no ``1 + log10`` scaling is applied).

    The table is assembled from plain Python lists and turned into a
    DataFrame once, instead of appending rows one at a time.

    Args:
        data: Iterable of document texts; ``data[i]`` must belong to the
            global ``doc_ids[i]``.

    Returns:
        pandas.DataFrame: The tf-idf table (also bound to the global ``df``).

    Raises:
        ValueError: If the number of documents differs from the number of
            ``doc_ids`` or if ``doc_ids`` contains duplicates, because the
            columns could not be attributed to documents reliably.
    """
    global df
    documents = list(data)
    if len(documents) != len(doc_ids) or len(set(doc_ids)) != len(doc_ids):
        raise ValueError(
            f"got {len(documents)} documents for {len(doc_ids)} doc_ids "
            f"({len(set(doc_ids))} unique)")

    # One Counter per document plus the vocabulary in first-seen order
    # (a dict is used as an ordered set).
    doc_counts = []
    vocabulary = {}
    for doc in documents:
        counts = Counter(preprocessing(doc))
        doc_counts.append(counts)
        for term in counts:
            vocabulary.setdefault(term, None)
    terms = list(vocabulary)

    # Use the real corpus size so idf stays correct when documents are
    # added or removed.
    n_docs = len(documents)
    doc_freq = [sum(1 for counts in doc_counts if term in counts)
                for term in terms]
    # Every term occurs in at least one document, so doc_freq is never 0.
    idf = [math.log10(n_docs / freq) for freq in doc_freq]

    table = {'terms': terms}
    for doc_id, counts in zip(doc_ids, doc_counts):
        # Counter returns 0 for absent terms without inserting them.
        table[doc_id] = [counts[term] * weight
                         for term, weight in zip(terms, idf)]
    table['df'] = doc_freq
    table['idf'] = idf

    df = pd.DataFrame(table, columns=['terms', *doc_ids, 'df', 'idf'])
    return df

# Build the tf-idf table (stored in the global df) and preview the first ten terms.
compute_tf_idf(data)
df.head(10)

  df = df.fillna(0)


Unnamed: 0,terms,1,2,3,7,8,9,11,12,13,...,17,18,21,22,23,24,25,26,df,idf
0,overview,3.58146,1.19382,0.0,6.76498,0.0,0.0,0.0,1.19382,0.0,...,0.39794,0.0,0.0,0.39794,0.0,0.0,0.0,0.0,8,0.39794
1,histor,2.78558,0.39794,0.39794,0.39794,0.0,0.0,0.0,0.79588,0.0,...,0.0,0.0,0.39794,0.0,0.0,0.0,0.0,0.0,8,0.39794
2,perspect,3.814662,0.0,0.346787,5.5486,0.0,1.38715,0.0,0.693575,0.0,...,0.346787,0.0,3.467875,10.750412,0.0,0.0,0.0,0.346787,9,0.346787
3,explain,25.88858,25.88858,17.45974,96.028569,0.60206,0.30103,0.0,0.0,0.0,...,0.90309,0.0,0.30103,0.30103,0.0,0.0,0.0,0.30103,10,0.30103
4,artifici,17.45974,0.0,0.0,0.0,13.84738,0.0,0.0,3.61236,0.0,...,0.0,0.0,0.0,0.0,0.60206,0.0,0.60206,0.0,5,0.60206
5,intellig,7.483466,3.554646,2.432126,14.031498,3.92882,0.0,0.0,0.0,0.0,...,0.187087,0.0,0.374173,0.0,0.374173,1.309607,0.935433,0.187087,13,0.187087
6,roberto,5.20412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.30103
7,confalonieri,35.12781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.30103
8,ludovik,2.60206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.30103
9,coba,14.31133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.30103


In [13]:
# Round the numeric values to 3 decimals and persist the table to CSV;
# index=False leaves the row index out of the file.
df = df.round(3)
df.to_csv('tf_idf.csv', index=False)

In [16]:
# Reload the tf-idf table from the CSV file. NOTE(review): after the CSV
# round-trip the document-id column labels are presumably strings, which is
# why create_vector converts each doc id with str() before indexing.
new_df = pd.read_csv('tf_idf.csv')
# Preview the first rows; no rounding happens here (the values were rounded
# to 3 decimals before the file was written).
new_df.head()


Unnamed: 0,terms,1,2,3,7,8,9,11,12,13,...,17,18,21,22,23,24,25,26,df,idf
0,overview,3.581,1.194,0.0,6.765,0.0,0.0,0.0,1.194,0.0,...,0.398,0.0,0.0,0.398,0.0,0.0,0.0,0.0,8,0.398
1,histor,2.786,0.398,0.398,0.398,0.0,0.0,0.0,0.796,0.0,...,0.0,0.0,0.398,0.0,0.0,0.0,0.0,0.0,8,0.398
2,perspect,3.815,0.0,0.347,5.549,0.0,1.387,0.0,0.694,0.0,...,0.347,0.0,3.468,10.75,0.0,0.0,0.0,0.347,9,0.347
3,explain,25.889,25.889,17.46,96.029,0.602,0.301,0.0,0.0,0.0,...,0.903,0.0,0.301,0.301,0.0,0.0,0.0,0.301,10,0.301
4,artifici,17.46,0.0,0.0,0.0,13.847,0.0,0.0,3.612,0.0,...,0.0,0.0,0.0,0.0,0.602,0.0,0.602,0.0,5,0.602


In [21]:
def add_query_tf_idf(new_df, query):
    """Return a copy of ``new_df`` with a tf-idf weighted ``'query'`` column.

    The query goes through the same ``preprocessing`` as the documents.  For
    every row, ``query`` = (occurrences of the row's term in the query) *
    (the row's ``idf``).  Query terms missing from the vocabulary are added
    as all-zero rows (their ``idf`` is 0, so their weight is 0 too).

    Fixes over the previous version:
      * a query without usable tokens yields an all-zero column instead of
        raising ``KeyError: 'query'``;
      * an existing ``'query'`` column is overwritten, so weights from an
        earlier query can no longer leak in or be multiplied by idf twice;
      * the caller's frame is never modified in place;
      * rows are added with ``pd.concat`` instead of the private
        ``DataFrame._append``.

    Args:
        new_df: tf-idf table with at least the columns ``'terms'`` and
            ``'idf'``.
        query: Free-text query string.

    Returns:
        pandas.DataFrame: A new frame with NaNs replaced by 0 and the
        ``'query'`` column set.
    """
    token_count = Counter(preprocessing(query))
    result = new_df.copy()

    known_terms = set(result['terms'])
    unseen = [token for token in token_count if token not in known_terms]
    if unseen:
        # Out-of-vocabulary terms become rows of explicit zeros.
        extra = pd.DataFrame(0, index=range(len(unseen)),
                             columns=result.columns)
        extra['terms'] = unseen
        result = pd.concat([result, extra], ignore_index=True)

    result = result.fillna(0)
    # Raw query term counts, aligned with the rows of the table.
    result['query'] = [token_count.get(term, 0) for term in result['terms']]
    result['query'] = result['query'] * result['idf']
    return result

def create_vector():
    """Split the global ``new_df`` into document vectors and the query vector.

    Returns:
        tuple: ``(vec, query_vector)`` where ``vec`` maps every id in the
        global ``doc_ids`` to the values of the column labelled ``str(id)``,
        and ``query_vector`` holds the values of the ``'query'`` column.

    Side effect: the ``'query'`` column is removed from ``new_df`` (``pop``),
    so it has to be added again before the next call.
    """
    global new_df
    # Columns are looked up by the string form of each document id.
    vec = {doc_id: new_df[str(doc_id)].values for doc_id in doc_ids}
    query_vector = new_df.pop("query").values
    return vec, query_vector

# def create_vector():
#     global new_df
#     float_cols = new_df.select_dtypes('float64').columns
#     vector = {}
#     for col in float_cols:
#         vector[col] = new_df[col].values
#     vector.pop('idf')
#     query_vector = vector.pop('query')
#     new_df = new_df.drop('query', axis=1)
#     return vector, query_vector

def cosine_similarity(query_vector, doc_vector):
    """Return the cosine of the angle between the two vectors.

    Both arguments must support element-wise ``*`` and ``**`` (for example
    the NumPy arrays returned by ``Series.values``).

    If either vector has zero length (e.g. no query term is in the
    vocabulary, or a document has no weighted terms) the cosine is undefined;
    ``0.0`` is returned in that case instead of NaN / ``ZeroDivisionError``,
    so such documents simply rank as "not similar".

    Args:
        query_vector: Weights of the query.
        doc_vector: Weights of one document, same length as ``query_vector``.

    Returns:
        The similarity score, or ``0.0`` when a norm is zero.
    """
    dot_product = sum(query_vector * doc_vector)
    query_norm = math.sqrt(sum(query_vector ** 2))
    doc_norm = math.sqrt(sum(doc_vector ** 2))
    denominator = query_norm * doc_norm
    if denominator == 0:
        # Undefined cosine: treat as no similarity.
        return 0.0
    return dot_product / denominator


In [23]:
query = "machine learning"
new_df = add_query_tf_idf(new_df, query)
vector, query_vector = create_vector()

# Minimum cosine similarity a document needs in order to be reported.
SIMILARITY_THRESHOLD = 0.03

# Score every document against the query.
scores = {doc_id: cosine_similarity(query_vector, doc_vector)
          for doc_id, doc_vector in vector.items()}

# Filter first, then rank once (best match first).  Filtering before sorting
# also discards NaN scores, should any occur (NaN > x is False), which would
# otherwise make the sort order undefined.
cosine_sim = dict(sorted(
    ((doc_id, score) for doc_id, score in scores.items()
     if score > SIMILARITY_THRESHOLD),
    key=lambda item: item[1], reverse=True))
print(list(cosine_sim.keys()))


[24, 7, 16, 2, 1]


In [None]:
# Rank the current results (best first) and keep those scoring above 0.025
# (the threshold used here is 0.025, not 0.5).
# NOTE(review): cosine_sim was already cut at 0.03 in the previous cell, so
# unless the scores are recomputed first this looser cut cannot re-admit
# any document.
cosine_sim = {k: v for k, v in sorted(
    cosine_sim.items(), key=lambda item: item[1], reverse=True) if v > 0.025}
# A bare expression that is not the last statement of a cell displays nothing.
cosine_sim

# Sort by similarity, highest first (the dict is already in that order after
# the step above), and print the id -> score mapping.
cosine_sim = dict(sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True))
print(cosine_sim)