In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from string import digits, punctuation
import numpy as np

In [3]:
# NOTE(review): scikit-learn documents the recognised `remove` values as
# 'headers', 'footers' and 'quotes' (plural). 'header' and 'footer' are not
# matched, so only quoted replies are stripped here -- the sample output of
# data[:4] below still starts with "From:" / "Subject:" header lines.
# preprocess() currently compensates by discarding everything before the
# first blank line, so switching to the plural names has to be done together
# with removing that step; otherwise the first paragraph of every body is lost.
data = fetch_20newsgroups(subset='train', remove=('header', 'footer', 'quotes'))['data']
test_data = fetch_20newsgroups(subset='test', remove=('header', 'footer', 'quotes'))['data']
# Number of training documents.
len(data)

11314

In [4]:
# Peek at the first four raw training posts (header lines are still present).
data[:4]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [5]:
# Stop words dropped by preprocess(). Membership is tested against single
# whitespace-delimited, lower-cased tokens, so every entry must be exactly one
# token: an entry containing a space or tab can never match anything.
stop_words = {"'ll", 'a', 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act', 'actually', 'added', 'adj', 'affected', 'affecting', 'affects', 'after', 'afterwards', 'again', 'against', 'ah', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'announce', 'another', 'any', 'anybody', 'anyhow', 'anymore', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apparently', 'approximately', 'are', 'aren', 'arent', 'arise', 'around', 'as', 'aside', 'ask', 'asking', 'at', 'auth', 'available', 'away', 'awfully', 'b', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'begin', 'beginning', 'beginnings', 'begins', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'between', 'beyond', 'biol', 'both', 'brief', 'briefly', 'but', 'by', 'c', 'ca', 'came', 'can', "can't", 'cannot', 'cause', 'causes', 'certain', 'certainly', 'co', 'com', 'come', 'comes', 'contain', 'containing', 'contains', 'could', 'couldnt', 'd', 'date', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'don', "don't", 'done', 'down', 'downwards', 'due', 'during', 'e', 'each', 'ed', 'edu', 'effect', 'eg', 'eight', 'eighty', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'especially', 'et', 'et-al', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'except', 'f', 'far', 'few', 'ff', 'fifth', 'first', 'five', 'fix', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'found', 'four', 'from', 'further', 'furthermore', 'g', 'gave', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'giving', 'go', 'goes', 'gone', 'got', 'gotten', 'h', 'had', 'happens', 'hardly', 'has', "hasn't", 'have', "haven't", 'having', 'he', 'hed', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'heres', 'hereupon', 'hers', 'herself', 'hes', 'hi', 'hid', 'him', 'himself', 'his', 
'hither', 'home', 'how', 'howbeit', 'however', 'hundred', 'i', "i'll", "i've", 'id', 'ie', 'if', 'im', 'immediate', 'immediately', 'importance', 'important', 'in', 'inc', 'indeed', 'index', 'information', 'instead', 'into', 'invention', 'inward', 'is', "isn't", 'it', "it'll", 'itd', 'its', 'itself', 'j', 'just', 'k', 'keep', 'keeps', 'kept', 'kg', 'km', 'know', 'known', 'knows', 'l', 'largely', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'line', 'little', 'look', 'looking', 'looks', 'ltd', 'm', 'made', 'mainly', 'make', 'makes', 'many', 'may', 'maybe', 'me', 'mean', 'means', 'meantime', 'meanwhile', 'merely', 'mg', 'might', 'million', 'miss', 'ml', 'more', 'moreover', 'most', 'mostly', 'mr', 'mrs', 'much', 'mug', 'must', 'my', 'myself', 'n', 'na', 'name', 'namely', 'nay', 'nd', 'near', 'nearly', 'necessarily', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'ninety', 'no', 'nobody', 'non', 'none', 'nonetheless', 'noone', 'nor', 'normally', 'nos', 'not', 'noted', 'nothing', 'now', 'nowhere', 'o', 'obtain', 'obtained', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'omitted', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'ord', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'owing', 'own', 'p', 'page', 'pages', 'part', 'particular', 'particularly', 'past', 'per', 'perhaps', 'placed', 'please', 'plus', 'poorly', 'possible', 'possibly', 'potentially', 'pp', 'predominantly', 'present', 'previously', 'primarily', 'probably', 'promptly', 'proud', 'provides', 'put', 'q', 'que', 'quickly', 'quite', 'qv', 'r', 'ran', 'rather', 'rd', 're', 'readily', 'really', 'recent', 'recently', 'ref', 'refs', 'regarding', 'regardless', 'regards', 'related', 'relatively', 'research', 'respectively', 'resulted', 'resulting', 'results', 'right', 'run', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 
'sec', 'section', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sent', 'seven', 'several', 'shall', 'she', "she'll", 'shed', 'shes', 'should', "shouldn't", 'show', 'showed', 'shown', 'showns', 'shows', 'significant', 'significantly', 'similar', 'similarly', 'since', 'six', 'slightly', 'so', 'some', 'somebody', 'somehow', 'someone', 'somethan', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specifically', 'specified', 'specify', 'specifying', 'still', 'stop', 'strongly', 'sub', 'substantially', 'successfully', 'such', 'sufficiently', 'suggest', 'sup', 'sure', 't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'you', 'your', 'yours', 'yourself', 'yourselves'}
# The ten decimal digit characters, '0' through '9', as a tuple.
digits_ = (*digits,)
# Punctuation that preprocess() turns into spaces. Apostrophes and
# underscores are left out of this set, so tokens keep them.
punctuation = "".join(mark for mark in punctuation if mark not in "'_")
def preprocess(text:str):
    """Turn one raw post into a list of lower-cased content words.

    Steps, in order:
      1. Coerce to ``str`` and lower-case.
      2. Drop everything up to and including the first blank line
         (the header block); text without a blank line is kept whole.
      3. Drop everything from the last ``--`` line onward, then everything
         from the last ``---`` line onward (signature blocks), stripping
         surrounding whitespace after each cut.
      4. Replace every character in the module-level ``punctuation`` string
         with a space and split on whitespace.
      5. Keep only tokens that are not in ``stop_words`` and are purely
         alphabetic.

    Returns:
        The surviving tokens, in their original order.
    """
    body = str(text).lower().split("\n\n", maxsplit=1)[-1]

    # Cut at the LAST occurrence of each signature delimiter.
    for delimiter in ("\n--\n", "\n---\n"):
        body = body.rsplit(delimiter, maxsplit=1)[0].strip()

    blank_out = str.maketrans({mark: " " for mark in punctuation})
    tokens = body.translate(blank_out).split()

    return [token for token in tokens
            if token not in stop_words and token.isalpha()]

### Preprocessing the corpus

In [6]:
# One cleaned string per training post: preprocess() yields the kept tokens,
# which are re-joined with single spaces so the vectorizer can consume them.
preprocessed: list[str] = [" ".join(preprocess(document)) for document in data]

In [7]:
# Peek at the first four cleaned documents.
preprocessed[:4]

['wondering enlighten car day door sports car looked late early called bricklin doors small addition front bumper separate rest body tellme model engine specs years production car history whatever info funky car mail thanks il brought neighborhood lerxst',
 'fair number brave souls upgraded si clock oscillator shared experiences poll send message detailing experiences procedure top speed attained cpu rated speed add cards adapters heat sinks hour usage day floppy disk functionality floppies requested summarizing two days add network knowledge base clock upgrade answered poll thanks guy kuo guykuo u washington',
 'well folks mac finally ghost weekend starting life way sooo market machine bit sooner intended picking powerbook bunch questions hopefully answer dirt round powerbook introductions expected heard supposed appearence summer heard access macleak wondering info heard rumors price drops powerbook went impression display swing disk feel better display yea great store wow good solic

### TF-IDF Vector Matrix

In [8]:
# max_features caps the vocabulary at the 2000 terms with the highest
# term frequency across the corpus; all other terms are ignored.
vectorizer = TfidfVectorizer(max_features = 2000)
# Learn vocabulary + IDF weights from the cleaned training posts and encode
# them; the result has one row per document (converted to dense later via .toarray()).
tfidf_matrix = vectorizer.fit_transform(preprocessed)

### Similarity Matrix

In [9]:
# All-pairs cosine similarity between training documents.
# NOTE: the result is a dense square array with one row and one column per
# document (11314 x 11314 here), i.e. roughly 1 GB assuming float64 entries.
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

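# It's a symmetric matrix with 1s on the principal diagonal (except for documents whose TF-IDF vector is all zeros, which score 0 against everything, including themselves)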

### Similarity between two tf-idf vectors

In [10]:
def similarity_between_vectors(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors as a scalar.

    Each vector is reshaped to a single-row matrix because
    ``cosine_similarity`` operates on 2-D inputs; the resulting 1x1 matrix
    is then unwrapped.
    """
    row_a = vec1.reshape(1, -1)
    row_b = vec2.reshape(1, -1)
    pairwise = cosine_similarity(row_a, row_b)
    return pairwise[0, 0]
# Dense copy of the TF-IDF matrix so single rows can be indexed as plain arrays.
tfidf_arrs =  tfidf_matrix.toarray()

In [11]:
# Cosine similarity of documents 62 and 50, computed directly from their TF-IDF rows.
similarity_between_vectors(tfidf_arrs[62], tfidf_arrs[50])

0.006328509385887433

In [12]:
# Same pair looked up in the precomputed matrix; agrees with the direct
# computation up to floating-point rounding.
similarity[62][50]

0.006328509385887432

In [13]:
# Self-similarity of document 50, computed from its TF-IDF row so that it is
# directly comparable with the diagonal entry similarity[50][50].
# (Rows of `similarity` are similarity profiles, not TF-IDF vectors.)
similarity_between_vectors(tfidf_arrs[50], tfidf_arrs[50])

0.9999999999999991

In [14]:
# Diagonal entry of the precomputed matrix: document 50 against itself.
similarity[50][50]

1.0

In [15]:
def find_similarity(text:str, tfidf_matrix, vectorizer:TfidfVectorizer , top_k = 5):
    """Find the corpus documents most similar to ``text``.

    Args:
        text: Raw query text. It is cleaned with ``preprocess`` first, which
            discards everything before the first blank line, so header-less
            multi-paragraph input loses its first paragraph.
        tfidf_matrix: TF-IDF matrix of the corpus, one row per document,
            produced by ``vectorizer``.
        vectorizer: The already-fitted vectorizer used to encode the query.
        top_k: How many matches to return. ``0`` yields empty results; values
            larger than the corpus return every document.

    Returns:
        ``(scores, indices)``: two arrays of equal length, ordered from most
        to least similar. ``indices`` are row numbers into ``tfidf_matrix``.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query = " ".join(preprocess(text))
    query_vector = vectorizer.transform([query])
    scores = cosine_similarity(tfidf_matrix, query_vector).flatten()

    # Reverse first, then slice: unlike [-top_k:], this returns nothing
    # (rather than everything) when top_k == 0.
    best = np.argsort(scores)[::-1][:top_k]
    return scores[best], best

In [20]:
from random import choices, seed

# Fixed seed so the same 100 test posts are drawn on every run.
seed(4200)
# choices() samples with replacement, so a post may be drawn more than once.
sentences = choices(test_data, k=100)

wide_rule = "==" * 40
for t in sentences:
    # top_k=1: retrieve only the single closest training post.
    sim_percentage, index = find_similarity(t, tfidf_matrix, vectorizer, top_k=1)
    best_score = sim_percentage[0]
    best_match = data[index[0]]

    print(f"SENTENCE:\n\n{t}\n\n")
    print(wide_rule, "\nSIMILARITIES\n", "=" * 40)
    print(best_score, end="\n-----------------\n")
    print(best_match)
    print(wide_rule)

SENTENCE:

From: Feng.Qian@launchpad.unc.edu (Feng Qian)
Subject: LL format utilities in Maxitor BBS? What is the BBS number?
Nntp-Posting-Host: lambada.oit.unc.edu
Organization: University of North Carolina Extended Bulletin Board Service
Lines: 13


Due to some problems with my IDE drive, I LL formatted the Maxitor7213AT.
Now it started to give me some errors in some applications. I was told
Maxitor has a utility called IDE_INT in their BBS, anyone tried it? Can
some one tell me what that BBS number is? Or better, can I find the file
in some ftp site? Or perhaps someone can email it to me? Thanks.

Feng
--
   The opinions expressed are not necessarily those of the University of
     North Carolina at Chapel Hill, the Campus Office for Information
        Technology, or the Experimental Bulletin Board Service.
           internet:  laUNChpad.unc.edu or 152.2.22.80



SIMILARITIES
0.5364515088984857
-----------------
From: dshanks@nyx.cs.du.edu (David Shanks)
Subject: Re: Diamond Speed