In [1]:
import os
from nltk.corpus import stopwords
import re
# Keep the English stop words in a frozenset rather than a list: the visible
# use of this name is the `word not in stop_words` membership test in
# preprocess_input, which is a constant-time lookup on a set but a linear scan
# on a list. frozenset also signals that the collection is not meant to change.
stop_words = frozenset(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer instance; preprocess_input calls pstemmer.stem()
# on every token it keeps.
pstemmer = PorterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Switch into the data directory so that the bare os.listdir() / open(name)
# calls in the loading cell resolve against it.
# os.chdir changes process-wide state, so the call is guarded: re-running this
# cell after a successful chdir would otherwise look for data/data and raise.
if os.path.isdir('data'):
    os.chdir('data')
elif os.path.basename(os.getcwd()) != 'data':
    # Neither a 'data' sub-directory nor already inside one: fail loudly
    # instead of silently reading files from the wrong place.
    raise FileNotFoundError("directory 'data' not found under " + os.getcwd())

In [3]:
# Read every regular file in the current working directory into `lst`,
# one string per file, in the order os.listdir() returns the names.
lst = []
for name in os.listdir():
    # os.listdir() also returns sub-directories (e.g. Jupyter's
    # .ipynb_checkpoints); open() would raise on those, so skip them.
    if not os.path.isfile(name):
        continue
    # The context manager closes each handle as soon as it has been read,
    # instead of leaving one open descriptor per file until garbage collection.
    with open(name) as handle:
        lst.append(handle.read())

In [4]:
# Display the raw texts that were read from the files.
# NOTE(review): this prints the whole list; a slice such as lst[:5] would keep
# the cell output short.
lst

['This April patch   mini DLC for Borderlands 3 is amazing  the events  the new difficulty modifiers  the extra gear stats make the game fresh once again ',
 'This latest April patch   mini DLC for Ultimate Borderlands Online 3 is amazing  The story events  the brand new level difficulty modifiers  the ramp to up power in gear stats  all combine to make the game fresh yet again ',
 'Borderlands 3  Weapons  Love and Tentacles  10 best new legendary weapons  rank ift tt   2Xj2rmu',
 'Super tempted to get a simple black heart tattoo right above my shoulder like Ellie from Borderlands 1  idk if I d like it but I just love the thought that hopefully a very small pool of people would know WHY I get that tattoo where Id did   cause simple black hearts are pretty common lol',
 '4 player local Borderlands 3  Such a great weekend ',
 '4 2 player local play Borderlands 3  Such a great weekend ',
 'Sly   Borderlands haha  The mission  The demon in the dark   Moria mines  Nice ',
 'Sly G  Borderlan

In [5]:
def preprocess_input(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is lower-cased, every character outside ``a-z`` is replaced by a
    space, and the result is split on whitespace. Tokens found in the
    module-level ``stop_words`` are dropped; each remaining token is passed
    through ``pstemmer.stem``. The stems are joined with single spaces, so an
    input with no surviving tokens yields the empty string.
    """
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    kept_stems = []
    for token in letters_only.split():
        # str.split() with no separator never yields empty strings, so the
        # stop-word check is the only filter that is needed.
        if token in stop_words:
            continue
        kept_stems.append(pstemmer.stem(token))
    return ' '.join(kept_stems)

In [6]:
# Run every raw document through preprocess_input, keeping the order of `lst`
# so that position i in clean_lst corresponds to position i in lst.
clean_lst = [preprocess_input(document) for document in lst]

In [7]:
import pandas as pd

In [8]:
# Display the cleaned (lower-cased, stop-word-filtered, stemmed) documents.
# NOTE(review): this prints the whole list; a slice such as clean_lst[:5]
# would keep the cell output short.
clean_lst

['april patch mini dlc borderland amaz event new difficulti modifi extra gear stat make game fresh',
 'latest april patch mini dlc ultim borderland onlin amaz stori event brand new level difficulti modifi ramp power gear stat combin make game fresh yet',
 'borderland weapon love tentacl best new legendari weapon rank ift tt xj rmu',
 'super tempt get simpl black heart tattoo right shoulder like elli borderland idk like love thought hope small pool peopl would know get tattoo id caus simpl black heart pretti common lol',
 'player local borderland great weekend',
 'player local play borderland great weekend',
 'sli borderland haha mission demon dark moria mine nice',
 'sli g borderland haha mission high demon district dark mine moria nice',
 'play borderland mistak bc know die even replay end mess',
 'play borderland final mistak bc know die also play replay im gonna g mess end',
 'run borderland xbox game dope tryna run squad man',
 'never run borderland xbox video game go dope tryna ru

In [9]:
# Wrap the raw texts in a one-column DataFrame (no column name is given, so the
# column label is 0); search() copies this frame and attaches similarity scores.
raw = pd.DataFrame(lst)

In [10]:
# Fit a TF-IDF vectorizer on the cleaned documents. `vec` stays in scope
# because search() reuses it to transform query text into the same features.
vec = TfidfVectorizer()
vectorized_text = vec.fit_transform(clean_lst)
# Convert the fitted matrix to a dense array so it can be wrapped in a
# DataFrame in the next cell.
vectorized_array = vectorized_text.toarray()

In [11]:
# Label the TF-IDF array with the vectorizer's feature names: one row per
# document in clean_lst, one column per feature.
vectorization_matrix = pd.DataFrame(vectorized_array, columns=vec.get_feature_names_out())

In [12]:
# Display the document-by-feature TF-IDF matrix.
vectorization_matrix

Unnamed: 0,absolut,access,accid,accomplish,account,achiev,acnh,activ,actual,ad,...,ye,yeah,year,yed,yesterday,yet,youtu,youtub,zer,ztedpr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.220435,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [13]:
def search(search_text,search_area,original,top_n=5):
    """Return the rows of ``original`` most similar to ``search_text``.

    The query is cleaned with ``preprocess_input``, transformed with the
    module-level fitted vectorizer ``vec``, and compared against every row of
    ``search_area`` by cosine similarity.

    Parameters
    ----------
    search_text : str
        Free-text query.
    search_area : array-like or DataFrame
        Document vectors in the feature space of ``vec``, one row per
        document. Row i must describe the same document as row i of
        ``original``.
    original : pandas.DataFrame
        Frame to rank. It is not modified; the result is a new frame.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``original`` with an added ``'sim'`` column,
        sorted by ``'sim'`` in descending order.

    Raises
    ------
    ValueError
        If ``top_n`` is negative, or if ``search_area`` and ``original`` do
        not have the same number of rows.
    """
    if top_n < 0:
        # A negative bound would make iloc drop rows from the end instead of
        # limiting the result, which is never what a caller means here.
        raise ValueError("top_n must be non-negative, got %r" % (top_n,))

    clean_search_sentence = preprocess_input(search_text)
    query_vector = vec.transform([clean_search_sentence]).toarray()

    # cosine_similarity returns shape (1, n_documents); take the single row.
    scores = cosine_similarity(query_vector, search_area)[0]
    if len(scores) != len(original):
        raise ValueError(
            "search_area has %d rows but original has %d; they must be row-aligned"
            % (len(scores), len(original))
        )

    # assign() builds a new frame, so the caller's `original` stays untouched.
    ranked = original.assign(sim=scores).sort_values(by='sim', ascending=False)
    return ranked.iloc[:top_n, :]

In [14]:
# Rank the raw texts against the query 'twitter', using the TF-IDF matrix as
# the search space.
results = search('twitter',vectorization_matrix,raw)

In [15]:
# Display the top matches with their similarity scores.
results

Unnamed: 0,0,sim
91,That s really all you left me GearboxOffici...,0.289215
24,Best border character y pic twitter com XR9...,0.280815
196,I know it doesn t look like much but Borderla...,0.250636
197,N I know it doesnt look like much but borderla...,0.241103
185,Have u played borderlands 3 Pretty fun imo A...,0.237585


In [16]:
# Full text of the second-ranked match: row position 1, column position 0
# (the raw-text column from `raw`).
results.iloc[1,0]

'Best border character  y pic twitter com   XR9P3UrsqG'