In [1]:
from os import listdir
from os.path import isfile, join

def load_data():
    PATH = "../data/"
    contents = []
    files = [f"{PATH}{f}" for f in listdir(PATH) if isfile(join(PATH, f))]
    for file_path in files: 
        with open(file_path, "r") as f:
            contents.append(f.read())
    return files, contents



In [2]:
import nltk


def load_stop_words(path:str="../stopwords.txt"):
   words = []
   with open(path, "r") as f: 
      words = f.read().split("\n")
   assert len(words) != 0, "no stop words were found"
   return words

def clean(content:list):
    ascii_char = [chr(i) for i in range(0,255)]
    numbers = "0123456789"
    non_acc_char =  "\n,.()[]{}`/:-_*=\\<>|&%@?!\"\'#" + numbers
    non_acc_tokens = ["https","www", "com", "org", "license"]
    stop_words = load_stop_words()
    for i, _ in enumerate(content):
        for c in non_acc_char:
            content[i] = content[i].replace(c, " ")
        content[i] = content[i].split(" ")
        content[i] = list(filter(lambda c: c != "", content[i]))
        content[i] = [t for t in content[i] if not t in non_acc_tokens ] 
        content[i] = [s.lower() for s in content[i] if all(c in ascii_char for c in s)]
        content[i] = [t for t in content[i] if not t in stop_words] 
        ps = nltk.stem.PorterStemmer()
        content[i] = [ps.stem(t) for t in content[i]] 


    return [" ".join(con) for con in content]



In [4]:

files, contents = load_data()
contents = clean(contents)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
mat = vectorizer.fit_transform(contents)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

query = "build web apps with javascript"

query = clean([query])[0]
print(query)
q_vec = vectorizer.transform([query])
simi = cosine_similarity(q_vec, mat)
print(simi)
idxs = simi.argsort()[0]
simi_limit = 0.037
[files[idx] for idx in idxs if simi[0][idx] > simi_limit]

build app javascript
[[0.0724035  0.02881532 0.10567176 0.00256731 0.         0.03792033
  0.03832463 0.03578802 0.09258942 0.01062844]]


['../data/linux_README.txt',
 '../data/cpython_README.rst',
 '../data/svelte_README.md',
 '../data/rust_README.md',
 '../data/react_README.md']