In [1]:
from os import listdir
from os.path import isfile, join

def load_data():
    PATH = "../data/"
    contents = []
    files = [f"{PATH}{f}" for f in listdir(PATH) if isfile(join(PATH, f))]
    for file_path in files: 
        with open(file_path, "r") as f:
            contents.append(f.read())
    return files, contents

import nltk


def load_stop_words(path:str="../stopwords.txt"):
   words = []
   with open(path, "r") as f: 
      words = f.read().split("\n")
   assert len(words) != 0, "no stop words were found"
   return words

def clean(content:list):
    ascii_char = [chr(i) for i in range(0,255)]
    numbers = "0123456789"
    non_acc_char =  "\n,.()[]{}`/:-_*=\\<>|&%@?!\"\'#" + numbers
    non_acc_tokens = ["https","www", "com", "org", "license"]
    stop_words = load_stop_words()
    for i, _ in enumerate(content):
        for c in non_acc_char:
            content[i] = content[i].replace(c, " ")
        content[i] = content[i].split(" ")
        content[i] = list(filter(lambda c: c != "", content[i]))
        content[i] = [t for t in content[i] if not t in non_acc_tokens ] 
        content[i] = [s.lower() for s in content[i] if all(c in ascii_char for c in s)]
        content[i] = [t for t in content[i] if not t in stop_words] 
        ps = nltk.stem.PorterStemmer()
        content[i] = [ps.stem(t) for t in content[i]] 


    return [" ".join(con) for con in content]

In [2]:
files, contents = load_data()
contents = clean(contents)
files_idxs = [i for _, i in enumerate(files)]

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
mat = vectorizer.fit_transform(contents)

In [5]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(mat, files_idxs)

In [17]:
query = "cira"
query = clean([query])[0]
q_vec = vectorizer.transform([query])
_min = 0.08
prob_of_file = model.predict_proba(q_vec)[0]
print(prob_of_file)
res = [(prob,files[idx]) for idx, prob in enumerate(prob_of_file) if  prob > _min]
res.sort()
res.reverse()
res = [f for _, f in res]
res

[0.05 0.01 0.3  0.25 0.01 0.1  0.01 0.13 0.04 0.1 ]


['../data/react_README.md',
 '../data/vuejs_README.md',
 '../data/pytorch_README.md',
 '../data/linux_README.txt',
 '../data/cira_README.md']