In [1]:
from os import listdir
from os.path import isfile, join

def load_data():
    PATH = "../data/"
    contents = []
    files = [f"{PATH}{f}" for f in listdir(PATH) if isfile(join(PATH, f))]
    for file_path in files: 
        with open(file_path, "r") as f:
            contents.append(f.read())
    return files, contents

import nltk


def load_stop_words(path:str="../stopwords.txt"):
   words = []
   with open(path, "r") as f: 
      words = f.read().split("\n")
   assert len(words) != 0, "no stop words were found"
   return words

def clean(content:list):
    ascii_char = [chr(i) for i in range(0,255)]
    numbers = "0123456789"
    non_acc_char =  "\n,.()[]{}`/:-_*=\\<>|&%@?!\"\'#" + numbers
    non_acc_tokens = ["https","www", "com", "org", "license"]
    stop_words = load_stop_words()
    for i, _ in enumerate(content):
        for c in non_acc_char:
            content[i] = content[i].replace(c, " ")
        content[i] = content[i].split(" ")
        content[i] = list(filter(lambda c: c != "", content[i]))
        content[i] = [t for t in content[i] if not t in non_acc_tokens ] 
        content[i] = [s.lower() for s in content[i] if all(c in ascii_char for c in s)]
        content[i] = [t for t in content[i] if not t in stop_words] 
        ps = nltk.stem.PorterStemmer()
        content[i] = [ps.stem(t) for t in content[i]] 


    return [" ".join(con) for con in content]

In [2]:

files, contents = load_data()
contents = clean(contents)

In [3]:
import requests
import re
def get_topics_gh():
    _topics = {}
    for file in files: 
        name = re.search(r'([^\/]+)(?=_[A-Za-z]+\.[A-Za-z]+)', file).group(1)
        r = requests.get(f"https://api.github.com/search/repositories?q={name}")
        resp = r.json()
        try:
            repo_topics = resp["items"][0]["topics"]
        except:
            print(file)
            _topics[file] = ["NONE"]
            continue
        _topics[file] = set()
        for topic in repo_topics:
            _topics[file].add(topic)
            for sub in topic.split("-"):
                _topics[file].add(sub)
        _topics[file] = list(_topics[file])

In [4]:
topics = {
    "../data/svelte_README.md": "JavaScript",
    "../data/tensorflow_README.md": "Machine Learning",
    "../data/react_README.md": "JavaScript",
    "../data/vuejs_README.md": "JavaScript",
    "../data/gcc_README.txt": "Compiler",
    "../data/linux_README.txt": "Operating System",
    "../data/cpython_README.rst": "Python Interpreter",
    "../data/pytorch_README.md": "Deep Learning",
    "../data/rust_README.md": "Systems Programming",
    "../data/cira_README.md": "Trading",
}

topics = [t.replace(" ", "").lower() for t in topics.values()]
topics

['javascript',
 'machinelearning',
 'javascript',
 'javascript',
 'compiler',
 'operatingsystem',
 'pythoninterpreter',
 'deeplearning',
 'systemsprogramming',
 'trading']

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [6]:
vectorizer = TfidfVectorizer()
mat = vectorizer.fit_transform(contents)

In [7]:
model = SVC(probability=True,kernel="linear", C=1.0)
model.fit(mat, files)

In [46]:
query = "java script"
_min = 0.1005
query = clean([query])[0]
print(query)
q_vec = vectorizer.transform([query])

prob_of_file = model.predict_proba(q_vec)[0]

print(prob_of_file)
res = [(prob,files[idx]) for idx, prob in enumerate(prob_of_file) if  prob > _min]
res.sort()
res.reverse()
print(res)
res = [f for _, f in res]
res

java script
[0.10052007 0.09965676 0.10055288 0.10062102 0.10032692 0.09900482
 0.09793657 0.10040647 0.1004584  0.10051609]
[(0.10062101936636254, '../data/vuejs_README.md'), (0.10055287623784966, '../data/react_README.md'), (0.10052006971057065, '../data/svelte_README.md'), (0.10051609343476169, '../data/cira_README.md')]


['../data/vuejs_README.md',
 '../data/react_README.md',
 '../data/svelte_README.md',
 '../data/cira_README.md']

In [41]:
model.predict(q_vec)[0]

'../data/cira_README.md'

In [47]:
files

['../data/svelte_README.md',
 '../data/tensorflow_README.md',
 '../data/react_README.md',
 '../data/vuejs_README.md',
 '../data/gcc_README.txt',
 '../data/linux_README.txt',
 '../data/cpython_README.rst',
 '../data/pytorch_README.md',
 '../data/rust_README.md',
 '../data/cira_README.md']

F1 score: 0.6
