# Information extraction using tf-idf


In [88]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

loading in readme file and making them in to tokens

In [89]:
from os import listdir
from os.path import isfile, join

contents = []
PATH = "../data"
files = [f"{PATH}/{f}" for f in listdir(PATH) if isfile(join(PATH, f))]

for file_path in files: 
    with open(file_path, "r") as f:
        contents.append(f.read())

contents[0][:50]

'[![Cybernetically enhanced web apps: Svelte](https'

In [90]:

def load_stop_words(path:str="../stopwords.txt"):
   words = []
   with open(path, "r") as f: 
      words = f.read().split("\n")
   assert len(words) != 0, "no stop words were found"
   return words

In [91]:



def clean(content:list):
    ascii_char = [chr(i) for i in range(0,255)]
    numbers = "0123456789"
    non_acc_char =  "\n,.()[]`/:-_*=\\<>|&%@?!\"\'#" + numbers
    non_acc_tokens = ["https","www", "com", "org", "license"]
    stop_words = load_stop_words()
    for i, _ in enumerate(content):
        for c in non_acc_char:
            content[i] = content[i].replace(c, " ")
        content[i] = content[i].split(" ")
        content[i] = list(filter(lambda c: c != "", content[i]))
        content[i] = [t for t in content[i] if not t in non_acc_tokens ] 
        content[i] = [s.lower() for s in content[i] if all(c in ascii_char for c in s)]
        content[i] = [t for t in content[i] if not t in stop_words] 
    return [" ".join(con) for con in content]


contents = clean(contents)
contents

['cybernetically enhanced apps svelte sveltejs github assets banner png svelte dev npm version img shields npm svelte svg npmjs package svelte img shields npm svelte svg license chat img shields discord label chat logo discord svelte dev chat svelte svelte build applications compiler takes declarative components converts efficient javascript surgically updates dom learn svelte svelte dev discord chatroom svelte dev chat supporting svelte svelte mit licensed source project ongoing development fantastic volunteers support efforts backer collective opencollective svelte funds donated collective compensating expenses svelte development hosting costs sufficient donations received funds support svelte development roadmap view roadmap svelte dev roadmap development pull requests encouraged pick issue github sveltejs svelte issues aissue+is aopen+sort aupdated desc install svelte locally bash git clone github sveltejs svelte git svelte npm install yarn install dependencies specific package ver

In [92]:
def TfIdf(content:list):
    vectorizer = TfidfVectorizer()
    vecs = vectorizer.fit_transform(content)
    feature_names = vectorizer.get_feature_names_out()
    dense = vecs.todense()
    dense_list = dense.tolist()
    df = pd.DataFrame(dense_list, columns=feature_names)
    return df

tf_idf_data = TfIdf(contents)


In [93]:
from collections import defaultdict

def query_data(tf_idf_data:dict, query:str)->list:
    query = clean([query])[0]
    print(query.split(" "))
    words_tf_idf = {}
    query_words = query.split(" ")
    for word in query_words: 
        if word in tf_idf_data.keys():
            words_tf_idf[word] = tf_idf_data[word]

    result = []
    highest = 0.0
    for word, tf_idf in words_tf_idf.items():
        for idx, val in tf_idf.items(): 
            if val > highest: 
                highest = val 
                result.append((val, idx))
    result.sort()
    result.reverse()
    lookup = set()
    result = [idx for _, idx in result if idx not in lookup and lookup.add(idx) is None]
    return result


In [97]:
query = "I'm interested in working on javascript framework"
idxs = query_data(tf_idf_data.to_dict(), query)
for idx in idxs:    
    print(files[idx])

['javascript', 'framework']
../data/react_README.md
../data/svelte_README.md


In [95]:
tf_idf_data["cira"]

0    0.000000
1    0.000000
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
7    0.000000
8    0.000000
9    0.796774
Name: cira, dtype: float64

In [96]:
files

['../data/svelte_README.md',
 '../data/tensorflow_README.md',
 '../data/react_README.md',
 '../data/vuejs_README.md',
 '../data/gcc_README.txt',
 '../data/linux_README.txt',
 '../data/cpython_README.rst',
 '../data/pytorch_README.md',
 '../data/rust_README.md',
 '../data/cira_README.md']