# Information extraction using tf-idf


In [51]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

loading in readme file and making them in to tokens

In [52]:
from os import listdir
from os.path import isfile, join

contents = []
PATH = "../data"
files = [f"{PATH}/{f}" for f in listdir(PATH) if isfile(join(PATH, f))]

for file_path in files: 
    with open(file_path, "r") as f:
        contents.append(f.read())

contents[0][:50]

'[![Cybernetically enhanced web apps: Svelte](https'

In [53]:

def load_stop_words(path:str="../stopwords.txt"):
   words = []
   with open(path, "r") as f: 
      words = f.read().split("\n")
   assert len(words) != 0, "no stop words were found"
   return words

In [54]:



def clean(content:list):
    ascii_char = [chr(i) for i in range(0,255)]
    numbers = "0123456789"
    non_acc_char =  "\n,.()[]`/:-_*=\\<>|&%@?!\"\'#" + numbers
    non_acc_tokens = ["https","www", "com", "org", "license"]
    stop_words = load_stop_words()
    for i, _ in enumerate(content):
        for c in non_acc_char:
            content[i] = content[i].replace(c, " ")
        content[i] = content[i].split(" ")
        content[i] = list(filter(lambda c: c != "", content[i]))
        content[i] = [t for t in content[i] if not t in non_acc_tokens ] 
        content[i] = [s.lower() for s in content[i] if all(c in ascii_char for c in s)]
        content[i] = [t for t in content[i] if not t in stop_words] 
    return [" ".join(con) for con in content]


contents = clean(contents)
contents

['cybernetically enhanced web apps svelte sveltejs github io assets banner png svelte dev npm version img shields io npm svelte svg npmjs package svelte img shields io npm svelte svg license md chat img shields io discord label chat logo discord svelte dev chat svelte svelte new way build web applications compiler takes declarative components converts efficient javascript surgically updates dom learn svelte website svelte dev discord chatroom svelte dev chat supporting svelte svelte mit licensed open source project ongoing development made possible entirely fantastic volunteers like support efforts consider backer open collective opencollective svelte funds donated open collective will compensating expenses svelte development hosting costs sufficient donations received funds may support svelte development directly roadmap may view roadmap svelte dev roadmap like see currently working development pull requests encouraged welcome pick issue github sveltejs svelte issues aissue+is aopen+s

In [55]:
def TfIdf(content:list):
    vectorizer = TfidfVectorizer()
    vecs = vectorizer.fit_transform(content)
    feature_names = vectorizer.get_feature_names_out()
    dense = vecs.todense()
    dense_list = dense.tolist()
    df = pd.DataFrame(dense_list, columns=feature_names)
    return df

tf_idf_data = TfIdf(contents)


In [56]:
from collections import defaultdict

def query_data(tf_idf_data:dict, query:str, result_size:int=2)->list:
    query = clean([query])[0]
    result = []
    words_tf_idf = {}
    query = query.lower()
    query_words = query.split(" ")
    for word in query_words: 
        if word in tf_idf_data.keys():
            words_tf_idf[word] = tf_idf_data[word]

    cum_tf_idf = defaultdict(int)
    for word, tf_idf in words_tf_idf.items():
        for idx, val in tf_idf.items(): 
            cum_tf_idf[idx] += val

    return list(dict(sorted(cum_tf_idf.items(), key=lambda item: item[1])).keys())[:result_size]


In [57]:
query = "I want to build web apps "
idxs = query_data(tf_idf_data.to_dict(), query)
for idx in idxs:    
    print(files[idx])

../data/gcc_README.txt
../data/vuejs_README.md


In [58]:
tf_idf_data

Unnamed: 0,abi,abide,ability,abis,abstractions,accelerate,accelerates,acceleration,accepted,access,...,yeosh,your,youtube,yuandong,yuxi,zachary,zagoruyko,zeming,zenodo,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.006978,0.0,0.0,0.0,0.0,0.0,0.0,0.006978,0.0,...,0.0,0.0,0.011788,0.0,0.0,0.0,0.0,0.0,0.020935,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009379,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.016599,0.0,0.0,0.0,0.0,0.0,0.0,0.014019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.006579,0.006579,0.006579,0.013158,0.0,0.0,...,0.006579,0.019737,0.011113,0.006579,0.0,0.006579,0.006579,0.006579,0.0,0.006579
7,0.128389,0.0,0.0,0.014265,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
files

['../data/svelte_README.md',
 '../data/tensorflow_README.md',
 '../data/react_README.md',
 '../data/vuejs_README.md',
 '../data/gcc_README.txt',
 '../data/cpython_README.rst',
 '../data/pytorch_README.md',
 '../data/rust_README.md',
 '../data/cira_README.md']