# Information extraction using tf-idf


In [55]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

loading in readme file and making them in to tokens

In [56]:
from os import listdir
from os.path import isfile, join

contents = []
PATH = "../data"
files = [f"{PATH}/{f}" for f in listdir(PATH) if isfile(join(PATH, f))]

for file_path in files: 
    with open(file_path, "r") as f:
        contents.append(f.read())

contents[0][:50]

'[![Cybernetically enhanced web apps: Svelte](https'

In [57]:

def load_stop_words(path:str="../stopwords.txt"):
   words = []
   with open(path, "r") as f: 
      words = f.read().split("\n")
   assert len(words) != 0, "no stop words were found"
   return words

In [58]:



def clean(content:list):
    ascii_char = [chr(i) for i in range(0,255)]
    numbers = "0123456789"
    non_acc_char =  "\n,.()[]{}`/:-_*=\\<>|&%@?!\"\'#" + numbers
    non_acc_tokens = ["https","www", "com", "org", "license"]
    stop_words = load_stop_words()
    for i, _ in enumerate(content):
        for c in non_acc_char:
            content[i] = content[i].replace(c, " ")
        content[i] = content[i].split(" ")
        content[i] = list(filter(lambda c: c != "", content[i]))
        content[i] = [t for t in content[i] if not t in non_acc_tokens ] 
        content[i] = [s.lower() for s in content[i] if all(c in ascii_char for c in s)]
        content[i] = [t for t in content[i] if not t in stop_words] 
        ps = nltk.stem.PorterStemmer()
        content[i] = [ps.stem(t) for t in content[i]] 


    return [" ".join(con) for con in content]


contents = clean(contents)
contents

['cybernet enhanc app svelt sveltej github asset banner png svelt dev npm version img shield npm svelt svg npmj packag svelt img shield npm svelt svg licens chat img shield discord label chat logo discord svelt dev chat svelt svelt build applic compil take declar compon convert effici javascript surgic updat dom learn svelt svelt dev discord chatroom svelt dev chat support svelt svelt mit licens sourc project ongo develop fantast volunt support effort backer collect opencollect svelt fund donat collect compens expens svelt develop host cost suffici donat receiv fund support svelt develop roadmap view roadmap svelt dev roadmap develop pull request encourag pick issu github sveltej svelt issu aissue+i aopen+sort aupdat desc instal svelt local bash git clone github sveltej svelt git svelt npm instal yarn instal depend specif packag version packag lock json build svelt build compil modul includ packag bash npm build watch continu rebuild packag npm link doc npmj cli link project local bash

In [59]:
def TfIdf(content:list):
    vectorizer = TfidfVectorizer()
    vecs = vectorizer.fit_transform(content)
    feature_names = vectorizer.get_feature_names_out()
    dense = vecs.todense()
    dense_list = dense.tolist()
    df = pd.DataFrame(dense_list, columns=feature_names)
    return df

tf_idf_data = TfIdf(contents)


In [60]:
from collections import defaultdict

def query_data(tf_idf_data:pd.DataFrame, query:str, cut_of:float = 0.1)->dict:
    query = clean([query])[0]
    query = query.split(" ")
    tf_idf = tf_idf_data.copy() # to no be distinctive
    query = tf_idf.columns[tf_idf.columns.isin(query)]
    tf_idf = tf_idf[query].T
    document_wights = tf_idf.sum(axis=0)
    document_wights = document_wights[document_wights > cut_of]
    print(document_wights.sum())
    return document_wights.to_dict()


In [61]:
query = "compiler and python abstract"
idxs = query_data(tf_idf_data, query)
print(idxs)
for idx in idxs:
    print(files[idx])

1.2926907896492035
{5: 0.1112173043217394, 7: 0.7681287449127536, 9: 0.13504119200566056, 10: 0.2783035484090499}
../data/gcc_README.txt
../data/cpython_README.rst
../data/pytorch_README.md
../data/rust_README.md


In [62]:
tf_idf_data.to_dict().keys()

dict_keys(['abc', 'abi', 'abid', 'abil', 'absolute', 'abstract', 'acceler', 'accept', 'access', 'account', 'acknowledg', 'action', 'activ', 'actual', 'ada', 'adam', 'add', 'addit', 'adher', 'adjust', 'admin', 'adopt', 'advanc', 'advantag', 'agvnhpsajp', 'agx', 'aissue', 'aka', 'alban', 'algebra', 'algorithm', 'align', 'alist', 'alloc', 'alomst', 'alpaca', 'alpha', 'alt', 'altern', 'altinstal', 'alykhan', 'amd', 'anaconda', 'andrea', 'android', 'annot', 'announc', 'answer', 'antiga', 'aopen', 'apach', 'apca', 'api', 'app', 'appear', 'appli', 'applic', 'approach', 'appwrit', 'arbitrari', 'arbitrarili', 'argument', 'armv', 'art', 'artifact', 'ash', 'asm', 'assembl', 'asset', 'assum', 'assumpt', 'asymptot', 'asynchron', 'audienc', 'aupdat', 'authinfo', 'author', 'auto', 'autograd', 'automat', 'auxiliari', 'available', 'avp', 'awar', 'awesom', 'awk', 'ax', 'axel', 'axelgard', 'azur', 'babel', 'babi', 'backend', 'backer', 'background', 'bad', 'badg', 'banner', 'base', 'bash', 'basic', 'bat',

In [63]:
print(len(files))
files

12


['../data/svelte_README.md',
 '../data/tensorflow_README.md',
 '../data/QmYaMCtgVF46b5jTJ9n95F5yTgw9ZYCRNupBeJrvnphjTW',
 '../data/react_README.md',
 '../data/vuejs_README.md',
 '../data/gcc_README.txt',
 '../data/linux_README.txt',
 '../data/cpython_README.rst',
 '../data/QmNb2LcaN8hzSNp4g7z8FtLsqvNyo3XDiR1gnDna1TWMqe',
 '../data/pytorch_README.md',
 '../data/rust_README.md',
 '../data/cira_README.md']