# Information extraction using tf-idf


In [14]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer

loading in readme file and making them in to tokens

In [15]:
from os import listdir
from os.path import isfile, join

contents = []
PATH = "../data"
files = [f"{PATH}/{f}" for f in listdir(PATH) if isfile(join(PATH, f))]

for file_path in files: 
    with open(file_path, "r") as f:
        contents.append(f.read())

contents[0][:50]

'[![Cybernetically enhanced web apps: Svelte](https'

In [16]:

def load_stop_words(path:str="../stopwords.txt"):
   words = []
   with open(path, "r") as f: 
      words = f.read().split("\n")
   assert len(words) != 0, "no stop words were found"
   return words

In [17]:

import nltk
def clean(content:list):
    ascii_char = [chr(i) for i in range(0,255)]
    numbers = "0123456789"
    non_acc_char =  "\n,.()[]{}`/:-_*=\\<>|&%@?!\"\'#" + numbers
    non_acc_tokens = ["https","www", "com", "org", "license"]
    stop_words = load_stop_words()
    for i, _ in enumerate(content):
        for c in non_acc_char:
            content[i] = content[i].replace(c, " ")
        content[i] = content[i].split(" ")
        content[i] = list(filter(lambda c: c != "", content[i]))
        content[i] = [t for t in content[i] if not t in non_acc_tokens ] 
        content[i] = [s.lower() for s in content[i] if all(c in ascii_char for c in s)]
        content[i] = [t for t in content[i] if not t in stop_words] 
        ps = nltk.stem.PorterStemmer()
        content[i] = [ps.stem(t) for t in content[i]] 


    return [" ".join(con) for con in content]


contents = clean(contents)
contents

['cybernet enhanc app svelt sveltej github asset banner png svelt dev npm version img shield npm svelt svg npmj packag svelt img shield npm svelt svg licens chat img shield discord label chat logo discord svelt dev chat svelt svelt build applic compil take declar compon convert effici javascript surgic updat dom learn svelt svelt dev discord chatroom svelt dev chat support svelt svelt mit licens sourc project ongo develop fantast volunt support effort backer collect opencollect svelt fund donat collect compens expens svelt develop host cost suffici donat receiv fund support svelt develop roadmap view roadmap svelt dev roadmap develop pull request encourag pick issu github sveltej svelt issu aissue+i aopen+sort aupdat desc instal svelt local bash git clone github sveltej svelt git svelt npm instal yarn instal depend specif packag version packag lock json build svelt build compil modul includ packag bash npm build watch continu rebuild packag npm link doc npmj cli link project local bash

In [18]:
def TfIdf(content:list):
    vectorizer = TfidfVectorizer()
    vecs = vectorizer.fit_transform(content)
    feature_names = vectorizer.get_feature_names_out()
    dense = vecs.todense()
    dense_list = dense.tolist()
    df = pd.DataFrame(dense_list, columns=feature_names)
    return df

tf_idf_data = TfIdf(contents)


In [19]:
from collections import defaultdict

def query_data(tf_idf_data:dict, query:str)->list:
    query = clean([query])[0]
    print(query.split(" "))
    words_tf_idf = {}
    query_words = query.split(" ")
    for word in query_words: 
        if word in tf_idf_data.keys():
            words_tf_idf[word] = tf_idf_data[word]

    result = []
    highest = 0.0
    for word, tf_idf in words_tf_idf.items():
        for idx, val in tf_idf.items(): 
            if val > highest: 
                highest = val 
                result.append((val, idx))
    result.sort()
    result.reverse()
    lookup = set()
    result = [idx for _, idx in result if idx not in lookup and lookup.add(idx) is None]
    return result


In [20]:
query = "compilers"
idxs = query_data(tf_idf_data.to_dict(), query)
for idx in idxs:    
    print(files[idx])

['compil']
../data/rust_README.md
../data/gcc_README.txt
../data/svelte_README.md


In [21]:
tf_idf_data.to_dict().keys()

dict_keys(['abi', 'abid', 'abil', 'abstract', 'acceler', 'accept', 'access', 'account', 'acknowledg', 'action', 'activ', 'actual', 'adam', 'add', 'addit', 'adher', 'adjust', 'admin', 'adopt', 'advanc', 'advantag', 'agvnhpsajp', 'agx', 'aissue', 'aka', 'alban', 'algebra', 'algorithm', 'align', 'alloc', 'alomst', 'alpaca', 'alpha', 'alt', 'altern', 'altinstal', 'alykhan', 'amd', 'anaconda', 'andrea', 'android', 'annot', 'announc', 'answer', 'antiga', 'aopen', 'apach', 'apca', 'api', 'app', 'appear', 'appli', 'applic', 'approach', 'appwrit', 'arbitrari', 'arbitrarili', 'argument', 'armv', 'art', 'artifact', 'asset', 'assum', 'assumpt', 'asynchron', 'audienc', 'aupdat', 'author', 'auto', 'autograd', 'automat', 'auxiliari', 'available', 'awar', 'awesom', 'ax', 'axel', 'axelgard', 'azur', 'babi', 'backend', 'backer', 'bad', 'badg', 'banner', 'base', 'bash', 'basic', 'bat', 'batch', 'beginn', 'behav', 'behavior', 'beopen', 'bestpractic', 'bib', 'bigger', 'bin', 'binari', 'bind', 'bintray', 'b

In [22]:
files

['../data/svelte_README.md',
 '../data/tensorflow_README.md',
 '../data/react_README.md',
 '../data/vuejs_README.md',
 '../data/gcc_README.txt',
 '../data/linux_README.txt',
 '../data/cpython_README.rst',
 '../data/pytorch_README.md',
 '../data/rust_README.md',
 '../data/cira_README.md']

In [23]:
len(files)

10

### K-Means cluster  

In [25]:
from sklearn import cluster
import numpy as np

In [28]:



def keyword_cluster(content:list):
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(raw_documents=content)
    X = pd.DataFrame(features.toarray(), index=content, columns=vectorizer.get_feature_names_out())
    model = cluster.AffinityPropagation()
    X["predict"] = model.fit_predict(X)
    return X["predict"] 

keyword_cluster(contents)

cybernet enhanc app svelt sveltej github asset banner png svelt dev npm version img shield npm svelt svg npmj packag svelt img shield npm svelt svg licens chat img shield discord label chat logo discord svelt dev chat svelt svelt build applic compil take declar compon convert effici javascript surgic updat dom learn svelt svelt dev discord chatroom svelt dev chat support svelt svelt mit licens sourc project ongo develop fantast volunt support effort backer collect opencollect svelt fund donat collect compens expens svelt develop host cost suffici donat receiv fund support svelt develop roadmap view roadmap svelt dev roadmap develop pull request encourag pick issu github sveltej svelt issu aissue+i aopen+sort aupdat desc instal svelt local bash git clone github sveltej svelt git svelt npm instal yarn instal depend specif packag version packag lock json build svelt build compil modul includ packag bash npm build watch continu rebuild packag npm link doc npmj cli link project local bash n