# Information extraction using tf-idf


In [2]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

loading in readme file and making them in to tokens

In [3]:
from os import listdir
from os.path import isfile, join

contents = []
PATH = "../data"
files = [f"{PATH}/{f}" for f in listdir(PATH) if isfile(join(PATH, f))]

for file_path in files: 
    with open(file_path, "r") as f:
        contents.append(f.read())

contents[0][:50]

'[![Cybernetically enhanced web apps: Svelte](https'

In [4]:

def load_stop_words(path:str="../stopwords.txt"):
   words = []
   with open(path, "r") as f: 
      words = f.read().split("\n")
   assert len(words) != 0, "no stop words were found"
   return words

In [5]:



def clean(content:list):
    ascii_char = [chr(i) for i in range(0,255)]
    numbers = "0123456789"
    non_acc_char =  "\n,.()[]{}`/:-_*=\\<>|&%@?!\"\'#" + numbers
    non_acc_tokens = ["https","www", "com", "org", "license"]
    stop_words = load_stop_words()
    for i, _ in enumerate(content):
        for c in non_acc_char:
            content[i] = content[i].replace(c, " ")
        content[i] = content[i].split(" ")
        content[i] = list(filter(lambda c: c != "", content[i]))
        content[i] = [t for t in content[i] if not t in non_acc_tokens ] 
        content[i] = [s.lower() for s in content[i] if all(c in ascii_char for c in s)]
        content[i] = [t for t in content[i] if not t in stop_words] 
        for j, word in enumerate(content[i]):
            if word[-1] == "s": 
                content[i][j] = word[:-1]


    return [" ".join(con) for con in content]


contents = clean(contents)
contents

['cybernetically enhanced app svelte sveltej github asset banner png svelte dev npm version img shield npm svelte svg npmj package svelte img shield npm svelte svg license chat img shield discord label chat logo discord svelte dev chat svelte svelte build application compiler take declarative component convert efficient javascript surgically update dom learn svelte svelte dev discord chatroom svelte dev chat supporting svelte svelte mit licensed source project ongoing development fantastic volunteer support effort backer collective opencollective svelte fund donated collective compensating expense svelte development hosting cost sufficient donation received fund support svelte development roadmap view roadmap svelte dev roadmap development pull request encouraged pick issue github sveltej svelte issue aissue+i aopen+sort aupdated desc install svelte locally bash git clone github sveltej svelte git svelte npm install yarn install dependencie specific package version package lock json bu

In [6]:
def TfIdf(content:list):
    vectorizer = TfidfVectorizer()
    vecs = vectorizer.fit_transform(content)
    feature_names = vectorizer.get_feature_names_out()
    dense = vecs.todense()
    dense_list = dense.tolist()
    df = pd.DataFrame(dense_list, columns=feature_names)
    return df

tf_idf_data = TfIdf(contents)


In [7]:
from collections import defaultdict

def query_data(tf_idf_data:dict, query:str)->list:
    query = clean([query])[0]
    print(query.split(" "))
    words_tf_idf = {}
    query_words = query.split(" ")
    for word in query_words: 
        if word in tf_idf_data.keys():
            words_tf_idf[word] = tf_idf_data[word]

    result = []
    highest = 0.0
    for word, tf_idf in words_tf_idf.items():
        for idx, val in tf_idf.items(): 
            if val > highest: 
                highest = val 
                result.append((val, idx))
    result.sort()
    result.reverse()
    lookup = set()
    result = [idx for _, idx in result if idx not in lookup and lookup.add(idx) is None]
    return result


In [8]:
query = "compilers"
idxs = query_data(tf_idf_data.to_dict(), query)
for idx in idxs:    
    print(files[idx])

['compiler']
../data/gcc_README.txt
../data/svelte_README.md


In [9]:
tf_idf_data.to_dict().keys()

dict_keys(['abi', 'abide', 'ability', 'abstraction', 'accelerate', 'acceleration', 'accepted', 'acces', 'accounting', 'acknowledgment', 'action', 'activate', 'actual', 'adam', 'add', 'additional', 'additionally', 'adhere', 'adjust', 'adjusting', 'admin', 'adoptable', 'adoption', 'advanced', 'advantage', 'agvnhpsajp', 'agx', 'aissue', 'aka', 'alban', 'algebra', 'algorithm', 'algorithmic', 'align', 'allocator', 'alomst', 'alpaca', 'alpha', 'alt', 'alternative', 'altinstall', 'alykhan', 'amd', 'anaconda', 'andrea', 'android', 'annotation', 'announcement', 'answer', 'antiga', 'aopen', 'apache', 'apca', 'api', 'app', 'appear', 'applicable', 'application', 'apply', 'approachable', 'appwrite', 'arbitrarily', 'arbitrary', 'argument', 'armv', 'art', 'artifact', 'asset', 'assume', 'assumption', 'asynchronou', 'audience', 'aupdated', 'author', 'auto', 'autograd', 'automatic', 'automatically', 'auxiliary', 'available', 'aware', 'awesome', 'ax', 'axel', 'axelgard', 'azure', 'baby', 'backend', 'back

In [10]:
files

['../data/svelte_README.md',
 '../data/tensorflow_README.md',
 '../data/react_README.md',
 '../data/vuejs_README.md',
 '../data/gcc_README.txt',
 '../data/linux_README.txt',
 '../data/cpython_README.rst',
 '../data/pytorch_README.md',
 '../data/rust_README.md',
 '../data/cira_README.md']

In [11]:
len(files)

10

### K-Means cluster  

In [12]:
from sklearn.cluster import KMeans

In [32]:
def k_means(content:list):
    global files
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(raw_documents=content)
    k = 10
    model = KMeans(n_clusters=k, init="k-means++", max_iter=1000, n_init=1)
    model.fit(features)

    X = pd.DataFrame(features.toarray(), index=[files[i][8:].replace("_README", "") for i,_ in enumerate(content)], columns=vectorizer.get_feature_names_out())
    centroids = model.cluster_centers_
    X["predict"] = model.fit_predict(features)    

    print(X.head())
 
    return model 

model = k_means(contents)

               abi     abide  ability  abstraction  accelerate  acceleration   
svelte.md      0.0  0.000000      0.0          0.0         0.0           0.0  \
tensorflow.md  0.0  0.007032      0.0          0.0         0.0           0.0   
react.md       0.0  0.000000      0.0          0.0         0.0           0.0   
vuejs.md       0.0  0.000000      0.0          0.0         0.0           0.0   
gcc.txt        0.0  0.000000      0.0          0.0         0.0           0.0   

               accepted     acces  accounting  acknowledgment  ...      yarn   
svelte.md      0.000000  0.027755         0.0             0.0  ...  0.032649  \
tensorflow.md  0.007032  0.000000         0.0             0.0  ...  0.000000   
react.md       0.000000  0.000000         0.0             0.0  ...  0.000000   
vuejs.md       0.000000  0.000000         0.0             0.0  ...  0.000000   
gcc.txt        0.000000  0.000000         0.0             0.0  ...  0.000000   

               yeosh   youtube  yuando

In [16]:
model.cluster_centers_

array([[0.15164212, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00703214, 0.        , ..., 0.        , 0.        ,
        0.02109643],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.01671227, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.00660349, 0.00660349,
        0.        ]])

In [None]:
import matplotlib as plt
