In [10]:
import pandas as pd
import numpy as np
import swifter
import json
import math
import time
import re

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

def lemmatize(word:str) -> str:
    lemma = wordnet.morphy(word)
    if lemma is None: return word
    else: return lemma
    
def clean_tokens(raw_text:str, stop_words:[str]=[]) -> [str]:
    result = raw_text.replace("\\n", " ")
    result = [token.lower() for token in tokenizer.tokenize(result) if len(token) > 3]
    result = [lemmatize(token) for token in result if re.match(r"\D", token) is not None]
    return result

## Read data

In [4]:
start = time.time()

df = pd.read_csv("in.tsv",sep="\t", header=None)
df.drop(columns=[1,2,3,5], inplace=True)
df.columns = ["File", "Raw"]
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

print("Took", round(time.time() -start, 2), "seconds.")

Took 2.57 seconds.


# Tokenize

In [7]:
start = time.time()

df["Tokens"] = df["Raw"].apply(clean_tokens)

print("Took", round(time.time() -start, 2), "seconds.")

Took 66.15 seconds.


# Collection

In [5]:
start = time.time()

collection = []
for idx in df.index:
    for word in df.loc[idx, "Tokens"]:
        if word not in collection:
            collection.append(word)
            
print("Took", (time.time() -start) // 60, "minutes.")

Took 7.0 minutes.


## Dict2Idxs

In [6]:
start = time.time()

dictionary = {idx: word for idx, word in enumerate(sorted(collection))}
dictionary_rev = {word: idx for idx, word in dictionary.items()}

df["Idxs"] = df["Tokens"].apply(lambda x:
    list(map(lambda y: dictionary_rev[y], x))
)

print("Took", round(time.time() -start, 2), "seconds.")

Took 1.57 seconds.


## Count

In [7]:
start = time.time()

df["Count"] = df["Idxs"].apply(lambda x: {idx: x.count(idx) for idx in x})
df["Count"] = df["Count"].apply(lambda x:
    {k:v for k,v in sorted(x.items(), key=lambda i: i[1])[::-1]}
)

print("Took", (time.time() -start) // 60, "minutes.")

Took 14.0 minutes.


# Term Frequency

In [10]:
start = time.time()

#df["TF"] = df["Count"].apply(lambda x: {k:v/len(x) for k,v in x.items()})

df["TF"] = df.apply(lambda x: {
    k:v/len(x["Tokens"]) for k,v in x["Count"].items()
}, axis=1)

print("Took", round(time.time() -start, 2), "seconds.")

Took 24.34 seconds.


# Inverse Document Frequency

In [11]:
start = time.time()

docs_containing = {idx: sum(map(lambda x: idx in x, df["Count"])) for idx in dictionary}

total_docs = len(df)
dict_idf = {idx: math.log10(total_docs / docs_containing[idx]) for idx in dictionary}

df["TFIDF"] = df["TF"].apply(lambda x: {idx: val * dict_idf[idx] for idx, val in x.items()})

print("Took", round(time.time() -start, 2), "seconds.")

Took 40.18 seconds.


# Top10 Weights

# Save

In [12]:
def save_files() -> None:
    with open("dictionary.json", "w") as file:
        json.dump(dictionary, file)
    with open("dictionary_rev.json", "w") as file:
        json.dump(dictionary_rev, file)
    with open("docs_containing.json", "w") as file:
        json.dump(docs_containing, file)
    with open("dict_idf.json", "w") as file:
        json.dump(dict_idf, file)
    df.to_csv("tfidf.csv")

save_files()

In [13]:
garbage = []
for token in dictionary_rev:
    if re.match(r".*[0_9].*", token):
        garbage.append(token)
len(garbage)

2603