In [2]:
import csv
# Training split; each CSV row is read through its 'index', 'text' and 'Author' columns.
path = './data/mega_train.csv'

# Parallel lists: position k of every list comes from the same CSV row.
indexes, texts, authors = [], [], []
with open(path, newline='', encoding='utf-8') as csvfile:
    for record in csv.DictReader(csvfile):
        indexes.append(record['index'])
        texts.append(record['text'])
        authors.append(record['Author'])
import re
# Group consecutive rows by author: documents[k] holds the tokenised articles of
# the k-th run of identical 'Author' values. Later cells index documents[0..49],
# so exactly NUM_AUTHORS groups are always produced (trailing ones may be empty).
NUM_AUTHORS = 50
# Use the real row count: a fixed bound of 4000 raised IndexError on shorter
# files and silently ignored the tail of longer ones.
num_rows = len(authors)

documents = []
j = 0
for _ in range(NUM_AUTHORS):
    author = []
    if j < num_rows:
        cur_author = authors[j]

    while j < num_rows and authors[j] == cur_author:
        text = texts[j].lower()
        # Drop everything that is neither a word character nor whitespace,
        # then drop digits, then split on whitespace.
        text_no_punctuation_digits = re.sub(r'[^\w\s]', '', text)
        article = re.sub(r'\d+', '', text_no_punctuation_digits).split()
        author.append(article)
        j += 1
    documents.append(author)
print(len(documents))



50


### Compute IDF for words in all articles

In [3]:
from math import log
from collections import Counter

def compute_idf(documents):
    """Return a binary IDF mask over every word of the corpus.

    ``documents`` is a list with one entry per author, each entry being a list
    of tokenised articles (lists of words). With N the total number of articles
    and df(word) the number of articles containing the word, the result maps
    word -> 0 when log(N / df) < 1 and word -> 1 otherwise.
    """
    total_articles = sum(len(author_articles) for author_articles in documents)

    # Document frequency: each article contributes at most once per word.
    doc_freq = Counter(
        token
        for author_articles in documents
        for article in author_articles
        for token in set(article)
    )

    return {
        token: (0 if log(total_articles / n_articles) < 1 else 1)
        for token, n_articles in doc_freq.items()
    }

# Corpus-wide 0/1 mask per word, as returned by compute_idf.
idf_scores = compute_idf(documents)
# NOTE(review): this prints the whole vocabulary dict; consider showing only a small sample.
print(idf_scores)  # Display the IDF scores for each word




### Compute IDF for a single author

In [4]:
from math import log
from collections import Counter

def compute_idf_for_author (author_index, documents):
    """Weight each word by how exclusive it is to one author.

    ``author_index`` is the 1-based position of the target author in
    ``documents`` (one list of tokenised articles per author). Every word gets
    a count of 1 if the target author uses it, plus 50 for each other author
    who uses it; the returned dict maps word -> max(0, log(50 / count)).
    Consequently a word used by any other author ends up with weight 0.
    """
    weights = Counter()
    for position, articles in enumerate(documents, start=1):
        # The target author adds 1, any other author adds 50.
        step = 1 if position == author_index else 50

        # Each author contributes at most once per word.
        already_counted = set()
        for article in articles:
            for token in set(article):
                if token not in already_counted:
                    already_counted.add(token)
                    weights[token] += step

    return {token: max(0, log(50 / total)) for token, total in weights.items()}
# One weight dict per author; list position k belongs to author number k + 1
# because compute_idf_for_author counts authors from 1.
idf_socres_author = [
    compute_idf_for_author(author_number, documents)
    for author_number in range(1, 51)
]

print(len(idf_socres_author))


50


### Compute TF for each word across all articles by the same author

In [6]:
from math import log
from collections import Counter

def compute_author_frequent_word_level (author_index, documents):
    """Thresholded term frequency (in percent) over all articles of one author.

    ``author_index`` is 1-based: the articles used are
    ``documents[author_index - 1]``, a list of tokenised articles. The result
    maps word -> max(0, (occurrences - 3) / total_tokens * 100), so a word
    occurring three times or fewer gets weight 0.
    """
    articles = documents[author_index - 1]

    # Raw occurrence counts, pooled over every article of this author.
    occurrences = Counter()
    for article in articles:
        occurrences.update(article)

    total_tokens = sum(len(article) for article in articles)

    return {
        token: max(0, (n - 3) / total_tokens * 100)     # 3 is a threshold
        for token, n in occurrences.items()
    }

# Example usage: one thresholded-TF dict per author (authors are numbered from 1).
a_feq = [
    compute_author_frequent_word_level(author_number, documents)
    for author_number in range(1, 51)
]

print(a_feq[0])


{'treasury': 0.07307962972987604, 'secretary': 0.05954636496508418, 'robert': 0.037893141341417205, 'rubin': 0.08931954744762627, 'goes': 0, 'to': 2.928598495100958, 'capitol': 0.013533264764791858, 'hill': 0.010826611811833486, 'on': 0.9256753099117631, 'tuesday': 0.05413305905916743, 'further': 0.027066529529583716, 'explain': 0, 'the': 7.1455637958101015, 'clinton': 0.15157256536566882, 'administrations': 0.05954636496508418, 'bank': 0.1813457478482109, 'reform': 0.094732853353543, 'plan': 0.28149190710767064, 'but': 0.5440372435446327, 'lawmakers': 0.04601310020029232, 'are': 0.40599794294375574, 'likely': 0.05954636496508418, 'focus': 0.013533264764791858, 'foremost': 0, 'what': 0.07849293563579278, 'left': 0.005413305905916743, 'out': 0.12179938288312672, 'proposal': 0.1245060358360851, 'along': 0.01623991771775023, 'with': 0.6035836085097169, 'several': 0.05954636496508418, 'pending': 0.0027066529529583714, 'bills': 0.07578628268283441, 'would': 0.5494505494505495, 'scrap': 0.00

### Compute TF-IDF and sort all words

In [7]:
# Per author: corpus-wide mask * author-exclusivity weight * thresholded TF,
# computed for every word that author uses.
word_list = [
    {
        word: idf_scores[word] * idf_socres_author[author_pos][word] * tf
        for word, tf in a_feq[author_pos].items()
    }
    for author_pos in range(50)
]

def sort_and_filter_dicts(word_list):
    """Keep strictly positive weights and order each dict by descending weight.

    ``word_list`` is a list of ``{word: weight}`` dicts. A new list of new
    dicts is returned; the inputs are not modified. Words with equal weight
    keep their original relative order.
    """
    ranked = []
    for weights in word_list:
        # Entries with nonpositive weight are discarded.
        positive = [(word, score) for word, score in weights.items() if score > 0]
        positive.sort(key=lambda pair: pair[1], reverse=True)
        ranked.append(dict(positive))
    return ranked
word_list = sort_and_filter_dicts(word_list)

# How many positively weighted words each author keeps.
tfidf_count = [len(weights) for weights in word_list]

# list.index returns the first position of the maximum, so ties go to the earliest author.
mx = max(tfidf_count)
author_selected = tfidf_count.index(mx)

print(author_selected)

20


In [16]:
# newline='' hands line-ending control to the csv module (without it, blank rows
# appear between records on Windows); utf-8 matches the encoding the input CSV
# was read with.
# NOTE(review): the file name hard-codes 20 while the author is taken from
# author_selected; confirm they are meant to stay in sync.
with open("./data/tfidf-words-20.csv", "w", newline="", encoding="utf-8") as fd:
    writer = csv.writer(fd)

    # One word per row, in the iteration order of the selected author's dict.
    for key in word_list[author_selected]:
        writer.writerow([key])

### Compute TF-IDF to find words to replace for each article

This part runs on the test (validation) dataset.

In [None]:
import csv
# Test split; each CSV row is read through its 'index', 'text' and 'Author' columns.
path = './data/mega_test.csv'

# Parallel lists: position k of every list comes from the same CSV row.
indexes, texts, authors = [], [], []
with open(path, newline='', encoding='utf-8') as csvfile:
    for record in csv.DictReader(csvfile):
        indexes.append(record['index'])
        texts.append(record['text'])
        authors.append(record['Author'])
import re
# Group consecutive rows by author: documents[k] holds the tokenised articles of
# the k-th run of identical 'Author' values. Later cells index documents[0..49],
# so exactly NUM_AUTHORS groups are always produced (trailing ones may be empty).
NUM_AUTHORS = 50
# Use the real row count: a fixed bound of 1000 raised IndexError on shorter
# files and silently ignored the tail of longer ones.
num_rows = len(authors)

documents = []
j = 0
for _ in range(NUM_AUTHORS):
    author = []
    if j < num_rows:
        cur_author = authors[j]

    while j < num_rows and authors[j] == cur_author:
        text = texts[j].lower()
        # Drop everything that is neither a word character nor whitespace,
        # then drop digits, then split on whitespace.
        text_no_punctuation_digits = re.sub(r'[^\w\s]', '', text)
        article = re.sub(r'\d+', '', text_no_punctuation_digits).split()
        author.append(article)
        j += 1
    documents.append(author)
print(len(documents))

#### Compute TF for an article

In [None]:
from math import log
from collections import Counter

def compute_tf_in_single_article (document):
    """Thresholded term frequency of every word in one tokenised article.

    ``document`` is a list of words. The result maps
    word -> max(0, (occurrences - 3) / len(document)), so a word occurring
    three times or fewer gets weight 0.
    """
    article_length = len(document)
    occurrences = Counter(document)

    return {
        token: max(0, (n - 3) / article_length)     # 3 is a threshold
        for token, n in occurrences.items()
    }

# Example usage: one TF dict per article, walking the authors in order and,
# within each author, the articles in order.
a_feq = [
    compute_tf_in_single_article(document)
    for au in range(50)
    for document in documents[au]
]

print(a_feq[0])

#### Compute IDF for a single article

In [None]:
from math import log
from collections import Counter

def compute_idf_in_single_article (article, documents):
    """Inverse document frequency of each word of ``article`` over ``documents``.

    Parameters:
        article: iterable of words; a word repeated in the article is scored once.
        documents: iterable of tokenised articles (each an iterable of hashable
            words). NOTE(review): the notebook-level ``documents`` list is
            grouped per author, so it presumably has to be flattened to a plain
            list of articles before being passed here - confirm against the caller.

    Returns:
        dict mapping word -> log(1000 / (df + 1)), where df is the number of
        entries of ``documents`` that contain the word. Words contained in no
        entry are left out of the result.
    """
    # Corpus size used by the formula; kept at the notebook's fixed value.
    all_count = 1000

    # Set membership is O(1); build the vocabularies once instead of scanning
    # every token list again for every word.
    vocabularies = [set(document) for document in documents]

    idf_dict = {}
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so a
    # word repeated in the article no longer multiplies its document frequency.
    for word in dict.fromkeys(article):
        count = sum(1 for vocabulary in vocabularies if word in vocabulary)
        if count:
            # `log` is the name imported from math; the previous `math.log`
            # raised NameError because the module itself was never imported.
            idf_dict[word] = log(all_count / (count + 1))     # +1 in the denominator kept from the original formula

    return idf_dict
