In [137]:
import os
import pandas as pd
import numpy as np
import math
from collections import OrderedDict

In [68]:
# Name of the CSV file holding the per-newspaper topic counts
count_file = "allpapers-topics.csv"

In [69]:
# Load the topic-count CSV into a pandas DataFrame.

df = pd.read_csv(count_file)

# The CSV has a header row: "Title", followed by one column per topic number (0-99).
# Each subsequent row holds a newspaper title followed by that paper's value for each topic.
# The values in each row sum to the number of articles from that paper in the corpus.
# The values in each column sum to that topic's share of the whole corpus.
# NOTE(review): later cells also read a 'Total' column that is not described here --
# confirm it is present in the CSV header.

In [123]:
# Helper: look up the total for a single paper.
# (The corpus-wide total_docs is computed in a later cell by summing the 'Total' column.)

def all_topics_by_paper(paper):
    """Return the integer 'Total' recorded for ``paper`` in the global ``df``.

    Parameters
    ----------
    paper : str
        Value to match against the 'Title' column.

    Returns
    -------
    int or None
        The 'Total' of the first row whose 'Title' equals ``paper``, truncated
        to ``int``; ``None`` when no row has that title.
    """
    # Boolean-mask lookup instead of walking every row with iterrows().
    totals = df.loc[df['Title'] == paper, 'Total']
    if totals.empty:
        # Unknown title: keep returning None so existing callers are unaffected.
        return None
    return int(totals.iloc[0])
        
# Quick check: look up the total for one known paper title
all_topics_by_paper('DouglassMonthly')

1717

In [124]:
def topic_in_all_papers(topic):
    """Return the sum of column ``topic`` of the global ``df`` over all rows.

    ``topic`` is the column label (for example ``'0'``). Adding up that column
    gives the total number of articles associated with the topic across every
    newspaper.
    """
    topic_column = df[topic]
    return topic_column.sum()

# Quick check: corpus-wide sum of the topic '0' column
topic_in_all_papers('0')

3321.018576845

In [130]:
# Total number of documents in the corpus: the sum of the 'Total' column, truncated to an int
total_docs = int(df['Total'].sum())

# salience functions 
def topic_freq(topic, paper):
    """Return the normalised frequency of ``topic`` within ``paper`` (the "TF" term).

    The paper's value in column ``topic`` is divided by the paper's total as
    reported by ``all_topics_by_paper``.

    Parameters
    ----------
    topic : str
        Column label of the topic in the global ``df`` (for example ``'0'``).
    paper : str
        Value to match against the 'Title' column.

    Returns
    -------
    float or None
        The topic's share for the first row whose 'Title' equals ``paper``;
        ``None`` when no row has that title.
    """
    # Vectorised lookup replaces the row-by-row iterrows() scan.
    topic_scores = df.loc[df['Title'] == paper, topic]
    if topic_scores.empty:
        # Unknown title: keep returning None so existing callers are unaffected.
        return None
    # Convert to a built-in float so the result is a plain Python number.
    return float(topic_scores.iloc[0]) / all_topics_by_paper(paper)

def inv_paper_freq(topic):
    """Return the inverse paper frequency ("IPF") of ``topic``.

    Measures how common a topic is among all newspaper articles in the corpus:
    the more common the topic, the lower its IPF. It is the natural log of the
    ratio of the total number of articles to the number of articles containing
    the topic, with 1 added to the divisor to prevent division by zero.
    """
    corpus_topic_total = topic_in_all_papers(topic)
    ratio = total_docs / (1 + corpus_topic_total)
    return math.log(ratio)

def tfipf(topic, paper):
    """Return the TF-IPF score: ``topic_freq(topic, paper)`` times ``inv_paper_freq(topic)``."""
    term_frequency = topic_freq(topic, paper)
    inverse_frequency = inv_paper_freq(topic)
    return term_frequency * inverse_frequency

In [131]:
# Smoke test: print the corpus size, then the TF, IPF and TF-IPF values
# for topic '0' in the 'DouglassMonthly' paper.
print(total_docs)

print(topic_freq('0', 'DouglassMonthly'))

print(inv_paper_freq('0'))

print(tfipf('0', 'DouglassMonthly'))

372276
0.007701160209668025
4.719062911416954
0.036342259520324396


In [149]:
# For each newspaper, calculate the TF-IPF score of every topic, then write the
# topics to tf-ipf.txt ranked from highest to lowest score.

# Take the topic columns from the frame itself instead of hard-coding a range:
# the previous range(0, 99) stopped at topic 98 and silently dropped topic 99.
topic_columns = [col for col in df.columns if str(col).isdigit()]

# The with-block closes the file even if a score computation raises.
with open("tf-ipf.txt", "w") as fout:
    for paper_title in df['Title']:
        # Score every topic for this paper.
        scores = {topic: tfipf(topic, paper_title) for topic in topic_columns}

        print("=== TF-IPF Scores for", paper_title, "===", file=fout)

        # Highest score first; every topic is written, not only the leading ones.
        for topic, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
            print(topic, round(score, 4), file=fout)

        # Blank line between papers.
        print(file=fout)