# Explore harvested text files

In [None]:
import os
import pandas as pd
import fileinput
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from operator import itemgetter
import nltk
# Download the NLTK resources used below: the stopword lists, and 'punkt'
# (presumably needed by TextBlob's tokeniser -- TODO confirm).
nltk.download('stopwords')
nltk.download('punkt')
# English stopwords, used to filter the word frequency tables.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def get_latest_harvest():
    '''
    Return the name of the most recent harvest.

    Each harvest is a sub-directory of 'data'; the name that sorts last
    is returned. Plain files inside 'data' are ignored.
    '''
    harvest_dirs = []
    for entry in os.listdir('data'):
        if os.path.isdir(os.path.join('data', entry)):
            harvest_dirs.append(entry)
    harvest_dirs.sort()
    return harvest_dirs[-1]

def open_harvest_data(timestamp=None):
    '''
    Load the results.csv file of a harvest (most recent by default).

    The name of the harvest being opened is printed, and the 'date'
    column is parsed as dates.

    Returns a DataFrame.
    '''
    harvest = timestamp if timestamp else get_latest_harvest()
    print(harvest)
    results_path = os.path.join('data', harvest, 'results.csv')
    return pd.read_csv(results_path, parse_dates=['date'])

## Aggregate the text files

In [None]:
def aggregate_texts(timestamp=None):
    '''
    Aggregate all individual article texts creating one big file.

    Every '.txt' file in 'data/<timestamp>/text' is appended, in sorted
    file name order, to 'data/<timestamp>/all-texts.txt'. Any existing
    aggregate file is overwritten. If there are no text files an empty
    aggregate file is written.

    Parameters:
        timestamp: name of the harvest directory (most recent by default).
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    output_file = os.path.join('data', timestamp, 'all-texts.txt')
    data_dir = os.path.join('data', timestamp, 'text')
    # os.listdir() returns entries in arbitrary order; sort so that the
    # aggregate file is the same every time it is rebuilt.
    files = [os.path.join(data_dir, file) for file in sorted(os.listdir(data_dir)) if file.endswith('.txt')]
    with open(output_file, 'w') as fout:
        # Open each file explicitly instead of using fileinput.input(),
        # which reads from stdin when it is given an empty list of files.
        for path in files:
            with open(path, 'r') as fin:
                fout.writelines(fin)
    
def aggregate_years(timestamp=None):
    '''
    Aggregate individual article text by year, creating one file per year.

    The years are taken from the 'date' column of the harvest results.
    For each year, the '.txt' files in 'data/<timestamp>/text' whose names
    start with that year are appended, in sorted file name order, to
    'data/<timestamp>/years/<year>.txt'.

    Parameters:
        timestamp: name of the harvest directory (most recent by default).
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    output_dir = os.path.join('data', timestamp, 'years')
    os.makedirs(output_dir, exist_ok=True)
    data_dir = os.path.join('data', timestamp, 'text')
    df = open_harvest_data(timestamp=timestamp)
    # Drop missing dates and cast to int: a single missing date turns the
    # year column into floats, which would be formatted as '1923.0' and so
    # match no file names.
    years = sorted(df['date'].dt.year.dropna().astype(int).unique())
    # List the directory once rather than once per year.
    text_files = sorted(file for file in os.listdir(data_dir) if file.endswith('.txt'))
    for year in years:
        output_file = os.path.join(output_dir, '{}.txt'.format(year))
        prefix = str(year)
        files = [os.path.join(data_dir, file) for file in text_files if file[:4] == prefix]
        with open(output_file, 'w') as fout:
            # Open each file explicitly instead of using fileinput.input(),
            # which reads from stdin when it is given an empty list of files.
            for path in files:
                with open(path, 'r') as fin:
                    fout.writelines(fin)
                
                
def aggregate_newspapers(timestamp=None):
    '''
    Aggregate individual article text by newspaper, creating one file per newspaper.

    The newspapers are taken from the 'newspaper_id' column of the harvest
    results. For each id, the '.txt' files in 'data/<timestamp>/text' whose
    names contain '-<id>-' are appended, in sorted file name order, to
    'data/<timestamp>/newspapers/<id>.txt'.

    Parameters:
        timestamp: name of the harvest directory (most recent by default).
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    output_dir = os.path.join('data', timestamp, 'newspapers')
    os.makedirs(output_dir, exist_ok=True)
    data_dir = os.path.join('data', timestamp, 'text')
    df = open_harvest_data(timestamp=timestamp)
    # Ignore rows without a newspaper id so that no 'nan.txt' file is created.
    newspapers = list(df['newspaper_id'].dropna().unique())
    # List the directory once rather than once per newspaper.
    text_files = sorted(file for file in os.listdir(data_dir) if file.endswith('.txt'))
    for newspaper in newspapers:
        output_file = os.path.join(output_dir, '{}.txt'.format(newspaper))
        marker = '-{}-'.format(newspaper)
        files = [os.path.join(data_dir, file) for file in text_files if marker in file]
        with open(output_file, 'w') as fout:
            # Open each file explicitly instead of using fileinput.input(),
            # which reads from stdin when it is given an empty list of files.
            for path in files:
                with open(path, 'r') as fin:
                    fout.writelines(fin)

## Display word frequencies

In [None]:
def show_word_frequencies(text_file):
    '''
    Display the 25 most frequent words in a text file, ignoring stopwords.

    Parameters:
        text_file: path of the text file to analyse.

    Returns a styled DataFrame: column 0 holds the word and column 1 its
    count, formatted with thousands separators and drawn as a bar.
    '''
    with open(text_file, 'r') as text:
        blob = TextBlob(text.read())
    # Build a set once: every distinct word in the text is tested against
    # the stopwords, and membership tests on a list are linear.
    stopword_set = set(stopwords)
    word_counts = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopword_set]
    word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
    return pd.DataFrame(word_counts).style.format({1: '{:,}'}).bar(subset=[1], color='#d65f5f').set_properties(subset=[1], **{'width': '300px'})

def word_frequency_all(timestamp=None):
    '''
    Show the most frequent words in the aggregated 'all-texts.txt' file
    of a harvest (most recent by default).
    '''
    harvest = timestamp or get_latest_harvest()
    text_file = os.path.join('data', harvest, 'all-texts.txt')
    return show_word_frequencies(text_file)

def word_frequency_by_year(year, timestamp=None):
    '''
    Show the most frequent words in the aggregated text file for one year
    of a harvest (most recent by default).
    '''
    harvest = timestamp or get_latest_harvest()
    file_name = '{}.txt'.format(year)
    text_file = os.path.join('data', harvest, 'years', file_name)
    return show_word_frequencies(text_file)

def word_frequency_by_newspaper(newspaper_id, timestamp=None):
    '''
    Show the most frequent words in the aggregated text file for one
    newspaper of a harvest (most recent by default).

    Parameters:
        newspaper_id: id of the newspaper, as used in the file names in
            the harvest's 'newspapers' directory.
        timestamp: name of the harvest directory (most recent by default).
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp, 'newspapers')
    # Build the file name from newspaper_id (the original used the
    # undefined name 'year' here).
    return show_word_frequencies(os.path.join(data_dir, '{}.txt'.format(newspaper_id)))

In [None]:
# Build the all-texts.txt file for the most recent harvest.
aggregate_texts()

In [None]:
# Show the most frequent words in the most recent harvest's all-texts.txt.
word_frequency_all()

In [None]:
def calculate_tfidf(names, files, ngram_size=1):
    '''
    Print the 20 highest scoring TF-IDF phrases for each file.

    Parameters:
        names: labels to print as headings, one per entry in files and in
            the same order.
        files: paths of the text files to compare with each other.
        ngram_size: number of words in each phrase (1 for single words).
    '''
    # min_df=1 keeps every phrase, exactly as min_df=0 did, but is also
    # accepted by scikit-learn releases that reject an integer min_df below 1.
    tf = TfidfVectorizer(input='filename', analyzer='word', ngram_range=(ngram_size, ngram_size), min_df=1, smooth_idf=False, sublinear_tf=True)
    tfidf_matrix = tf.fit_transform(files).tocsr()
    # These are the actual phrases. get_feature_names() was removed in
    # scikit-learn 1.2, so prefer its replacement when it is available.
    if hasattr(tf, 'get_feature_names_out'):
        feature_names = tf.get_feature_names_out()
    else:
        feature_names = tf.get_feature_names()
    # Walk the sparse rows directly rather than converting the whole
    # documents x vocabulary matrix to a dense one.
    indptr = tfidf_matrix.indptr
    for index in range(tfidf_matrix.shape[0]):
        name = names[index]
        print('\n\n{}\n'.format(name.upper()))
        start, end = indptr[index], indptr[index + 1]
        word_ids = tfidf_matrix.indices[start:end].tolist()
        values = tfidf_matrix.data[start:end].tolist()
        # If the score is not 0 save it with an index (which will let us get the feature_name)
        scores = [(word_id, score) for word_id, score in zip(word_ids, values) if score > 0]
        # Highest score first; ties are ordered by phrase index.
        scores.sort(key=lambda pair: (-pair[1], pair[0]))
        # Print the top 20 results for each file
        for word_id, score in scores[:20]:
            print('{0: <40} {1}'.format(feature_names[word_id], score))

def calculate_tfidf_by_year(timestamp=None, ngram_size=1):
    '''
    Print the top TF-IDF phrases for each year of a harvest.

    The per-year text files are created first if the harvest's 'years'
    directory does not exist yet.

    Parameters:
        timestamp: name of the harvest directory (most recent by default).
        ngram_size: number of words in each phrase (1 for single words).
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp, 'years')
    if not os.path.exists(data_dir):
        aggregate_years(timestamp)
    # List the directory once, sorted, so that names and files are
    # guaranteed to line up and the results are printed in a stable order.
    text_files = sorted(file for file in os.listdir(data_dir) if file.endswith('.txt'))
    # The file names without their extension are the years.
    names = [file[:-4] for file in text_files]
    # Get a list of files to feed to scikit-learn.
    files = [os.path.join(data_dir, file) for file in text_files]
    calculate_tfidf(names, files, ngram_size)
    
    
def calculate_tfidf_by_newspaper(timestamp=None, ngram_size=1):
    '''
    Print the top TF-IDF phrases for each newspaper of a harvest.

    The per-newspaper text files are created first if the harvest's
    'newspapers' directory does not exist yet. Each file is labelled with
    the newspaper title found in the harvest results.

    Parameters:
        timestamp: name of the harvest directory (most recent by default).
        ngram_size: number of words in each phrase (1 for single words).
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp, 'newspapers')
    if not os.path.exists(data_dir):
        aggregate_newspapers(timestamp)
    df = open_harvest_data(timestamp=timestamp)
    # Keep one title per newspaper id, so that a lookup always yields a
    # single string even if an id appears with more than one title.
    titles = df.drop_duplicates(subset='newspaper_id').set_index('newspaper_id')['newspaper_title']
    # List the directory once, sorted, so that names and files are
    # guaranteed to line up and the results are printed in a stable order.
    text_files = sorted(file for file in os.listdir(data_dir) if file.endswith('.txt'))
    # The file names without their extension are the newspaper ids.
    names = [titles.loc[int(file[:-4])] for file in text_files]
    # Get a list of files to feed to scikit-learn.
    files = [os.path.join(data_dir, file) for file in text_files]
    calculate_tfidf(names, files, ngram_size)

In [None]:
# Print the top single-word TF-IDF results for each newspaper in the most recent harvest.
calculate_tfidf_by_newspaper(ngram_size=1)