In [1]:
import os
from datetime import datetime
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()
import multiprocessing as mp
from multiprocessing import Pool
import re
import numpy as np
from tqdm import tqdm
import nltk
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None
from format_dataset import *

##### Process Dataset 

Naming convention: Country_NewsType_article-date-range_(today's date)

In [2]:
# Components of the output file name:
# Country_NewsType_article-date-range_(today's date).
country_name = "us"
news_type = 'covid'  # (reg or covid)
# news_type = "reg"
date = "april_2020-april_2021"

# The name is stamped with the date on which this cell is run.
run_date = datetime.today().date()
df_name = "_".join([country_name, news_type, date, f"({run_date})"]) + ".csv"
print("Dataframe Name: " + df_name)

Dataframe Name: us_covid_april_2020-april_2021_(2022-09-19).csv


In [3]:
# Identifier of the raw dataset passed to process_dataset.
dataset_name = 'us-no-newsstream-full-timeline'
# process_dataset is not defined in this notebook (presumably it comes from the
# star-import of format_dataset).
# NOTE(review): article_skip=1 presumably means "process every article" — confirm.
processed_lists = process_dataset(dataset_name, article_skip=1)

Cores Available: 4


In [4]:
# Column names applied, in order, to the fields of each processed record.
columns=['article_id','date','publisher','title', 'text',
         'language', 'page_num', 'source_type',
        'city', 'author', 'LexileScore']
# Build the working DataFrame from the output of process_dataset.
df = pd.DataFrame(processed_lists, columns=columns)

In [5]:
# Remember the unfiltered row count so the number of dropped rows can be reported.
before_remove_errors = len(df)

# Keep rows that were processed without error, are in English, are not
# multimedia items, and do not carry the excluded title.
keep_mask = (
    (df['text'] != 'Error in processing document')
    & (df['language'] == 'English')
    & (df['source_type'] != 'Multimedia')
    & (df['title'] != 'Cleveland COVID-19 Vaccine Locations')
)
# .copy() yields an independent frame, so the column assignment below does not
# write onto a slice of the unfiltered frame (the chained-assignment warning
# is silenced globally in the import cell and would not flag it).
df = df.loc[keep_mask].copy()
df['text'] = df['text'].astype('string')
print(f'{len(df)} Number of articles to analyze. \n {before_remove_errors - len(df)} articles lost.')

1504696 Number of articles to analyze. 
 94958 articles lost.


In [6]:
# Load the previously exported CNN CSV. The file name is a fixed literal, so a
# plain string is used (the original f-string had no placeholders).
cnn_df = pd.read_csv('cnn us_covid_april_2020-april_2021_(2022-09-18).csv')

In [7]:
# Stack the CNN rows under the existing rows; ignore_index=True renumbers the
# combined index from 0.
df = pd.concat([df,cnn_df], axis=0, ignore_index=True)

In [8]:
# Replace the index with a fresh 0..n-1 range in place, discarding the old one.
# NOTE(review): the preceding concat already used ignore_index=True, so this
# looks redundant — confirm before removing.
df.reset_index(drop=True, inplace=True)

##### Filter text/articles

Covid News Articles

In [None]:
# Project helper, not defined in this notebook.
# NOTE(review): the exact cleaning steps are not visible here.
df = base_text_formatting(df)

# Whitespace-separated word count per article. .str.len() tolerates missing
# text (it yields NaN, which fails both comparisons below and is filtered out),
# whereas .apply(len) would raise TypeError on the first missing value.
df['text_len'] = df['text'].str.split().str.len()
# .copy() so the columns added below are written to an independent frame
# rather than to a slice of the unfiltered one.
df = df[(df['text_len'] > 325) & (df['text_len'] < 4000)].copy()
# Every surviving row has a real count, so the column can be integer again.
df['text_len'] = df['text_len'].astype(int)

# NOTE: nltk.data.load unpickles this local file; only load trusted pickles.
sent_tokenizer = nltk.data.load('nltk_tokenizer/punkt/english.pickle')
with mp.Pool(3) as pool:
    # pool.map preserves input order, so the result lines up row-for-row.
    df['sentences'] = pool.map(sent_tokenizer.tokenize, df['text'])

# Computed once: printed now and reused later to report how many articles
# are lost for being too short.
articles_before = df.article_id.nunique()
print(f'Number of unique articles: {articles_before}')

def sentences_keep(sentences, head=15, tail=3, min_sentences=5):
    """Trim an article's sentence list to its opening and closing sentences.

    Parameters
    ----------
    sentences : list of str
        Sentences of one article, in order.
    head : int, default 15
        Number of leading sentences to keep for long articles.
    tail : int, default 3
        Number of trailing sentences to keep for long articles, counted
        *before* the very last sentence (the last sentence is skipped since
        it is usually an advertisement).
    min_sentences : int, default 5
        Articles with fewer sentences than this are rejected.

    Returns
    -------
    list of str or float
        * ``head + tail`` sentences when the article has at least
          ``head + tail + 1`` sentences (18 of >= 19 with the defaults);
        * the unchanged list when it has between ``min_sentences`` and
          ``head + tail`` sentences;
        * ``np.nan`` when the article is shorter than ``min_sentences`` or
          ``sentences`` is not a sliceable sequence (e.g. NaN / None).
    """
    try:
        n_sentences = len(sentences)
        if n_sentences >= head + tail + 1:
            # Opening sentences plus the closing ones, excluding the final
            # sentence of the article.
            return sentences[:head] + sentences[-(tail + 1):-1]
        if n_sentences >= min_sentences:
            return sentences
        return np.nan
    except TypeError:
        # Missing values (float NaN, None) have no len(); treat them as
        # "too short" instead of swallowing every possible exception.
        return np.nan

# Trim every article with sentences_keep: long articles are reduced to 18
# sentences, and articles that come back as NaN (too short) are dropped.
df['sentences'] = df['sentences'].apply(sentences_keep)
df = df.dropna(axis=0, subset=['sentences'])
articles_lost = articles_before - df.article_id.nunique()
print(f'Articles Lost from being too short: {articles_lost}')
# Join the kept sentences back into a single string per article.
df['filtered_text'] = df['sentences'].str.join(' ')
# Then count the keywords in the text and filter on them (project helpers,
# not defined in this notebook).
df = experimental_count_keywords(df)
df = experimental_covid_article_filtering(df)
print(f'Number of unique articles: {df.article_id.nunique()}')

In [None]:
# num=310
# print(df.iloc[num])
# print(df.iloc[num].text)

## For Regular News Articles

In [None]:
# df = regular_news_formatting(df, num_articles_to_sample=150000)

## For Headline Formatting

In [None]:
# df = headline_formatting(df)
# df.to_csv(f'csv/headline_{df_name}')

In [None]:
# Share of articles per source_type. The per-group count of article_id is
# computed once (and only for that column) and then normalised by its total.
source_type_counts = df.groupby('source_type')['article_id'].count()
source_type_counts / source_type_counts.sum()

In [None]:
# Boilerplate phrases, matched case-insensitively anywhere inside a sentence.
# They are combined into one pre-compiled alternation so that each sentence is
# scanned once.
_BOILERPLATE_PATTERN = re.compile(
    '|'.join(re.escape(phrase) for phrase in (
        'Newstex Authoritative Content is not',
        'The material and information provided in Newstex',
        'Sign up for our',
        'Neither newstex nor its re-distributors',
        'Please wait for the page to reload',
    )),
    re.I,
)


def remove_non_relevant_content(sentences, min_index=9):
    """Cut an article's sentence list at the first boilerplate sentence.

    Parameters
    ----------
    sentences : list of str
        Sentences of one article, in order.
    min_index : int, default 9
        Boilerplate is only acted upon at this 0-based position or later;
        matches among the earlier sentences are ignored.

    Returns
    -------
    list of str
        ``sentences[:i]`` where ``i`` is the position of the first sentence
        at or after ``min_index`` that contains one of the boilerplate
        phrases; the original list (same object) when there is none.
    """
    for sentence_num, sentence in enumerate(sentences):
        if sentence_num >= min_index and _BOILERPLATE_PATTERN.search(sentence):
            # Everything from the boilerplate sentence onwards is discarded.
            return sentences[:sentence_num]
    return sentences
    
# Cut each article's sentence list at its first boilerplate sentence.
# NOTE(review): 'filtered_text' was joined from 'sentences' in an earlier cell
# and is not refreshed after this truncation — confirm that is intended.
df['sentences'] = df['sentences'].apply(remove_non_relevant_content) 

This block contains past attempts at reformatting, to be tried again once the database is fixed.

In [None]:
def keep_pairs(lst, group_size=3, drop_remainder=True):
    """Join consecutive sentences into groups (of three by default).

    Despite the name, the default grouping is triples, not pairs. If the
    article is full length (18 sentences), this leads to 6 groups, i.e. 6
    predictions per article. Grouping makes predictions faster and more
    accurate; however, groups larger than 3 will usually go above Roberta's
    character limit.

    Parameters
    ----------
    lst : list of str
        Sentences of one article, in order.
    group_size : int, default 3
        Number of consecutive sentences joined (with a single space) into
        each group.
    drop_remainder : bool, default True
        When True, trailing sentences that do not fill a complete group are
        discarded, so a list shorter than ``group_size`` yields ``[]``.
        When False, the leftover sentences form a final, shorter group.

    Returns
    -------
    list of str
        One space-joined string per group.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError('group_size must be a positive integer')
    total = len(lst)
    # Index just past the last sentence that belongs to a group.
    stop = total - total % group_size if drop_remainder else total
    return [' '.join(lst[start:start + group_size])
            for start in range(0, stop, group_size)]
# Store, per article, the list of grouped-sentence strings in the 'pairs' column.
df['pairs'] = df.sentences.apply(keep_pairs)

In [None]:
# df = df[(df['keyword_len'] >= 3) & (df['keyword_len'] < 60)]

# from transformers import RobertaTokenizer
# tokenizer = RobertaTokenizer.from_pretrained("/home/ec2-user/SageMaker/pre_trained_tokenizer")

# def check_len(lst_sent):
#     token_lst_len = [len(tokenizer.encode(sent, add_special_tokens=True, truncation=True)) for sent in lst_sent]
#     sent_cum_len = np.cumsum(token_lst_len)
#     idx_lst = np.where(sent_cum_len >= 125)[0]
#     if idx_lst.size > 0:
#         idx = idx_lst[0]
#         return lst_sent[:idx]
#     return lst_sent

# df['sentences'] = df.sentences.progress_apply(lambda sent: check_len(sent))
# df.sentences = df.sentences.apply(lambda x: " ".join(x))

For non-Covid news articles, take a sample of them.

In [None]:
# Display the current column names (inspection only; nothing is modified).
df.columns

In [None]:
# Drop the raw/intermediate text columns; only 'pairs' carries text from here on.
# Reassigning (instead of inplace=True) leaves any earlier reference untouched.
df = df.drop(columns=['text', 'language', 'sentences', 'filtered_text'])
pre_explode = df.drop(columns='pairs')  # we only use pre_explode as an article_id reference.
print(df_name)
# to_csv does not create missing directories, so make sure 'csv/' exists.
os.makedirs('csv', exist_ok=True)
pre_explode.to_csv('csv/pre_explode_' + df_name)

In [None]:
# One row per sentence group; the other columns (and the index label) are
# repeated for every group of the same article.
df = df.explode('pairs') #This keeps it in the format required for data loader.
print(f'Dataframe {df_name} goes from {df.date.min()} to {df.date.max()}.')
print(f'Dataframe {df_name} has {df.article_id.nunique()} unique articles.')
# Assign the filled column back explicitly: `df.page_num.fillna(..., inplace=True)`
# is a chained in-place call on a temporary Series, which warns on recent
# pandas and does not update df once Copy-on-Write is enabled.
df['page_num'] = df['page_num'].fillna('None')

# df.reset_index(inplace=True,drop=True)

In [None]:
# Write the exploded frame to csv/no_txt_<df_name>. The index is written too,
# because index=False is not passed.
df.to_csv('csv/no_txt_' + df_name)
# Echo the file-name stem so the export can be located.
print(df_name)

### Finished

In [None]:
# df.sample(100).to_csv('email_test/tdm_samples.csv')