In [1]:
import os
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()
import multiprocessing as mp
from multiprocessing import Pool
import re
import numpy as np
from tqdm import tqdm
import nltk
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

from format_dataset import process_dataset, regular_news_formatting, covid_format_text, headline_formatting

##### Process Dataset

In [11]:
# Dataset identifier handed to process_dataset (imported from format_dataset).
dataset_name = 'g-canada-apr2020-apr2021'
processed_lists = process_dataset(dataset_name)

# Suffix appended to the CSV paths written later in this notebook.
# NOTE(review): this name says apr2021-apr2022 while dataset_name says apr2020-apr2021 — confirm which range is intended.
df_name = 'can-apr2021-apr2022.csv'

FileNotFoundError: [Errno 2] No such file or directory: '/home/ec2-user/SageMaker/data/g-canada-apr2020-apr2021/'

In [16]:
# Build the article table from the processed records, one column per field.
article_columns = ['article_id', 'date', 'publisher', 'title', 'text', 'language', 'page_num', 'source_type']
df = pd.DataFrame(processed_lists, columns=article_columns)
print(len(df))

# Discard records whose text is the processing-failure placeholder.
processed_ok = df['text'] != 'Error in processing document'
df = df[processed_ok]
print(f'Number of articles to analyze: {len(df)}')

# Keep English, non-multimedia articles and drop one specific title.
is_english = df['language'] == 'English'
not_multimedia = df['source_type'] != 'Multimedia'
not_excluded_title = df['title'] != 'Cleveland COVID-19 Vaccine Locations'
df = df[is_english & not_multimedia & not_excluded_title]
df['text'] = df['text'].astype('string')

659028
Number of articles to analyze: 658196


##### filter text/articles

In [None]:
# df[df.text_len < 500].keyword_len.mean()
# np.sum(df.text_len>1000) / len(df.text_len)
# df.text_len.mean()
# df.groupby('source_type').text_len.mean()
# df.groupby('source_type').count()['article_id'] / df.groupby('source_type').count()['article_id'].sum()

Covid News Articles

In [None]:
# Run the COVID formatting step imported from format_dataset; its result replaces df.
df = covid_format_text(df)

For Regular News Articles

In [None]:
# df = regular_news_formatting(df, num_articles_to_sample=150000)

For Headline Formatting

In [None]:
# df = headline_formatting(df)
# df.to_csv(f'csv/headline_{df_name}')

In [28]:
# Fraction of articles contributed by each source type (group once, then normalise).
articles_per_source = df.groupby('source_type')['article_id'].count()
articles_per_source / articles_per_source.sum()

source_type
Newspapers       0.503319
Web Resources    0.496681
Name: article_id, dtype: float64

In [30]:
%%time
# Load the pickled English sentence tokenizer from the local nltk_tokenizer directory.
sent_tokenizer = nltk.data.load('nltk_tokenizer/punkt/english.pickle')
# Tokenize every article text using a pool of 3 worker processes and store
# the mapped results as the 'sentences' column.
with mp.Pool(3) as pool:
    df['sentences'] = pool.map(sent_tokenizer.tokenize, df['text'])

CPU times: user 7.58 s, sys: 3.37 s, total: 11 s
Wall time: 1min 32s


In [31]:
# Unique-article count before the sentence-count filter below is applied.
print(f'Number of unique articles: {df.article_id.nunique()}')
def sentences_keep(sentences, min_sentences=4, max_sentences=15):
    """Trim an article's sentence list, or flag the article for removal.

    Args:
        sentences: Sequence of sentence strings for one article.
        min_sentences: Smallest number of sentences an article must have to
            be kept.
        max_sentences: Largest number of leading sentences to keep.

    Returns:
        The first ``max_sentences`` sentences when the article is longer than
        that, the input unchanged when its length is within bounds, and
        ``np.nan`` when it has fewer than ``min_sentences`` sentences or is
        not a sized, sliceable sequence (e.g. a missing value).
    """
    try:
        count = len(sentences)
        if count < min_sentences:
            return np.nan
        if count > max_sentences:
            return sentences[:max_sentences]
        return sentences
    except TypeError:
        # Missing values (None, NaN) have no len(); flag them for removal
        # without also swallowing unrelated errors such as KeyboardInterrupt.
        return np.nan

# Trim or flag each article's sentences, drop the flagged (NaN) rows in place,
# then report how many unique articles remain.
df['sentences'] = df['sentences'].apply(sentences_keep)
df.dropna(subset=['sentences'], axis=0, inplace=True)
print(f'Number of unique articles: {df.article_id.nunique()}')

Number of unique articles: 164510
Number of unique articles: 164481


In [None]:
# Phrases whose appearance marks the start of trailing boilerplate rather than
# article text. Compiled once into a single case-insensitive alternation.
_BOILERPLATE_PATTERN = re.compile(
    '|'.join(
        re.escape(phrase)
        for phrase in (
            'Newstex Authoritative Content is not',
            'The material and information provided in Newstex',
            'Sign up for our',
            'Neither newstex nor its re-distributors',
            'Please wait for the page to reload',
        )
    ),
    re.IGNORECASE,
)


def email_link(sentences, min_index=9):
    """Cut an article's sentence list at the first trailing boilerplate sentence.

    Only sentences at position ``min_index`` or later are inspected, so a
    boilerplate phrase appearing earlier in the article never truncates it.

    Args:
        sentences: List of sentence strings for one article.
        min_index: First (0-based) position at which a boilerplate match is
            allowed to truncate the article.

    Returns:
        ``sentences[:k]`` where ``k`` is the position of the first matching
        sentence at or after ``min_index``; the original list when nothing
        matches.
    """
    for position in range(max(min_index, 0), len(sentences)):
        if _BOILERPLATE_PATTERN.search(sentences[position]):
            return sentences[:position]
    return sentences
    
# Truncate each article's sentence list at its trailing boilerplate, if any.
df['sentences'] = df['sentences'].apply(email_link)

In [32]:
# Number of distinct article ids currently in df.
df.article_id.nunique()

164481

This block contains past attempts at reformatting, to be tried again once the database is fixed.

In [33]:
def keep_pairs(lst, group_size=3):
    """Join consecutive sentences into fixed-size chunks for prediction.

    Grouping sentences keeps each chunk short while avoiding predictions on
    tiny single sentences. Despite the function name, the default joins three
    sentences per chunk (not two). Sentences left over after the last full
    group are dropped.

    Args:
        lst: Sentence strings for one article, in reading order.
        group_size: Number of consecutive sentences joined (with a single
            space) into each chunk.

    Returns:
        List of chunk strings; empty when ``lst`` holds fewer than
        ``group_size`` sentences.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError('group_size must be a positive integer')
    # Last index at which a complete group can still start.
    last_start = len(lst) - group_size
    return [
        ' '.join(lst[start:start + group_size])
        for start in range(0, last_start + 1, group_size)
    ]
# Store the joined sentence chunks produced by keep_pairs in the 'pairs' column.
df['pairs'] = df.sentences.apply(keep_pairs)

In [None]:
# df = df[(df['keyword_len'] >= 3) & (df['keyword_len'] < 60)]

# from transformers import RobertaTokenizer
# tokenizer = RobertaTokenizer.from_pretrained("/home/ec2-user/SageMaker/pre_trained_tokenizer")

# def check_len(lst_sent):
#     token_lst_len = [len(tokenizer.encode(sent, add_special_tokens=True, truncation=True)) for sent in lst_sent]
#     sent_cum_len = np.cumsum(token_lst_len)
#     idx_lst = np.where(sent_cum_len >= 125)[0]
#     if idx_lst.size > 0:
#         idx = idx_lst[0]
#         return lst_sent[:idx]
#     return lst_sent

# df['sentences'] = df.sentences.progress_apply(lambda sent: check_len(sent))
# df.sentences = df.sentences.apply(lambda x: " ".join(x))

For non-COVID news articles, sample from them.

In [34]:
# Keep only the columns needed from here on.
output_columns = ['article_id', 'date', 'publisher', 'title', 'page_num', 'pairs']
df = df[output_columns]

# Save the metadata columns (everything except the sentence chunks) before exploding.
pre_explode_path = 'csv/pre_explode_' + df_name
pre_explode = df.drop(columns='pairs')
print(df_name)
pre_explode.to_csv(pre_explode_path)

can-apr2021-apr2022.csv


Make sure to concatenate every file other than the first one.

In [35]:
# np.save(file='duplicates.npy', arr=df.article_id.unique())
# duplicates = np.load(file='duplicates.npy', allow_pickle=True)
# updated_duplicate = np.concatenate((duplicates, df.article_id))
# np.save(file='duplicates.npy', arr=updated_duplicate)
# check = np.load(file='duplicates.npy', allow_pickle=True)
# df.article_id.isin(check).sum()
# df = df[~df.article_id.isin(check)]

In [36]:
# Expand each list in 'pairs' into one row per chunk.
df = df.explode('pairs')  # This keeps it in the format required for data loader.
print(f'Dataframe {df_name} goes from {df.date.min()} to {df.date.max()}.')
print(f'Dataframe {df_name} has {df.article_id.nunique()} unique articles.')
# Assign the filled column back explicitly: calling fillna(inplace=True) on the
# `df.page_num` accessor is a chained in-place update, which pandas does not
# guarantee to propagate to df (it is a no-op under Copy-on-Write).
df['page_num'] = df['page_num'].fillna('None')
# explode repeats the original index for every chunk; replace it with a clean 0..n-1 range.
df = df.reset_index(drop=True)

Dataframe can-apr2021-apr2022.csv goes from 2020-04-01 to 2021-03-31.
Dataframe can-apr2021-apr2022.csv has 164481 unique articles.


In [37]:
# Write the exploded frame to csv/no_txt_<df_name>, then echo df_name as the cell output.
df.to_csv('csv/no_txt_' + df_name)
df_name

'can-apr2021-apr2022.csv'

### Finished

In [None]:
# df.sample(100).to_csv('email_test/tdm_samples.csv')