In [1]:
%load_ext autoreload
%autoreload 2
import os
import re
import pandas as pd

In [2]:
def concatenate_dataframes(pull_type, directory, source='_rt_'):
    """
    Concatenate every matching csv file in `directory` into one DataFrame.

    A file is used when its name ends with ".csv" and contains `source`.
    Before stacking, four columns are written into the rows of each file:

    * ``source``        -- first run of lower-case letters in the file name
                           ('foxnews' is shortened to 'fox')
    * ``source_detail`` -- that lower-case word plus the letters, digits and
                           underscores following it, e.g. 'rt_main_page'
    * ``pull_type``     -- the `pull_type` argument, unchanged
    * ``pull_date``     -- the leading ``YYYY_M_D`` of the file name, parsed
                           with the format ``%Y_%m_%d``

    Duplicate records are NOT removed here; callers de-duplicate afterwards.

    Parameters
    ----------
    pull_type : object
        Value stored in the ``pull_type`` column of every row.
    directory : str
        Directory that is listed for csv files (with or without a trailing
        path separator).
    source : str, default '_rt_'
        Substring a file name must contain to be included.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked with a fresh 0..n-1 index, or an empty
        DataFrame when no file matches.

    Raises
    ------
    ValueError
        If a matching file name does not start with a ``YYYY_M_D`` date.
    """
    # Year_month_day at the very start of the file name:
    pattern_date = re.compile(r'^\d{4}_\d{1,2}_\d{1,2}')

    # Short source name:
    pattern_source = re.compile(r'[a-z]+')

    # Starts with lower-case word and followed by letters/numbers/underscores (long source name):
    pattern_source_detail = re.compile(r'[a-z]+\w+')

    # Frames are collected in a list and concatenated once at the end;
    # growing a DataFrame inside the loop copies all accumulated rows each time.
    frames = []

    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore any later keep-first de-duplication) reproducible.
    for filename in sorted(os.listdir(directory)):
        # Skip non-csv's and files of other sources. The `source` argument is
        # never reassigned, so this filter is identical for every file.
        if not (filename.endswith(".csv") and source in filename):
            continue

        # Parse out date:
        date_match = pattern_date.search(filename)
        if date_match is None:
            raise ValueError(
                "File name does not start with a YYYY_M_D date: %r" % filename
            )
        pull_date = pd.to_datetime(date_match.group(), format='%Y_%m_%d')

        # Parse out short source name, e.g. 'rt'. A name ending in ".csv"
        # always contains a lower-case run, so these searches cannot fail.
        short_source = pattern_source.search(filename).group()
        if short_source == 'foxnews':
            short_source = 'fox'

        # And long source name, e.g. 'rt_main_page':
        source_detail = pattern_source_detail.search(filename).group()

        current_df = pd.read_csv(os.path.join(directory, filename))

        # Push pull_date and source name fields into the dataframe:
        current_df['source'] = short_source
        current_df['source_detail'] = source_detail
        current_df['pull_type'] = pull_type
        current_df['pull_date'] = pull_date

        frames.append(current_df)

    # pd.concat raises on an empty list, so keep the "nothing matched" result
    # an empty DataFrame.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

#### Work on BBC first:

In [15]:
%%time
# Primary and supplementary directories holding the custom RSS pulls:
directory, directory_extra = 'data/custom_data_pulls/', 'data_extra/custom_data_pulls/'

# Stack every BBC csv found in the primary directory:
df_bbc = concatenate_dataframes(
    pull_type='custom',
    directory=directory,
    source='_bbc_',
)

CPU times: user 12.1 s, sys: 2.27 s, total: 14.3 s
Wall time: 15.1 s


In [18]:
# Same stacking for the BBC csv's in the supplementary directory:
df_bbc_extra = concatenate_dataframes(
    pull_type='custom',
    directory=directory_extra,
    source='_bbc_',
)

In [37]:
# Stack the primary and supplementary BBC pulls under a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
df_bbc_clean = pd.concat([df_bbc, df_bbc_extra], ignore_index=True)

In [68]:
# Sanity check: the number of urls containing 'bbc' must equal the row count,
# i.e. there are no non-BBC links:
df_bbc_clean['url'].str.contains('bbc').sum() == len(df_bbc_clean)

True

In [46]:
# Dupes WITHIN the same RSS feed: keep the first row per (paper_section_name, url):
df_bbc_clean_nodupes = df_bbc_clean.drop_duplicates(
    subset=['paper_section_name', 'url'],
    keep='first',
)
# Dupes BETWEEN RSS feeds: keep the first row per url, whatever the feed:
df_bbc_clean_nodupes_atall = df_bbc_clean_nodupes.drop_duplicates(
    subset='url',
    keep='first',
)

In [47]:
# Rows/columns left after within-feed and after across-feed de-duplication:
print(
    df_bbc_clean_nodupes.shape,
    df_bbc_clean_nodupes_atall.shape,
    sep='\n',
)

(5586, 27)
(3635, 27)


Save and test loading back:

In [49]:
# Output files for the within-feed and the across-feed de-duplicated BBC frames:
processed_dir_file_1, processed_dir_file_2 = (
    'data/processed/2019_03_27_BBC_NoDupesWithin.csv',
    'data/processed/2019_03_27_BBC_NoDupesAtAll.csv',
)

In [50]:
# Write both BBC frames to disk, leaving out the index column:
df_bbc_clean_nodupes.to_csv(processed_dir_file_1, index=False)
df_bbc_clean_nodupes_atall.to_csv(processed_dir_file_2, index=False)

In [109]:
# Read the saved BBC files back so their shapes can be compared with the originals:
df_bbc_clean_nodupes_csv = pd.read_csv(filepath_or_buffer=processed_dir_file_1)
df_bbc_clean_nodupes_atall_csv = pd.read_csv(filepath_or_buffer=processed_dir_file_2)

In [114]:
# Shapes of the frames loaded back from csv:
print(
    df_bbc_clean_nodupes_csv.shape,
    df_bbc_clean_nodupes_atall_csv.shape,
    sep='\n',
)

(5586, 27)
(3635, 27)


#### Work on Fox News:

In [54]:
%%time
# Primary and supplementary directories holding the custom RSS pulls:
directory = 'data/custom_data_pulls/'
directory_extra = 'data_extra/custom_data_pulls/'

# Stack the Fox csv's of each directory separately:
df_fox = concatenate_dataframes(pull_type='custom', directory=directory, source='_fox_')
df_fox_extra = concatenate_dataframes(pull_type='custom', directory=directory_extra, source='_fox_')

# Combine both pulls under a fresh 0..n-1 index. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
df_fox_clean = pd.concat([df_fox, df_fox_extra], ignore_index=True)

CPU times: user 14.8 s, sys: 1.82 s, total: 16.6 s
Wall time: 17.7 s


In [69]:
# Sanity check: the number of urls containing 'fox' must equal the row count,
# i.e. there are no non-Fox links:
df_fox_clean['url'].str.contains('fox').sum() == len(df_fox_clean)

True

In [108]:
# Dupes WITHIN the same RSS feed: keep the first row per (paper_section_name, url):
df_fox_clean_nodupes = df_fox_clean.drop_duplicates(
    subset=['paper_section_name', 'url'],
    keep='first',
)
# Dupes BETWEEN RSS feeds: keep the first row per url, whatever the feed:
df_fox_clean_nodupes_atall = df_fox_clean_nodupes.drop_duplicates(
    subset='url',
    keep='first',
)

# Rows/columns left after each de-duplication step:
print(
    df_fox_clean_nodupes.shape,
    df_fox_clean_nodupes_atall.shape,
    sep='\n',
)

(4637, 27)
(4202, 27)


Save and test loading back:

In [110]:
# Output files for the within-feed and the across-feed de-duplicated Fox frames:
processed_dir_file_1, processed_dir_file_2 = (
    'data/processed/2019_03_27_FOX_NoDupesWithin.csv',
    'data/processed/2019_03_27_FOX_NoDupesAtAll.csv',
)

# Write both frames to disk, leaving out the index column:
df_fox_clean_nodupes.to_csv(processed_dir_file_1, index=False)
df_fox_clean_nodupes_atall.to_csv(processed_dir_file_2, index=False)

# Read the saved files back so their shapes can be compared with the originals:
df_fox_clean_nodupes_csv = pd.read_csv(filepath_or_buffer=processed_dir_file_1)
df_fox_clean_nodupes_atall_csv = pd.read_csv(filepath_or_buffer=processed_dir_file_2)

In [113]:
# Shapes of the frames loaded back from csv:
print(
    df_fox_clean_nodupes_csv.shape,
    df_fox_clean_nodupes_atall_csv.shape,
    sep='\n',
)

(4637, 27)
(4202, 27)


#### Work on CNN:

In [3]:
%%time
# Primary and supplementary directories holding the custom RSS pulls:
directory, directory_extra = 'data/custom_data_pulls/', 'data_extra/custom_data_pulls/'

# Stack every CNN csv found in the primary directory:
df_cnn = concatenate_dataframes(
    pull_type='custom',
    directory=directory,
    source='_cnn_',
)

CPU times: user 39.3 s, sys: 7.46 s, total: 46.7 s
Wall time: 47.6 s


In [5]:
%%time
# Same stacking for the CNN csv's in the supplementary directory:
df_cnn_extra = concatenate_dataframes(
    pull_type='custom',
    directory=directory_extra,
    source='_cnn_',
)

KeyboardInterrupt: 

In [5]:
# Stack the primary and supplementary CNN pulls under a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
df_cnn_clean = pd.concat([df_cnn, df_cnn_extra], ignore_index=True)

In [6]:
# Sanity check: the number of urls containing 'cnn' must equal the row count,
# i.e. there are no non-CNN links:
df_cnn_clean['url'].str.contains('cnn').sum() == len(df_cnn_clean)

False

In [7]:
# Delete non-CNN entries (sometimes CNN links to external sites in RSS feeds, apparently).
# na=False makes a missing url count as "not CNN"; without it the mask holds
# NaN for such rows and the `~` negation raises a TypeError.
filter_noncnn = ~df_cnn_clean.url.str.contains('cnn', na=False)
noncnn_idx = df_cnn_clean[filter_noncnn].index

# Rebind instead of dropping in place, so the merged frame is not mutated
# behind the back of anything that still references it.
df_cnn_clean = df_cnn_clean.drop(noncnn_idx)

In [8]:
# Dupes WITHIN the same RSS feed: keep the first row per (paper_section_name, url):
df_cnn_clean_nodupes = df_cnn_clean.drop_duplicates(
    subset=['paper_section_name', 'url'],
    keep='first',
)
# Dupes BETWEEN RSS feeds: keep the first row per url, whatever the feed:
df_cnn_clean_nodupes_atall = df_cnn_clean_nodupes.drop_duplicates(
    subset='url',
    keep='first',
)

# Rows/columns left after each de-duplication step:
print(
    df_cnn_clean_nodupes.shape,
    df_cnn_clean_nodupes_atall.shape,
    sep='\n',
)

(6050, 27)
(3942, 27)


Save and test loading back:

In [9]:
# Output files for the within-feed and the across-feed de-duplicated CNN frames:
processed_dir_file_1, processed_dir_file_2 = (
    'data/processed/2019_03_28_CNN_NoDupesWithin.csv',
    'data/processed/2019_03_28_CNN_NoDupesAtAll.csv',
)

In [10]:
%%time
# Write the within-feed de-duplicated CNN frame to disk, leaving out the index column:
df_cnn_clean_nodupes.to_csv(processed_dir_file_1, index=False)

CPU times: user 6min 23s, sys: 1min 55s, total: 8min 18s
Wall time: 22min 41s


In [11]:
%%time
# Write the across-feed de-duplicated CNN frame to disk, leaving out the index column:
df_cnn_clean_nodupes_atall.to_csv(processed_dir_file_2, index=False)

CPU times: user 3min 53s, sys: 1min 8s, total: 5min 2s
Wall time: 10min 59s


In [12]:
# Read the saved CNN files back so their shapes can be compared with the originals:
df_cnn_clean_nodupes_csv = pd.read_csv(filepath_or_buffer=processed_dir_file_1)
df_cnn_clean_nodupes_atall_csv = pd.read_csv(filepath_or_buffer=processed_dir_file_2)

# Shapes of the frames loaded back from csv:
print(
    df_cnn_clean_nodupes_csv.shape,
    df_cnn_clean_nodupes_atall_csv.shape,
    sep='\n',
)

KeyboardInterrupt: 