In [1]:
# Main imports
import pandas as pd
import numpy as np
import torch, torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
import os, sys
import seaborn as sns
from tqdm import tqdm
from copy import deepcopy
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Read the CSVs
data_2017 = pd.read_csv('/Users/iranjan/Projects/dd4g-bias-in-media/data/2017_articles_mass.csv')
data_2018 = pd.read_csv('/Users/iranjan/Projects/dd4g-bias-in-media/data/2018_articles_mass.csv')

In [3]:
data_2017.head()


Unnamed: 0.1,Unnamed: 0,position_section,position_subsection,hl1,hl2,author,lede,body,word_count,content-id,pub_date,indexing_terms,year
0,183,LIVING ARTS,,Music,,,sub rosa: songs from three mile island remembe...,"godheadsilo this fargo, n.d.-born duo, embarki...",2003.0,BGLOBE-2a68b240-d1d9-11e6-b500-ed0ab7c07c6a,2017-01-08,"{'subject': [{'score': '90', 'classCode': 'ST0...",2017
1,195,SPORTS,,"Duke 93, BC 82",,,"at cameron indoor stadium, durham, n.c. boston...",jeffers 29 3-4 2-2 4-12 1 1 8 turner 35 5-9 2-...,229.0,BGLOBE-7b4a7670-d522-11e6-b500-ed0ab7c07c6a,2017-01-08,"{'legal': [{'className': 'Bankruptcy Law', 'cl...",2017
2,203,REGIONAL,West,CAMPUS ANGLE,DAVID OLUWADARA,,a repeat patriot league men's indoor field ath...,"last february at bu's valentine invitational, ...",734.0,BGLOBE-d61f6ee0-ce0e-11e6-b750-b206eb104cd9,2017-01-08,"{'legal': [{'className': 'Education Law', 'cla...",2017
3,205,REGIONAL,West,NOTEWORTHY,,,cal howes maynard the freshman basketball guar...,jennifer narlee medfield narlee was named new ...,157.0,BGLOBE-995230a0-ce0f-11e6-b750-b206eb104cd9,2017-01-08,"{'subject': [{'score': '90', 'classCode': 'STX...",2017
4,207,SUNDAY,Travel,Broadway actor enjoys visiting places with old...,,,"rob mcclure with his wife, maggie lakis, in ve...",favorite vacation destination? venice. i love...,497.0,BGLOBE-31a8a19c-d106-11e6-b500-ed0ab7c07c6a,2017-01-08,"{'subject': [{'score': '78', 'classCode': 'ST0...",2017


In [4]:
data_2018.head()


Unnamed: 0.1,Unnamed: 0,position_section,position_subsection,hl1,hl2,author,lede,body,word_count,content-id,pub_date,indexing_terms,year
0,0,SPORTS,,Team up in air on Canada's pot issue,,,"calgary, alberta — some bruins rely on plant-b...",body while now legal in 13 nhl cities — seven ...,912.0,BGLOBE-1a045630-d24e-11e8-83d5-dee7c961b652,2018-10-18,"{'subject': [{'score': '92', 'classCode': 'STX...",2018
1,7,NEWS,Metro,"Out of chaos, a candidate shouting to be heard",Trump-backer Ayyadurai lags in race for Senate...,,shiva ayyadurai election day is several weeks ...,body there's name-calling. ayyadurai calls sen...,1456.0,BGLOBE-1765cf42-c201-11e8-9dad-d8665da1fb0e,2018-10-18,"{'legal': [{'className': 'Governments', 'class...",2018
2,19,BUSINESS,,Walsh looking for climate plan funds,,,business leaders seem ready to embrace marty w...,body the mayor received a prolonged standing o...,547.0,BGLOBE-7339aaec-d24f-11e8-83d5-dee7c961b652,2018-10-18,"{'subject': [{'score': '89', 'classCode': 'ST0...",2018
3,24,SPORTS,,Baseball playoffs,,,"game 1 astros 7, red sox 2 game 2 red sox 7, a...",body,44.0,BGLOBE-2f273fb2-d250-11e8-8fdb-5c3af1605444,2018-10-18,{'legal': [{'className': 'Business & Corporate...,2018
4,25,NEWS,Metro,Harvard decisions benefited donors,Bias suit highlights admissions process,,when harvard university admitted several appli...,"body ellwood lauded fitzsimmons on ""big wins,""...",1172.0,BGLOBE-138deca0-d24c-11e8-8fdb-5c3af1605444,2018-10-18,"{'legal': [{'className': 'Education Law', 'cla...",2018


In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/iranjan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
data_2017 = data_2017.fillna('unknown')
data_2018 = data_2018.fillna('unknown')

In [13]:
def clean_data(df):

    tqdm.pandas()
    # Create a copy of the dataframe to return
    df_toclean = df.copy()

    # Remove first column that is unnamed, as well as content-id column
    df_toclean = df_toclean.drop(columns=['Unnamed: 0', 'content-id', 'word_count'])

    # Make all text lowercase in hl1 hl2 lede body columns 
    df_toclean['hl1'] = df_toclean['hl1'].str.lower()
    df_toclean['hl2'] = df_toclean['hl2'].str.lower()
    df_toclean['lede'] = df_toclean['lede'].str.lower()
    df_toclean['body'] = df_toclean['body'].str.lower()

    # Remove extra whitespace in hl1 hl2 lede body columns 
    df_toclean['hl1'] = df_toclean['hl1'].str.strip()
    df_toclean['hl2'] = df_toclean['hl2'].str.strip()
    df_toclean['lede'] = df_toclean['lede'].str.strip()
    df_toclean['body'] = df_toclean['body'].str.strip()

    # If the body only has the word 'body', replace it with the lede
    df_toclean.loc[df_toclean['body'] == 'body', 'body'] = df_toclean['lede']

    # Clean Bodies using Regex
    # func_clean_body_no_punct = lambda x: ' '.join([item for item in re.findall(r'[A-Za-z0-9]+', x) if len(item) > 1])
    # df_toclean['body_cleaned'] = df_toclean['body'].progress_apply(func_clean_body_no_punct)

    # Do an actual word count
    func_count_words = lambda x: len(x.split())
    df['actual_body_word_count'] = df['body'].progress_apply(func_count_words)

    # Clean Bodies using Regex
    func_clean_body = lambda x: ' '.join(re.findall(r'[A-Za-z0-9!@#$%^&*()]+.', x))
    df['body_cleaned'] = df['body'].progress_apply(func_clean_body)

    # Before stop words: create a column actual_body_word_count that has the true word count of the body BEFORE it is cleaned and words are removed 
    nltk.download('stopwords')

    stop_words = set(stopwords.words('english'))
    
    func_remove_stop = lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words])
    df_toclean['body_cleaned'] = df_toclean['body_cleaned'].progress_apply(func_remove_stop)

    return df_toclean



In [14]:
data_2018_cleaned = clean_data(data_2018)

100%|██████████| 12287/12287 [00:00<00:00, 42093.47it/s]
100%|██████████| 12287/12287 [00:00<00:00, 12435.55it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/iranjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 12287/12287 [00:35<00:00, 347.61it/s]


In [15]:
data_2018_cleaned.head()

Unnamed: 0,position_section,position_subsection,hl1,hl2,author,lede,body,pub_date,indexing_terms,year,actual_body_word_count,body_cleaned
0,SPORTS,unknown,team up in air on canada's pot issue,unknown,unknown,"calgary, alberta — some bruins rely on plant-b...",body while now legal in 13 nhl cities — seven ...,2018-10-18,"{'subject': [{'score': '92', 'classCode': 'STX...",2018,837,"body legal 13 nhl cities seven canada , three ..."
1,NEWS,Metro,"out of chaos, a candidate shouting to be heard",trump-backer ayyadurai lags in race for senate...,unknown,shiva ayyadurai election day is several weeks ...,body there's name-calling. ayyadurai calls sen...,2018-10-18,"{'legal': [{'className': 'Governments', 'class...",2018,1329,body ' name- calling . ayyadurai calls senator...
2,BUSINESS,unknown,walsh looking for climate plan funds,unknown,unknown,business leaders seem ready to embrace marty w...,body the mayor received a prolonged standing o...,2018-10-18,"{'subject': [{'score': '89', 'classCode': 'ST0...",2018,497,body mayor received prolonged standing ovation...
3,SPORTS,unknown,baseball playoffs,unknown,unknown,"game 1 astros 7, red sox 2 game 2 red sox 7, a...","game 1 astros 7, red sox 2 game 2 red sox 7, a...",2018-10-18,{'legal': [{'className': 'Business & Corporate...,2018,1,body
4,NEWS,Metro,harvard decisions benefited donors,bias suit highlights admissions process,unknown,when harvard university admitted several appli...,"body ellwood lauded fitzsimmons on ""big wins,""...",2018-10-18,"{'legal': [{'className': 'Education Law', 'cla...",2018,964,"body ellwood lauded fitzsimmons big wins , inc..."


In [10]:
data_2017_cleaned = clean_data(data_2017)

100%|██████████| 15294/15294 [00:00<00:00, 43611.06it/s]
100%|██████████| 15294/15294 [00:01<00:00, 12956.75it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/iranjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 15294/15294 [00:42<00:00, 358.58it/s]


In [11]:
data_2017_cleaned.head()

Unnamed: 0,position_section,position_subsection,hl1,hl2,author,lede,body,pub_date,indexing_terms,year,actual_body_word_count,body_cleaned
0,LIVING ARTS,unknown,music,unknown,unknown,sub rosa: songs from three mile island remembe...,"godheadsilo this fargo, n.d.-born duo, embarki...",2017-01-08,"{'subject': [{'score': '90', 'classCode': 'ST0...",2017,1624,"godheadsilo fargo , n. d. born duo , embarking..."
1,SPORTS,unknown,"duke 93, bc 82",unknown,unknown,"at cameron indoor stadium, durham, n.c. boston...",jeffers 29 3-4 2-2 4-12 1 1 8 turner 35 5-9 2-...,2017-01-08,"{'legal': [{'className': 'Bankruptcy Law', 'cl...",2017,264,jeffers 29 3- 4 2- 2 4- 12 1 1 8 turner 35 5- ...
2,REGIONAL,West,campus angle,david oluwadara,unknown,a repeat patriot league men's indoor field ath...,"last february at bu's valentine invitational, ...",2017-01-08,"{'legal': [{'className': 'Education Law', 'cla...",2017,694,"last february bu ' valentine invitational , ne..."
3,REGIONAL,West,noteworthy,unknown,unknown,cal howes maynard the freshman basketball guar...,jennifer narlee medfield narlee was named new ...,2017-01-08,"{'subject': [{'score': '90', 'classCode': 'STX...",2017,93,jennifer narlee medfield narlee named new engl...
4,SUNDAY,Travel,broadway actor enjoys visiting places with old...,unknown,unknown,"rob mcclure with his wife, maggie lakis, in ve...",favorite vacation destination? venice. i loved...,2017-01-08,"{'subject': [{'score': '78', 'classCode': 'ST0...",2017,301,favorite vacation destination ? venice . loved...


In [16]:
data_2017_cleaned.to_csv('/Users/iranjan/Projects/dd4g-bias-in-media/data/cleaned_data_2017.csv', index=False)
data_2018_cleaned.to_csv('/Users/iranjan/Projects/dd4g-bias-in-media/data/cleaned_data_2018.csv', index=False)