In [56]:
import numbers
from typing import Tuple

import pandas as pd
import tqdm

from functions import standardize_word, count_words

In [2]:
# Path of the raw headlines export, relative to the notebook's working directory.
raw_headlines_csv = 'irishtimes-date-text.csv/irishtimes-date-text.csv'
df = pd.read_csv(raw_headlines_csv)

In [3]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,19960102,news,UUP sees possibility of voting Major out
1,19960102,news,Pubs targeted as curbs on smoking are extended
2,19960102,news,Papers reveal secret links with O'Neill cabinet
3,19960102,news,Domestic chaos as Italy takes EU presidency
4,19960102,news,Learning about the star to which we owe life


Cleaning Publish Date

In [4]:
# Add the three date-part columns up front, filled with a 0 placeholder,
# then show the frame to confirm they were appended.
for date_part in ('year', 'month', 'day'):
    df[date_part] = 0
df.head()

Unnamed: 0,publish_date,headline_category,headline_text,year,month,day
0,19960102,news,UUP sees possibility of voting Major out,0,0,0
1,19960102,news,Pubs targeted as curbs on smoking are extended,0,0,0
2,19960102,news,Papers reveal secret links with O'Neill cabinet,0,0,0
3,19960102,news,Domestic chaos as Italy takes EU presidency,0,0,0
4,19960102,news,Learning about the star to which we owe life,0,0,0


In [5]:
# Take the first publish_date as a string to experiment with slicing it.
date = str(df['publish_date'][0])

In [6]:
# Characters 4-5 of the YYYYMMDD string hold the month.
date[4:6]

'01'

First attempt, abandoned because it was too slow: filling the columns one row at a time with `df.iterrows()` and `df.loc`.

```python
for index, row in df.iterrows():
    if(index % 100 == 0): print(index)
    date = str(row['publish_date'])
    df.loc[index, 'year'] = int(date[:4])
    df.loc[index, 'month'] = int(date[4:6])
    df.loc[index, 'day'] = int(date[6:])
```

In [7]:
# Pull the column out as a plain Python list so the dates can be walked in a
# simple loop instead of going through df.iterrows().
dates = df['publish_date'].to_list()

In [8]:
# Three independent accumulators, one per date component.
years, months, days = [], [], []

In [9]:
def split_date(x: int) -> Tuple[str, str, str]:
    """Split an integer date written as YYYYMMDD into its digit groups.

    Parameters
    ----------
    x : int
        The date as an integer, e.g. ``19960102``. Any integral type is
        accepted (plain ``int``, ``int`` subclasses, numpy integers);
        ``bool`` is rejected.

    Returns
    -------
    tuple of (str, str, str)
        ``(year, month, day)`` as *strings*, so leading zeros are kept
        (``19960102`` -> ``('1996', '01', '02')``).

    Raises
    ------
    AssertionError
        If ``x`` is not an integer.

    Notes
    -----
    The number of digits is not validated; the value is sliced positionally
    as ``[:4]``, ``[4:6]`` and ``[6:]``.
    """
    # isinstance (rather than an exact type comparison) lets integer
    # subclasses and numpy integers through; bool is an int subclass, so it
    # is excluded explicitly.
    assert isinstance(x, numbers.Integral) and not isinstance(x, bool), \
        'input to function must be integer'
    # Normalise through int() so every integral type renders as plain digits.
    s = str(int(x))
    return s[:4], s[4:6], s[6:]

In [10]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every date (the lists would then no longer match the frame's length).
years, months, days = [], [], []

# Only `import tqdm` is in scope, so the notebook progress bar has to be
# reached through the module; the bare name `tqdm_notebook` is a NameError on
# a fresh kernel.
for date in tqdm.tqdm_notebook(dates):
    year, month, day = split_date(date)
    years.append(year)
    months.append(month)
    days.append(day)

HBox(children=(IntProgress(value=0, max=1425460), HTML(value='')))




In [11]:
# Spot-check the last parsed year; note that it is a string, not an int.
years[-1]

'2018'

In [12]:
# Replace the placeholder column with the parsed years.
# NOTE(review): split_date returns strings, so this column holds str values.
df['year'] = years

In [13]:
# Replace the placeholder column with the parsed months.
# NOTE(review): values are strings sliced from the date (e.g. '01').
df['month'] = months

In [14]:
# Replace the placeholder column with the parsed days.
# NOTE(review): values are strings sliced from the date (e.g. '02').
df['day'] = days

In [15]:
# Confirm the year/month/day columns now carry real values.
df.head()

Unnamed: 0,publish_date,headline_category,headline_text,year,month,day
0,19960102,news,UUP sees possibility of voting Major out,1996,1,2
1,19960102,news,Pubs targeted as curbs on smoking are extended,1996,1,2
2,19960102,news,Papers reveal secret links with O'Neill cabinet,1996,1,2
3,19960102,news,Domestic chaos as Italy takes EU presidency,1996,1,2
4,19960102,news,Learning about the star to which we owe life,1996,1,2


In [16]:
# Character count of every headline, in row order.
# Only `import tqdm` is in scope, so the notebook progress bar is reached
# through the module; the bare name `tqdm_notebook` is a NameError on a fresh
# kernel.
lengths = [len(headline) for headline in tqdm.tqdm_notebook(df['headline_text'])]

HBox(children=(IntProgress(value=0, max=1425460), HTML(value='')))




In [17]:
# Store the per-headline character counts as a new column.
df['headline_len'] = lengths

In [18]:
 # Summary statistics (count, mean, quartiles, extremes) of headline length.
 df['headline_len'].describe()

count    1.425460e+06
mean     4.243940e+01
std      1.525467e+01
min      9.000000e+00
25%      3.200000e+01
50%      4.100000e+01
75%      5.300000e+01
max      2.790000e+02
Name: headline_len, dtype: float64

In [19]:
# Check that headline_len was added alongside the date columns.
df.head()

Unnamed: 0,publish_date,headline_category,headline_text,year,month,day,headline_len
0,19960102,news,UUP sees possibility of voting Major out,1996,1,2,40
1,19960102,news,Pubs targeted as curbs on smoking are extended,1996,1,2,46
2,19960102,news,Papers reveal secret links with O'Neill cabinet,1996,1,2,47
3,19960102,news,Domestic chaos as Italy takes EU presidency,1996,1,2,43
4,19960102,news,Learning about the star to which we owe life,1996,1,2,44


Create columns for categories and subcategories

In [33]:
# A headline_category is a dot-separated path such as 'news.world.us'.
# The part before the first dot is the category; whatever follows it (dots
# included) is the subcategory, or None when the value contains no dot at all.
categories, subcategories = [], []

for raw_category in df['headline_category']:
    top_level, dot, remainder = raw_category.partition('.')
    categories.append(top_level)
    # `dot` is empty only when no separator was found.
    subcategories.append(remainder if dot else None)

In [34]:
# Attach the split results; both lists were built in row order.
df['category'] = categories
df['subcategory'] = subcategories

In [35]:
# Look at the last rows, where multi-level categories such as 'news.world.us' appear.
df.tail()

Unnamed: 0,publish_date,headline_category,headline_text,year,month,day,headline_len,category,subcategory
1425455,20181231,sport.other-sports,$9m for 139 seconds: Floyd Mayweather eases pa...,2018,12,31,65,sport,other-sports
1425456,20181231,news.ireland,Missing pregnant teenager Zoe Hitchcock safe a...,2018,12,31,53,news,ireland
1425457,20181231,news.ireland,Mourners bid farewell to 'beguiling' critic Ei...,2018,12,31,60,news,ireland
1425458,20181231,news.world.us,'Los Angeles Times' making 'progress' in wake ...,2018,12,31,60,news,world.us
1425459,20181231,business.economy,Benign outlook for US-China trade talks lifts ...,2018,12,31,65,business,economy


In [3]:
# Second stage: reload the cleaned frame from disk and prepare the stop-word set.
# NOTE(review): 'irish_headlines_clean.csv' is only written by a df.to_csv cell
# further down, so on a fresh top-to-bottom run this read depends on a file
# left behind by an earlier session.
# NOTE(review): stopwords.words('english') presumably needs the NLTK
# 'stopwords' corpus to be downloaded already — confirm for new environments.
import nltk
from nltk.corpus import stopwords
df = pd.read_csv('irish_headlines_clean.csv')
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

In [5]:
# Try whitespace tokenisation on the first headline to see what the tokens look like.
test = df['headline_text'][0].split()
test

['UUP', 'sees', 'possibility', 'of', 'voting', 'Major', 'out']

In [31]:
# For every headline: split on whitespace, lower-case each token, drop the
# tokens found in stop_words and glue the survivors back together with
# single spaces.
filtered_headlines = [
    ' '.join(
        token
        for token in (word.lower() for word in headline.split())
        if token not in stop_words
    )
    for headline in df['headline_text']
]

In [33]:
# Keep the stop-word-free, lower-cased headline text next to the original.
df['filtered_headlines'] = filtered_headlines

In [47]:
# Count words across the filtered headlines.
# NOTE(review): count_words comes from the local `functions` module; the cells
# below call .keys()/.values() on its result, so it is presumably a
# word -> count mapping — confirm against that module.
word_counts = count_words(df['filtered_headlines'])

100%|█████████████████████████████████████████████████████████████████████| 1425460/1425460 [00:20<00:00, 68688.21it/s]


In [40]:
# Turn the word_counts mapping into a one-column frame indexed by its keys.
# Note that the column is labelled 'words' although it holds the values.
vocabulary = list(word_counts.keys())
frequencies = list(word_counts.values())
word_counts_df = pd.DataFrame({'words': frequencies}, index=vocabulary)
word_counts_df.head()

Unnamed: 0,words
uup,952
sees,3262
possibility,227
voting,789
major,3952


In [57]:
# Export the counts to JSON, ordered by the 'words' column (which holds the counts).
word_counts_df.sort_values(by = 'words').to_json('word_counts_all.json')

In [34]:
# Persist the cleaned frame without writing the index as a column.
# NOTE(review): this is the file that the pd.read_csv('irish_headlines_clean.csv')
# cell higher up loads, so that cell only works once this one has been run.
df.to_csv('irish_headlines_clean.csv', index = False)

In [45]:
from typing import List


In [48]:
# Probe only: call .split() on every filtered headline and discard the result.
# Presumably meant to surface entries that cannot be split (non-string values).
for x in df['filtered_headlines']:
    x.split()

In [49]:
# Compare headline_text with its filtered_headlines counterpart.
df.head()

Unnamed: 0,publish_date,headline_category,headline_text,year,month,day,headline_len,category,subcategory,filtered_headlines
0,19960102,news,UUP sees possibility of voting Major out,1996,1,2,40,news,,uup sees possibility voting major
1,19960102,news,Pubs targeted as curbs on smoking are extended,1996,1,2,46,news,,pubs targeted curbs smoking extended
2,19960102,news,Papers reveal secret links with O'Neill cabinet,1996,1,2,47,news,,papers reveal secret links o'neill cabinet
3,19960102,news,Domestic chaos as Italy takes EU presidency,1996,1,2,43,news,,domestic chaos italy takes eu presidency
4,19960102,news,Learning about the star to which we owe life,1996,1,2,44,news,,learning star owe life


In [50]:
# Sanity check: splitting an empty string gives an empty list rather than raising.
''.split()

[]

In [51]:
# Demonstrate that a missing value (None) cannot be split, unlike an empty
# string. The error is caught and printed so the demonstration stays visible
# without aborting a Restart & Run All of the notebook.
try:
    None.split()
except AttributeError as err:
    none_split_error = '{}: {}'.format(type(err).__name__, err)
    print(none_split_error)

AttributeError: 'NoneType' object has no attribute 'split'