### Load Samples of All Datasets

In [1]:
import pandas as pd
import json

# Column labels are passed in explicitly, so the header line of the CSV is
# parsed as an ordinary first row and has to be discarded afterwards.
data = pd.read_csv(
    "assets/dataset/article_metadata.csv",
    lineterminator='\n',
    names=[
        'ArticleId',
        'Url',
        'Title',
        'Tags',
        'Topic',
        'DatePublished',
        'Abstract',
        'FullText',
    ],
    index_col='ArticleId',
)
data = data.iloc[1:]  # positional slice: skip the header row that was read as data
print(data.shape)
data.head(5)

(13851, 7)


Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1275,/articles/how-free-immigration-could-double-th...,How Free Immigration Could Double the Economy,,,,"""The single best policy we could do for the po...","""\n\nGreat libertarian thinkers such as Milton..."
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,9/2/2015 10:56:24 AM,"""The show is a vibrant look at the early PC in...","""AMC’s Halt and Catch Fire is a brilliant achi..."
58871,/articles/americas-aristocracy-of-privilege-an...,America's Aristocracy of Privilege and Power (1),"Government,Government Intervention",Policy,3/25/2015 12:00:00 AM,"""The problem with politics is power, not people.""","""Bush, Kennedy, Romney, Clinton, and, yes, eve..."
58872,/articles/does-government-spending-help-the-ec...,Does Government Spending Help the Economy? (1),"Government,Government Intervention,Government ...",Policy,4/9/2015 12:00:00 AM,"""We have strong reasons to think that less is ...","""How much government spending is enough, and h..."
58873,/articles/payday-loans-and-predatory-politicians/,Payday Loans and Predatory Politicians (1),"Special Interests,Money and Banking",Politics,3/27/2015 12:00:00 AM,"""Banning usury may&nbsp;feel&nbsp;good, but it...","""Progressive politicians have found a ripe old..."


In [2]:
# Rows missing a publish date, URL, or title are drafts, non-articles, or
# corrupt records - keep only the rows where all three are present.
data = data.dropna(subset=['DatePublished', 'Url', 'Title'])
print(data.shape)

(13835, 7)


In [53]:
# Show the first five rows of the frame.
data.head(5)

Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,9/2/2015 10:56:24 AM,"""The show is a vibrant look at the early PC in...","""AMC’s Halt and Catch Fire is a brilliant achi..."
58871,/articles/americas-aristocracy-of-privilege-an...,America's Aristocracy of Privilege and Power (1),"Government,Government Intervention",Policy,3/25/2015 12:00:00 AM,"""The problem with politics is power, not people.""","""Bush, Kennedy, Romney, Clinton, and, yes, eve..."
58872,/articles/does-government-spending-help-the-ec...,Does Government Spending Help the Economy? (1),"Government,Government Intervention,Government ...",Policy,4/9/2015 12:00:00 AM,"""We have strong reasons to think that less is ...","""How much government spending is enough, and h..."
58873,/articles/payday-loans-and-predatory-politicians/,Payday Loans and Predatory Politicians (1),"Special Interests,Money and Banking",Politics,3/27/2015 12:00:00 AM,"""Banning usury may&nbsp;feel&nbsp;good, but it...","""Progressive politicians have found a ripe old..."
58874,/articles/earth-hour-and-how-the-west-plays-at...,Earth Hour and How the West Plays at Poverty (1),"Government Intervention,Regulation,Environment...",Policy,3/30/2015 12:00:00 AM,"""People who see virtue in doing without electr...","""On Saturday night, millions of rich people pl..."


## Decode JSON fields

In [3]:
# Remove the parenthesised part of each title (e.g. " (1)"), which is an
# artifact from the CMS.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would make this a no-op.
# The final strip() drops the space left behind in front of a removed suffix.
data['Title'] = data['Title'].str.replace(r"\(.*\)", "", regex=True).str.strip()
data.head()

Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,9/2/2015 10:56:24 AM,"""The show is a vibrant look at the early PC in...","""AMC’s Halt and Catch Fire is a brilliant achi..."
58871,/articles/americas-aristocracy-of-privilege-an...,America's Aristocracy of Privilege and Power,"Government,Government Intervention",Policy,3/25/2015 12:00:00 AM,"""The problem with politics is power, not people.""","""Bush, Kennedy, Romney, Clinton, and, yes, eve..."
58872,/articles/does-government-spending-help-the-ec...,Does Government Spending Help the Economy?,"Government,Government Intervention,Government ...",Policy,4/9/2015 12:00:00 AM,"""We have strong reasons to think that less is ...","""How much government spending is enough, and h..."
58873,/articles/payday-loans-and-predatory-politicians/,Payday Loans and Predatory Politicians,"Special Interests,Money and Banking",Politics,3/27/2015 12:00:00 AM,"""Banning usury may&nbsp;feel&nbsp;good, but it...","""Progressive politicians have found a ripe old..."
58874,/articles/earth-hour-and-how-the-west-plays-at...,Earth Hour and How the West Plays at Poverty,"Government Intervention,Regulation,Environment...",Policy,3/30/2015 12:00:00 AM,"""People who see virtue in doing without electr...","""On Saturday night, millions of rich people pl..."


In [8]:
# decode JSON fields
# for some reason json.loads did not work here
# - I used Newtonsoft (Json.NET) to encode the string so that whitespace would not corrupt the CSV
import codecs
# print(codecs.decode(data.ix[0].FullText,'unicode-escape').replace('â','\'').replace('â','"').replace('Â','\"'))

# Typographic characters that are flattened to plain ASCII after decoding.
_PUNCTUATION_TABLE = str.maketrans({
    '\u2018': "'",   # left single quotation mark
    '\u2019': "'",   # right single quotation mark / apostrophe
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
    '\u00a0': ' ',   # non-breaking space
})


def decodeJSONString(value):
    r"""Expand backslash escapes in ``value`` and normalise its punctuation.

    Escape sequences such as ``\n``, ``\"`` or ``\u2019`` are expanded with the
    ``unicode-escape`` codec. Curly quotes are then replaced by straight
    quotes, and non-breaking spaces (the character itself as well as the
    ``&nbsp;`` HTML entity) by ordinary spaces.

    Parameters
    ----------
    value : object
        Text to decode. Anything that is neither ``str`` nor ``bytes`` (for
        example the ``NaN`` used for a missing cell) is returned untouched.

    Returns
    -------
    object
        The decoded, normalised text, or ``value`` itself when it is not text
        or contains a malformed escape sequence.
    """
    if isinstance(value, (bytes, bytearray)):
        raw = bytes(value)
    elif isinstance(value, str):
        # unicode-escape interprets its input as Latin-1 bytes, so handing it a
        # str directly turns every non-ASCII character into mojibake. Encoding
        # to Latin-1 with backslashreplace keeps U+0080..U+00FF as single bytes
        # and rewrites everything above as a \uXXXX escape, so the text
        # survives the round trip unchanged.
        raw = value.encode('latin-1', 'backslashreplace')
    else:
        return value

    try:
        decoded = raw.decode('unicode-escape')
    except UnicodeDecodeError:
        # Malformed escape (e.g. a trailing backslash): keep the input as-is
        # rather than failing the whole column.
        return value

    return decoded.translate(_PUNCTUATION_TABLE).replace('&nbsp;', ' ')

# print(decodeJSONString(data.ix[0].FullText))

# Run the decoder over both free-text columns, then preview the result.
for text_column in ('Abstract', 'FullText'):
    data[text_column] = data[text_column].map(decodeJSONString)
data.head()

Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,9/2/2015 10:56:24 AM,"""The show is a vibrant look at the early PC in...","""AMC's Halt and Catch Fire is a brilliant achi..."
58871,/articles/americas-aristocracy-of-privilege-an...,America's Aristocracy of Privilege and Power,"Government,Government Intervention",Policy,3/25/2015 12:00:00 AM,"""The problem with politics is power, not people.""","""Bush, Kennedy, Romney, Clinton, and, yes, eve..."
58872,/articles/does-government-spending-help-the-ec...,Does Government Spending Help the Economy?,"Government,Government Intervention,Government ...",Policy,4/9/2015 12:00:00 AM,"""We have strong reasons to think that less is ...","""How much government spending is enough, and h..."
58873,/articles/payday-loans-and-predatory-politicians/,Payday Loans and Predatory Politicians,"Special Interests,Money and Banking",Politics,3/27/2015 12:00:00 AM,"""Banning usury may feel good, but it sure won&...","""Progressive politicians have found a ripe old..."
58874,/articles/earth-hour-and-how-the-west-plays-at...,Earth Hour and How the West Plays at Poverty,"Government Intervention,Regulation,Environment...",Policy,3/30/2015 12:00:00 AM,"""People who see virtue in doing without electr...","""On Saturday night, millions of rich people pl..."


Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,9/2/2015 10:56:24 AM,"""The show is a vibrant look at the early PC in...","""AMC’s Halt and Catch Fire is a brilliant achi..."
58871,/articles/americas-aristocracy-of-privilege-an...,America's Aristocracy of Privilege and Power,"Government,Government Intervention",Policy,3/25/2015 12:00:00 AM,"""The problem with politics is power, not people.""","""Bush, Kennedy, Romney, Clinton, and, yes, eve..."
58872,/articles/does-government-spending-help-the-ec...,Does Government Spending Help the Economy?,"Government,Government Intervention,Government ...",Policy,4/9/2015 12:00:00 AM,"""We have strong reasons to think that less is ...","""How much government spending is enough, and h..."
58873,/articles/payday-loans-and-predatory-politicians/,Payday Loans and Predatory Politicians,"Special Interests,Money and Banking",Politics,3/27/2015 12:00:00 AM,"""Banning usury may&nbsp;feel&nbsp;good, but it...","""Progressive politicians have found a ripe old..."
58874,/articles/earth-hour-and-how-the-west-plays-at...,Earth Hour and How the West Plays at Poverty,"Government Intervention,Regulation,Environment...",Policy,3/30/2015 12:00:00 AM,"""People who see virtue in doing without electr...","""On Saturday night, millions of rich people pl..."


In [10]:
# Persist the cleaned frame. Forward slashes keep the path portable: a
# backslash-separated literal is only a nested path on Windows, and \d / \A
# are invalid escape sequences in a normal string. to_pickle returns None, so
# there is nothing worth printing.
data.to_pickle('assets/dataset/ArticleMetadata.pkl')

None
