In [1]:
"""
Import modules
"""
import os
import sys
import csv
import numpy as np
import pandas as pd

# Set the CSV field limit as large as possible.
# sys.maxsize does not fit into a C long on every platform (csv.field_size_limit
# then raises OverflowError), so halve the requested limit until it is accepted.
csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(csv_field_limit)
        break
    except OverflowError:
        csv_field_limit //= 2

131072

In [7]:
"""
Get the dataset
"""

# Remote archive to fetch and the local directory it is unpacked into.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip'
data_dir = './test'
# NOTE(review): the loading cell reads from './data/NewsAggregatorDataset/', not
# from this directory -- confirm which location is intended.

# The directory is created before the download starts, so a failed download can
# leave an empty directory behind; treat that case as "dataset missing" too
# instead of reporting the dataset as ready.
if not os.path.isdir(data_dir) or not os.listdir(data_dir):
    print('Dataset does not exist. Requesting dataset...')
    # makedirs also copes with nested paths and with an already existing (empty) directory
    os.makedirs(data_dir, exist_ok=True)
    # Imported lazily: these modules are only needed when a download actually happens.
    import wget
    import zipfile
    # download file
    output_dir = data_dir
    downloaded_file = wget.download(data_url, out=output_dir)
    print('Downloaded file at: {}'.format(downloaded_file))
    # unzip and remove the zipped file; the context manager closes the archive
    # even when extraction raises
    with zipfile.ZipFile(downloaded_file) as zipped_file:
        zipped_file.extractall(data_dir)
    print('Cleaning up...')
    os.remove(downloaded_file)
print('Dataset is now ready')

Dataset does not exist. Requesting dataset...
Downloaded file at: ./test/NewsAggregatorDataset.zip
Cleaning up...
Dataset is now ready


In [30]:
"""
Read file and create Pandas DataFrame
"""
columns = ['id', 'title', 'publisher', 'category']

# containers for each field
data_id = []
title = []
publisher = []
category = []

# The records are tab-separated, so parse them as such and disable quote
# handling: commas and double quotes inside a title are then kept verbatim
# instead of being interpreted (and dropped) as CSV syntax.
# encoding: assumes the corpus file is UTF-8 -- TODO confirm.
with open('./data/NewsAggregatorDataset/newsCorpora.csv', "r", encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if not row:
            # blank line: nothing to parse
            continue
        # Fields used: 0 = id, 1 = title, 3 = publisher, 4 = category.
        data_id.append(int(row[0]))
        title.append(row[1])
        publisher.append(row[3])
        category.append(row[4])

# create a dataframe and save it to disk
# Building from a dict keeps 'id' as an integer column; stacking the lists in a
# single NumPy array would coerce every value to a string.
corpus_df = pd.DataFrame(
    {'id': data_id, 'title': title, 'publisher': publisher, 'category': category},
    columns=columns,
)
csv_name = "./data/news_aggregator_dataset.csv"
corpus_df.to_csv(csv_name)

In [40]:
"""
Extract articles from selected publishers
"""
# Publishers to keep, in the same order as the per-publisher frames unpacked below.
target_pub = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']

# One filtered frame per publisher: the rows of corpus_df whose 'publisher'
# value equals that name exactly.
reuter, huff_post, b_week, con_music, daily_mail = [
    corpus_df[corpus_df['publisher'].eq(pub_name)] for pub_name in target_pub
]

# Report the number of articles found for each publisher.
for count_label, pub_articles in [
    ("# of Reuter articles: ", reuter),
    ("# of Huffington Post articles: ", huff_post),
    ("# of Businessweek articles: ", b_week),
    ("# of Contactmusic.com articles: ", con_music),
    ("# of Daily Mail articles: ", daily_mail),
]:
    print(count_label, len(pub_articles))

# of Reuter articles:  3868
# of Huffington Post articles:  2412
# of Businessweek articles:  2371
# of Contactmusic.com articles:  2288
# of Daily Mail articles:  2205
           id                                              title  \
0        3150  Lena Dunham Apologizes For Molestation Joke On...   
1       63277  Christina Aguilera 'expecting a baby girl' wit...   
2      103247  Is Katherine Heigl's Lawsuit The Most Bizarrel...   
3       12939  U.S. Navy SEALs take control of North Korean-f...   
4       77136  Chrysler Canada says auto sales climb 2 percen...   
...       ...                                                ...   
13139  296632  Jason Momoa Is Aquaman! But What Do We Know Ab...   
13140  219018  Girls Gone Wild's Joe Francis is arrested for ...   
13141  123018  HBO No GO: Your Game of Thrones Streaming Woes...   
13142   33782  So This Is What Twitter Has To Say About The #...   
13143   35625     US forces hand over seized oil tanker to Libya   

              

In [None]:
"""
Split into train, validation, and test dataset and save them
"""

selected_articles = pd.concat([reuter, huff_post, b_week, con_music, daily_mail])

# Shuffle with a fixed seed so that re-running the notebook reproduces the same
# train/validation/test files.
split_seed = 42
selected_articles_shuffled = (
    selected_articles
    .sample(frac=1, random_state=split_seed)
    .reset_index(drop=True)
)
print(selected_articles_shuffled)

# 80% of the rows go to training; the remainder is halved into validation and
# test, with test receiving the extra row when the remainder is odd.
num_total = len(selected_articles_shuffled)
num_train = int(num_total * 0.8)
num_validation = int((num_total - num_train) * 0.5)
num_test = num_total - num_train - num_validation

train_data = selected_articles_shuffled.iloc[:num_train]
validation_data = selected_articles_shuffled.iloc[num_train:num_train+num_validation]
test_data = selected_articles_shuffled.iloc[num_train+num_validation:]

# check if there's missing data: the three slices must have the planned sizes
# and together account for every row
assert (len(train_data), len(validation_data), len(test_data)) == (num_train, num_validation, num_test)
assert len(train_data) + len(validation_data) + len(test_data) == num_total

train_data.to_csv('./data/train.txt', sep='\t')
validation_data.to_csv('./data/valid.txt', sep='\t')
test_data.to_csv('./data/test.txt', sep='\t')