In [3]:
import json
import string
import pandas as pd

from pathlib import Path
from itertools import chain
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
# Notebook-wide configuration

# When True, clean_article also removes stop words and lemmatizes every word.
CLEAN_AND_LEMMATIZE = False
# Prefix for the input and output CSV files. It is joined to file names with "+",
# so the trailing slash is required.
PATH = "data/"

In [23]:
# Load the raw articles from the CSV file under PATH

print("Retrieving articles from disk...")
raw_csv = f"{PATH}dataset.csv"
dataset = pd.read_csv(raw_csv)
print(dataset)

Retrieving articles from disk...
                                          Article_Title  \
0     If anti-Sanders Democrats were serious, they’d...   
1     The Japanese prime minister is going to Pearl ...   
2     The best argument for each of the 2020 Democra...   
3     Instagram found a new place to show you ads: I...   
4                           My Week Without Apple Watch   
...                                                 ...   
6808  EpiPen Price Hikes Reportedly Added Millions T...   
6809  The First Year Home May Be Most Dangerous For ...   
6810  Mistakes That Fueled Ebola Spread Are Preventi...   
6811              Taylor Spear's GPS Guide On Self Care   
6812  Hacktivists Couldn't Have Pulled Off Cyber Att...   

                                           Article_Text         Publish_Date  \
0     Democrats opposed to Sen. Bernie Sanders want ...  2020-02-29 20:00:00   
1     About six months ago, President Obama became t...  2016-12-27 14:20:01   
2     Share All sh

In [24]:
# Download the NLTK data used for cleaning: the stop-word list and the WordNet corpus
# that the lemmatizer relies on.

# nltk.download reports success as a boolean. A list (not a generator) is used so that
# both packages are always attempted, and the cell's displayed value is True only when
# every download succeeded, instead of reflecting the last package alone.
all([nltk.download(package) for package in ("stopwords", "wordnet")])

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/student/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
# Clean article contents

# Characters stripped from the text: ASCII punctuation plus curly quotes, digits,
# long dashes, the ellipsis character and newlines.
extra_symbols = "“”‘’0123456789―—–…\n"
punctuation = string.punctuation + extra_symbols

# English stop words, kept in a set for fast membership tests
stop_words = {*stopwords.words("english")}

# Characters that sit *between* words. They are turned into a space rather than
# deleted, so "end\nstart" or "well—known" do not collapse into a single word.
_WORD_SEPARATORS = "\n―—–…"


def _translation_tables():
    """Return the (separator -> space, punctuation -> deleted) tables for str.translate."""
    # Only separators that are actually listed in `punctuation` are rewritten, so the
    # module-level `punctuation` string stays the single source of truth.
    separators = {ord(char): " " for char in _WORD_SEPARATORS if char in punctuation}
    removals = {ord(char): None for char in punctuation}
    return separators, removals


def clean_article(article):
    """Normalize the text of a single article.

    The text is lowercased, word-separating characters (newline, long dashes,
    ellipsis) are replaced by a space, and every other character listed in the
    module-level ``punctuation`` string is removed. When ``CLEAN_AND_LEMMATIZE``
    is true, stop words are dropped as well, the remaining words are lemmatized
    with ``WordNetLemmatizer`` and joined by single spaces.

    Parameters
    ----------
    article : str
        Raw article text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as missing text.

    Returns
    -------
    str
        The cleaned text; an empty string for non-string input.
    """
    # Missing cells arrive as float NaN, which has no .lower(); treat them as empty text.
    if not isinstance(article, str):
        return ""

    separators, removals = _translation_tables()
    article = article.lower().translate(separators)

    if not CLEAN_AND_LEMMATIZE:
        return article.translate(removals)

    lemmatizer = WordNetLemmatizer()
    words = []
    for token in article.split():
        word = token.translate(removals)
        # A stop word such as "don't" can never match once its apostrophe has been
        # deleted ("dont"), so the token is also compared before stripping: curly
        # apostrophes are normalized and only surrounding punctuation is trimmed.
        contraction = token.replace("’", "'").strip(punctuation)
        if not word or word in stop_words or contraction in stop_words:
            continue
        words.append(lemmatizer.lemmatize(word))

    return " ".join(words)

print("Cleaning articles...")
# Build a new frame instead of aliasing `dataset`: a plain `dataset_clean = dataset`
# shares the same object, so assigning the cleaned column would silently overwrite the
# raw text in `dataset` as well.
dataset_clean = dataset.assign(
    Article_Text=dataset["Article_Text"].apply(clean_article)
)
print(dataset_clean)

Cleaning articles...
                                          Article_Title  \
0     If anti-Sanders Democrats were serious, they’d...   
1     The Japanese prime minister is going to Pearl ...   
2     The best argument for each of the 2020 Democra...   
3     Instagram found a new place to show you ads: I...   
4                           My Week Without Apple Watch   
...                                                 ...   
6808  EpiPen Price Hikes Reportedly Added Millions T...   
6809  The First Year Home May Be Most Dangerous For ...   
6810  Mistakes That Fueled Ebola Spread Are Preventi...   
6811              Taylor Spear's GPS Guide On Self Care   
6812  Hacktivists Couldn't Have Pulled Off Cyber Att...   

                                           Article_Text         Publish_Date  \
0     democrats opposed to sen bernie sanders want y...  2020-02-29 20:00:00   
1     about six months ago president obama became th...  2016-12-27 14:20:01   
2     share all sharing option

In [26]:
# Write the cleaned dataset to a CSV file under PATH

print("Saving dataset to disk")
output_dir = Path(PATH)
output_dir.mkdir(parents=True, exist_ok=True)  # create the directory if it is missing
clean_csv = f"{PATH}dataset_clean.csv"
dataset_clean.to_csv(clean_csv, index=False)
print("Done")

Saving dataset to disk
Done
