### Data Preprocessing

#### Data Cleaning 

Select the Columns that relevant to the scopes

In [1]:
import pandas as pd

# load the dataset
df = pd.read_csv('cnn_news_articles_final.csv')

# select the selected columns
df = df[['Date published', 'Category', 'Headline', 'Article text']]

Renamned columns

In [2]:
# Rename the columns
df = df.rename(columns={
    'Date published': 'date published',
    'Category': 'category',
    'Headline': 'headline',
    'Article text': 'text'
})

Drop N/A (Except Date Published)

In [3]:
# drop missing values
print("Before Drop: ")
print(df.isnull().sum())

df = df.dropna(subset=['text'])

print("\nAfter Drop: ")
print(df.isnull().sum())

Before Drop: 
date published    3928
category             0
headline             0
text                 9
dtype: int64

After Drop: 
date published    3928
category             0
headline             0
text                 0
dtype: int64


Drop Duplicated (Just Incase)

In [4]:
# check duplicated
print('Total Duplicated:', df.duplicated().sum())

# review duplicated
df[df.duplicated()]

Total Duplicated: 0


Unnamed: 0,date published,category,headline,text


In [5]:
# drop duplicates
df = df.drop_duplicates()

Remove Outliers in Category column

In [6]:
# value counts Category
category_counts = df['category'].value_counts()
print("Before Drop: ")
print(category_counts)

# list of categories to remove
categories_to_remove = ['travel', 'style', 'cnn', 'vr']

# remove rows with specified categories
df = df[~df['category'].isin(categories_to_remove)]

# value counts Category column after removed
category_counts_after_removal = df['category'].value_counts()
print("\nAfter Drop: ")
print(category_counts_after_removal)

Before Drop: 
category
news             18069
sport            15788
politics          3026
business          1775
health            1037
world              820
entertainment      557
us                 317
opinions           280
weather            115
travel              52
style               16
cnn                 11
vr                   5
Name: count, dtype: int64

After Drop: 
category
news             18069
sport            15788
politics          3026
business          1775
health            1037
world              820
entertainment      557
us                 317
opinions           280
weather            115
Name: count, dtype: int64


Replacing the 3928 missing date data (Metadata given is 2023-01-01 to 2023-10-06)

In [7]:
from datetime import datetime

# sequential dates from 2023-01-01 to 2023-10-06
start_date = datetime.strptime('2023-01-01', '%Y-%m-%d')
end_date = datetime.strptime('2023-10-06', '%Y-%m-%d')
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# convert to datetime format
df['date published'] = pd.to_datetime(df['date published'], errors='coerce')

# find where dates are missing
missing_indices = df[df['date published'].isna()].index

# count number of missing dates
num_missing = len(missing_indices)

# count number of dates in the range
num_dates = len(date_range)

# count how many times each date should be repeated
repeats_per_date = num_missing // num_dates
extra_repeats = num_missing % num_dates

# create list to fill the missing dates
fill_dates = []
for date in date_range:
    fill_dates.extend([date] * repeats_per_date)

# assign the extra repeats to the first few dates
for i in range(extra_repeats):
    fill_dates.append(date_range[i])

# sort the dates from 2023-01-01 to 2023-10-06
fill_dates = sorted(fill_dates)

# start filling
for i, index in enumerate(missing_indices):
    df.at[index, 'date published'] = fill_dates[i]

Last check after cleaning

In [8]:
# last check missing values 
print(df.isnull().sum())

# last check duplicated
print(df.duplicated().sum())

# dimensions 
print(df.shape)

date published    0
category          0
headline          0
text              0
dtype: int64
0
(41784, 4)


#### Text Cleaning

In [9]:
# lowercase text 
def lowercase_text(text):
    return text.lower()

# change apostrophe 
def replace_apostrophe(text):
    return text.replace("’", "'")

df[['text', 'headline']] = df[['text', 'headline']].applymap(lowercase_text)
df[['text', 'headline']] = df[['text', 'headline']].applymap(replace_apostrophe)
df.head(10)

  df[['text', 'headline']] = df[['text', 'headline']].applymap(lowercase_text)
  df[['text', 'headline']] = df[['text', 'headline']].applymap(replace_apostrophe)


Unnamed: 0,date published,category,headline,text
0,2021-07-15 02:46:59,news,"there's a shortage of truckers, but tusimple t...","(cnn)right now, there's a shortage of truck d..."
1,2021-05-12 07:52:09,news,bioservo's robotic 'ironhand' could protect fa...,(cnn)working in a factory or warehouse can me...
2,2021-06-16 02:51:30,news,this swarm of robots gets smarter the more it ...,"(cnn)in a hong kong warehouse, a swarm of aut..."
3,2022-03-18 14:37:21,business,"two years later, remote work has changed milli...",the pandemic thrust the working world into a n...
4,2022-03-19 11:41:08,business,why march is so volatile for stocks - cnn,new york (cnn business)march madness isn't jus...
5,2022-03-20 11:36:43,business,stocks week ahead: big oil rakes in billions a...,a version of this story first appeared in cnn ...
6,2022-03-18 14:26:26,business,oil 'emergency': work from home and drive slow...,new york (cnn business)governments around the ...
7,2022-03-20 12:57:36,business,opinion: technology is transforming the nature...,this interview has been edited from its origin...
8,2022-03-18 17:14:11,business,inflation is everywhere. except your cell phon...,new york (cnn business)inflation is everywhere...
9,2022-03-18 11:32:30,business,burger king partner 'refuses' to close 800 rus...,new york (cnn business)burger king is trying t...


In [10]:
import wordninja

# Word Segmentation on text column
df['headline'] = df['headline'].apply(lambda x: ' '.join(wordninja.split(x)))
df['text'] = df['text'].apply(lambda x: ' '.join(wordninja.split(x)))
df.head(10)

Unnamed: 0,date published,category,headline,text
0,2021-07-15 02:46:59,news,there's a shortage of truckers but tu simple t...,cnn right now there's a shortage of truck driv...
1,2021-05-12 07:52:09,news,bio servo's robotic ' iron hand ' could protec...,cnn working in a factory or warehouse can mean...
2,2021-06-16 02:51:30,news,this swarm of robots gets smarter the more it ...,cnn in a hong kong warehouse a swarm of autono...
3,2022-03-18 14:37:21,business,two years later remote work has changed millio...,the pandemic thrust the working world into a n...
4,2022-03-19 11:41:08,business,why march is so volatile for stocks cnn,new york cnn business march madness isn't just...
5,2022-03-20 11:36:43,business,stocks week ahead big oil rakes in billions as...,a version of this story first appeared in cnn ...
6,2022-03-18 14:26:26,business,oil ' emergency ' work from home and drive slo...,new york cnn business governments around the w...
7,2022-03-20 12:57:36,business,opinion technology is transforming the nature ...,this interview has been edited from its origin...
8,2022-03-18 17:14:11,business,inflation is everywhere except your cell phone...,new york cnn business inflation is everywhere ...
9,2022-03-18 11:32:30,business,burger king partner ' refuses ' to close 800 r...,new york cnn business burger king is trying to...


In [11]:
import contractions

# expand the words like cnnt to cannot / ve to have and so on
def expand_contractions(text):

    expanded_text = contractions.fix(text)
    return expanded_text

df['headline'] = df['headline'].apply(expand_contractions)
df['text'] = df['text'].apply(expand_contractions)
df.head(10)

Unnamed: 0,date published,category,headline,text
0,2021-07-15 02:46:59,news,there is a shortage of truckers but tu simple ...,cnn right now there is a shortage of truck dri...
1,2021-05-12 07:52:09,news,bio servo's robotic ' iron hand ' could protec...,cnn working in a factory or warehouse can mean...
2,2021-06-16 02:51:30,news,this swarm of robots gets smarter the more it ...,cnn in a hong kong warehouse a swarm of autono...
3,2022-03-18 14:37:21,business,two years later remote work has changed millio...,the pandemic thrust the working world into a n...
4,2022-03-19 11:41:08,business,why march is so volatile for stocks cnn,new york cnn business march madness is not jus...
5,2022-03-20 11:36:43,business,stocks week ahead big oil rakes in billions as...,a version of this story first appeared in cnn ...
6,2022-03-18 14:26:26,business,oil ' emergency ' work from home and drive slo...,new york cnn business governments around the w...
7,2022-03-20 12:57:36,business,opinion technology is transforming the nature ...,this interview has been edited from its origin...
8,2022-03-18 17:14:11,business,inflation is everywhere except your cell phone...,new york cnn business inflation is everywhere ...
9,2022-03-18 11:32:30,business,burger king partner ' refuses ' to close 800 r...,new york cnn business burger king is trying to...


In [12]:
import emoji
import re
from emoticons_lib import emoticons_lib

# convert emoticon with text   { :), :( }
def convert_emojis_with_text(text):
    emoticon_pattern = re.compile('|'.join(re.escape(emoticon) for emoticon in emoticons_lib.keys()))
    return emoticon_pattern.sub(lambda match: emoticons_lib[match.group(0)], text)

# list of columns to apply the functions
columns_to_transform = ['text', 'headline']

# apply the functions to both columns
for column in columns_to_transform:
    df[column] = df[column].apply(lambda e: convert_emojis_with_text(e))

    # convert emojis to text
    df[column] = df[column].apply(lambda e: emoji.demojize(e, language="en"))

    # replace underscore with whitespace (caused by emojis text like 😊 convert to :smiling_smiling_face ）
    df[column] = df[column].apply(lambda e: e.replace('_', ' '))

    # remove semicolon with whitespace (caused by emojis text)
    df[column] = df[column].apply(lambda e: e.replace(':', ' '))


In [13]:
# remove 'cnn' word in text column
def remove_cnn(text):
    return text.replace('cnn', '')

df['headline'] = df['headline'].apply(remove_cnn)
df['text'] = df['text'].apply(remove_cnn)
df.head(10)

Unnamed: 0,date published,category,headline,text
0,2021-07-15 02:46:59,news,there is a shortage of truckers but tu simple ...,right now there is a shortage of truck driver...
1,2021-05-12 07:52:09,news,bio servo's robotic ' iron hand ' could protec...,working in a factory or warehouse can mean do...
2,2021-06-16 02:51:30,news,this swarm of robots gets smarter the more it ...,in a hong kong warehouse a swarm of autonomou...
3,2022-03-18 14:37:21,business,two years later remote work has changed millio...,the pandemic thrust the working world into a n...
4,2022-03-19 11:41:08,business,why march is so volatile for stocks,new york business march madness is not just f...
5,2022-03-20 11:36:43,business,stocks week ahead big oil rakes in billions as...,a version of this story first appeared in bus...
6,2022-03-18 14:26:26,business,oil ' emergency ' work from home and drive slo...,new york business governments around the worl...
7,2022-03-20 12:57:36,business,opinion technology is transforming the nature ...,this interview has been edited from its origin...
8,2022-03-18 17:14:11,business,inflation is everywhere except your cell phone...,new york business inflation is everywhere gro...
9,2022-03-18 11:32:30,business,burger king partner ' refuses ' to close 800 r...,new york business burger king is trying to su...


In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# remove words =  or > 2 characters
def remove_short_words(text):
    return ' '.join([word for word in text.split() if len(word) > 2])

# remove symbols
def remove_symbols(text):
    symbol_pattern = re.compile(r'[\(\)\[\]:]')
    return symbol_pattern.sub('', text)

# remove symbols and digits
def remove_symbols_digits(text):
    return re.sub('[^a-zA-Z\s]', ' ', text)

# remove URLs
def remove_urls(text):
    return re.sub(r'http\S+', '', text)

# remove HTML tags
def remove_html_tags(text):
    return re.sub(r'<[^>]+>', '', text)

# remove extra whitespace
def remove_whitespace(text):
    return ' '.join(text.split())

# remove punctuation
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([token for token in text.split() if token.lower() not in stop_words])

# lemmatizing text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(token) for token in text.split()])

# list of columns to apply the functions
columns_to_transform = ['text', 'headline']

# apply the functions to both columns
for column in columns_to_transform:
    df[column] = df[column].apply(remove_short_words)
    df[column] = df[column].apply(remove_symbols)
    df[column] = df[column].apply(remove_symbols_digits)
    df[column] = df[column].apply(remove_urls)
    df[column] = df[column].apply(remove_html_tags)
    df[column] = df[column].apply(remove_whitespace)
    df[column] = df[column].apply(remove_punctuation)
    df[column] = df[column].apply(remove_stopwords)
    df[column] = df[column].apply(lemmatize_text)

df.head(10)

  return re.sub('[^a-zA-Z\s]', ' ', text)
[nltk_data] Downloading package stopwords to C:\Users\Joe
[nltk_data]     Chok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Joe
[nltk_data]     Chok\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,date published,category,headline,text
0,2021-07-15 02:46:59,news,shortage trucker simple think solution driver ...,right shortage truck driver worldwide exacerba...
1,2021-05-12 07:52:09,news,bio servo robotic iron hand could protect fact...,working factory warehouse mean task repetition...
2,2021-06-16 02:51:30,news,swarm robot get smarter work,hong kong warehouse swarm autonomous robot wor...
3,2022-03-18 14:37:21,business,two year later remote work changed million career,pandemic thrust working world new reality marc...
4,2022-03-19 11:41:08,business,march volatile stock,new york business march madness college basket...
5,2022-03-20 11:36:43,business,stock week ahead big oil rake billion price so...,version story first appeared business bell new...
6,2022-03-18 14:26:26,business,oil emergency work home drive slower iea say,new york business government around world must...
7,2022-03-20 12:57:36,business,opinion technology transforming nature money a...,interview edited original version originally p...
8,2022-03-18 17:14:11,business,inflation everywhere except cell phone bill,new york business inflation everywhere grocery...
9,2022-03-18 11:32:30,business,burger king partner refuse close russian location,new york business burger king trying suspend o...


In [16]:
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import nltk
nltk.download('words')
nltk.download('punkt')

# tokenize text
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# remove any non-English words
english_words = set(words.words()) 
def remove_non_english(tokens):
    english_tokens = []
    for word in tokens:
        if word in english_words:
            english_tokens.append(word)
        else:
            english_tokens.append('')
    return [token for token in english_tokens if token != '']

# apply the functions to both columns
def preprocess_column(column):
    column = column.apply(tokenize_text)
    column = column.apply(remove_non_english)
    column = column.apply(lambda tokens: ' '.join(tokens))
    return column

# apply the preprocessing to both columns
df['text'] = preprocess_column(df['text'])
df['headline'] = preprocess_column(df['headline'])

df.head(10)

[nltk_data] Downloading package words to C:\Users\Joe
[nltk_data]     Chok\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Joe
[nltk_data]     Chok\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,date published,category,headline,text
0,2021-07-15 02:46:59,news,shortage trucker simple think solution driver,right shortage truck driver commerce boom brou...
1,2021-05-12 07:52:09,news,servo iron hand could protect factory worker i...,working factory warehouse mean task repetition...
2,2021-06-16 02:51:30,news,swarm robot get work,hong warehouse swarm autonomous robot work wor...
3,2022-03-18 14:37:21,business,two year later remote work million career,pandemic thrust working world new reality marc...
4,2022-03-19 11:41:08,business,march volatile stock,new york business march madness college basket...
5,2022-03-20 11:36:43,business,stock week ahead big oil rake billion price so...,version story first business bell newsletter s...
6,2022-03-18 14:26:26,business,oil emergency work home drive say,new york business government around world must...
7,2022-03-20 12:57:36,business,opinion technology transforming nature money a...,interview original version originally entirety...
8,2022-03-18 17:14:11,business,inflation everywhere except cell phone bill,new york business inflation everywhere grocery...
9,2022-03-18 11:32:30,business,king partner refuse close location,new york business king trying suspend operatio...


In [17]:
# check missing values 
print(df.isnull().sum())

date published    0
category          0
headline          0
text              0
dtype: int64


In [18]:
# check duplicated
print(df.duplicated().sum())

0


In [19]:
# check empty string in text column
df[df['text'] == '']

Unnamed: 0,date published,category,headline,text
4506,2017-07-05 17:32:27,news,quiz legit quit,
19168,2015-06-06 17:29:10,sport,champion league live barcelona,
19960,2015-11-20 17:10:22,sport,real barcelona live,
20062,2015-11-06 17:06:21,sport,live premier league football,
21019,2016-05-28 15:58:43,sport,champion league final live,
21093,2016-05-18 10:22:35,news,referendum next,
25303,2018-02-25 00:07:08,sport,winter day result live update,
25311,2018-02-23 00:00:16,sport,day result live update,
25312,2018-02-23 23:59:15,sport,winter day result live update,
25325,2018-02-21 23:59:26,sport,day result live update,


In [20]:
# check empty string in headline column
df[df['headline'] == '']

Unnamed: 0,date published,category,headline,text
863,2020-01-23 17:36:04,business,,outskirt one best medieval city one world mode...
2368,2021-01-22 11:25:25,news,,series essay distance lake telling story pande...
2499,2015-02-27 18:56:55,news,,story highlight gene gene may star trek charac...
4359,2014-02-19 14:44:18,health,,story highlight conflicting message regarding ...
4508,2017-05-16 17:43:26,entertainment,,story highlight show ran nine season original ...
...,...,...,...,...
31906,2020-01-31 18:49:17,sport,,start thrilling new rivalry sign good rookie p...
32895,2020-07-23 08:08:27,sport,,supposed week celebration japan would opening ...
35298,2021-05-07 12:29:03,news,,version story may edition royal news weekly di...
36053,2021-07-02 15:07:53,news,,look life president personal birth date august...


In [21]:
# filter out empty string for both columns
df = df[df['text'] != '']
df = df[df['headline'] != '']

Last Check

In [22]:
print(df.isnull().sum())

date published    0
category          0
headline          0
text              0
dtype: int64


In [23]:
print(df.duplicated().sum())

0


In [24]:
df[df['text'] == '']

Unnamed: 0,date published,category,headline,text


In [25]:
df[df['headline'] == '']

Unnamed: 0,date published,category,headline,text


In [26]:
df.shape

(41688, 4)

In [27]:
df.to_csv("cnn_news_articles_final_cleaned.csv", index=False)