Prepare dataset for news ML classifier.

# Extract news articles from the path

In [9]:
import pandas as pd

In [10]:
# Access the google drive account
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Set up Goggle drive folder path
busi_path = '/content/drive/MyDrive/Colab Notebooks/News_articles/business_articles.xlsx'
clim_path = '/content/drive/MyDrive/Colab Notebooks/News_articles/climate_change_articles.xlsx'
pol_path = '/content/drive/MyDrive/Colab Notebooks/News_articles/politics_articles.xlsx'
tech_path = '/content/drive/MyDrive/Colab Notebooks/News_articles/tech_articles.xlsx'
wld_path = '/content/drive/MyDrive/Colab Notebooks/News_articles/world_articles.xlsx'

In [12]:
# Read the excel file from the google drive for news articles
busi_art = pd.read_excel(busi_path)
clim_art = pd.read_excel(clim_path)
pol_art = pd.read_excel(pol_path)
tech_art = pd.read_excel(tech_path)
wld_art = pd.read_excel(wld_path)

In [13]:
# Add category column to each dataframe
busi_df = busi_art
busi_df['category'] = 'business'

clim_df = clim_art
clim_df['category'] = 'climate change'

pol_df = pol_art
pol_df["category"] = 'politics'

tech_df = tech_art
tech_df["category"] = 'technology'

wld_df = wld_art
wld_df["category"] = 'world'

# Expore the news articles based on their category

In [14]:
busi_df.columns

Index(['uri', 'lang', 'isDuplicate', 'date', 'time', 'dateTime', 'dateTimePub',
       'dataType', 'sim', 'url', 'title', 'body', 'source', 'authors', 'image',
       'eventUri', 'sentiment', 'wgt', 'relevance', 'sourceTitle', 'sourceUri',
       'category'],
      dtype='object')

## Explore business news articles

In [15]:
busi_df.loc[0:3, ["title", "body", "category", "sourceTitle"]]

Unnamed: 0,title,body,category,sourceTitle
0,Can China’s loan rule ‘trick’ help grease the ...,National Administration of Financial Regulatio...,business,South China Morning Post
1,State power: Xi calls for firmer government ha...,To ensure stability and security in decisive s...,business,South China Morning Post
2,How to fix China’s economy? Investors are unsu...,Investor sentiment around China’s prospects ha...,business,South China Morning Post
3,China spins up new party finance organ as prop...,"New commission, which began operations last mo...",business,South China Morning Post


## Explore climate change news articles

In [16]:
clim_df.loc[3:6, ["title", "body", "category", "sourceTitle"]]

Unnamed: 0,title,body,category,sourceTitle
3,UN chief tells world to 'get a grip' as report...,Governments' plans to limit climate change to ...,climate change,Sky News
4,"UN chief calls for an ""ambition supernova"" as ...",Countries´ climate action plans are still far ...,climate change,Daily Mail Online
5,Worsening warming is hurting people in all reg...,Revved-up climate change now permeates America...,climate change,Daily Mail Online
6,Italy's extreme drought mirrors climate in Eth...,"Climate change is causing ""hazard flips"", wher...",climate change,Sky News


## Explore politics news articles

In [17]:
pol_df.loc[10:13, ["title", "body", "category", "sourceTitle"]]

Unnamed: 0,title,body,category,sourceTitle
10,"Lee Atwater, Tough Ex-Head of GOP, Dies",Former Republican National Chairman Lee Atwate...,politics,Los Angeles Times
11,Opinion | This Is How the Republican Party Got...,"In 1969, a young aide in the Nixon White House...",politics,The New York Times
12,Confessions of a Brexit fanatic,{{ #verifyErrors }}{{ message }}{{ /verifyErro...,politics,The Independent
13,What? Where? When? A Brainy Russian Quiz Show ...,It's been on Russia's airwaves since the Sovie...,politics,RadioFreeEurope/RadioLiberty


## Explore technology news articles

In [18]:
tech_df.loc[20:23, ["title", "body", "category", "sourceTitle"]]

Unnamed: 0,title,body,category,sourceTitle
20,Tech Start-Ups Try to Sell a Cautious Pentagon...,"Reporting from Devils Lake, N.D., Denver and W...",technology,The New York Times
21,ANDREA MIOTTI: Leaders must resist siren calls...,"As I write, Silicon Valley tech leaders are fl...",technology,Daily Mail Online
22,The fight against fake photos: How Adobe is em...,A lawsuit filed by three visual artists agains...,technology,USA Today
23,Wednesday briefing: Inside the battle to conta...,In today's newsletter: As Rishi Sunak gets rea...,technology,The Guardian


## Explore world news articles

In [19]:
wld_df.loc[34:37, ["title", "body", "category", "sourceTitle"]]

Unnamed: 0,title,body,category,sourceTitle
34,China tells UK to stop using trade to improve ...,UK and Taiwan sign trade agreement they hailed...,world,The Guardian
35,Turkey is marking its centennial. But a brain ...,ISTANBUL (AP) - Huseyin Buyukdag says he loves...,world,Daily Mail Online
36,Fortune 500 Europe: Which company is the bigge...,Introducing a new ranking of Europe's ever-cha...,world,Euronews English
37,Go off-piste in Europe at these 13 wild skiing...,"From Albania to Poland, Europe has an abundanc...",world,Euronews English


# Combine each news category as a single dataset

In [20]:
news_art_df = pd.concat([busi_df,
                         clim_df,
                         pol_df,
                         tech_df,
                         wld_df],
                        axis = 0, ignore_index = True)

In [21]:
news_art_df.head()

Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,...,source,authors,image,eventUri,sentiment,wgt,relevance,sourceTitle,sourceUri,category
0,7816062173,eng,False,2023-11-03,09:25:06,2023-11-03T09:25:06Z,2023-11-03T09:24:15Z,news,0.552941,https://www.scmp.com/economy/china-economy/art...,...,"{'uri': 'scmp.com', 'dataType': 'news', 'title...",[],https://cdn.i-scmp.com/sites/default/files/sty...,eng-9028689,,100,100,South China Morning Post,scmp.com,business
1,7824577130,eng,False,2023-11-08,12:02:20,2023-11-08T12:02:20Z,2023-11-08T12:00:30Z,news,0.0,https://www.scmp.com/economy/china-economy/art...,...,"{'uri': 'scmp.com', 'dataType': 'news', 'title...",[],https://cdn.i-scmp.com/sites/default/files/sty...,,0.270588,76,76,South China Morning Post,scmp.com,business
2,7814197764,eng,False,2023-11-02,09:01:43,2023-11-02T09:01:43Z,2023-11-02T09:00:21Z,news,0.0,https://www.scmp.com/comment/opinion/article/3...,...,"{'uri': 'scmp.com', 'dataType': 'news', 'title...",[],https://cdn.i-scmp.com/sites/default/files/sty...,,0.098039,76,76,South China Morning Post,scmp.com,business
3,7783150762,eng,False,2023-10-13,22:02:16,2023-10-13T22:02:16Z,2023-10-13T22:00:08Z,news,0.0,https://www.scmp.com/economy/china-economy/art...,...,"{'uri': 'scmp.com', 'dataType': 'news', 'title...",[],https://cdn.i-scmp.com/sites/default/files/sty...,,0.184314,76,76,South China Morning Post,scmp.com,business
4,7825182017,eng,False,2023-11-08,18:06:38,2023-11-08T18:06:38Z,2023-11-08T17:54:52Z,news,0.792157,https://www.dailymail.co.uk/wires/ap/article-1...,...,"{'uri': 'dailymail.co.uk', 'dataType': 'news',...",[],https://i.dailymail.co.uk/1s/2023/11/08/17/wir...,eng-9042157,,51,51,Daily Mail Online,dailymail.co.uk,business


In [22]:
news_art_df.tail()

Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,...,source,authors,image,eventUri,sentiment,wgt,relevance,sourceTitle,sourceUri,category
15773,7818994285,eng,False,2023-11-05,08:29:55,2023-11-05T08:29:55Z,2023-11-05T08:22:43Z,news,0.658824,https://www.rferl.org/a/moldova-local-election...,...,"{'uri': 'rferl.org', 'dataType': 'news', 'titl...",[],https://gdb.rferl.org/5C1DE69C-336E-4163-B8A8-...,eng-9026106,,100,100,RadioFreeEurope/RadioLiberty,rferl.org,world
15774,7818990653,eng,False,2023-11-05,08:27:12,2023-11-05T08:27:12Z,2023-11-05T08:19:59Z,news,0.768627,https://www.theguardian.com/world/live/2023/no...,...,"{'uri': 'theguardian.com', 'dataType': 'news',...","[{'uri': 'jem_bartholomew@theguardian.com', 'n...",https://i.guim.co.uk/img/media/64c03ff6abc8688...,eng-9033413,-0.184314,100,100,The Guardian,theguardian.com,world
15775,7818984582,eng,False,2023-11-05,08:21:56,2023-11-05T08:21:56Z,2023-11-05T08:09:12Z,news,0.705882,https://www.dailymail.co.uk/wires/ap/article-1...,...,"{'uri': 'dailymail.co.uk', 'dataType': 'news',...",[],,eng-9026106,-0.058824,100,100,Daily Mail Online,dailymail.co.uk,world
15776,7818983414,eng,False,2023-11-05,08:19:26,2023-11-05T08:19:26Z,2023-11-05T08:18:53Z,news,0.0,https://www.scmp.com/lifestyle/arts-culture/ar...,...,"{'uri': 'scmp.com', 'dataType': 'news', 'title...",[],,,-0.137255,100,100,South China Morning Post,scmp.com,world
15777,7818935168,eng,False,2023-11-05,07:32:50,2023-11-05T07:32:50Z,2023-11-05T07:30:35Z,news,0.878431,https://www.euronews.com/2023/11/05/war-in-ukr...,...,"{'uri': 'euronews.com', 'dataType': 'news', 't...",[],https://static.euronews.com/articles/stories/0...,eng-9032512,-0.254902,100,100,Euronews English,euronews.com,world


# Cleanning and pre-processing

In [24]:
news_df = news_art_df[["title", "body", "category"]]

In [25]:
news_df.head(5)

Unnamed: 0,title,body,category
0,Can China’s loan rule ‘trick’ help grease the ...,National Administration of Financial Regulatio...,business
1,State power: Xi calls for firmer government ha...,To ensure stability and security in decisive s...,business
2,How to fix China’s economy? Investors are unsu...,Investor sentiment around China’s prospects ha...,business
3,China spins up new party finance organ as prop...,"New commission, which began operations last mo...",business
4,Commercial fishing groups sue 13 US tire maker...,"TACOMA, Wash. (AP) - The 13 largest U.S. tire ...",business


## Removing the stopwords.

In [33]:
import re
from nltk.corpus import stopwords

def standardize_sentences(sentences):

  clean_sentences = []
  stop_words = set(stopwords.words('english'))

  for sentence in sentences:
    lower_text = sentence.lower() # Lowercasing text

    # regular expression for symbols
    reg_exp_text = re.sub('[^a-zA-Z]', ' ', lower_text)

    # remove stopwords
    clean_sentence = ' '.join(word for word in reg_exp_text.split() if word
                               not in stop_words)

    # append clean sentences
    clean_sentences.append(clean_sentence)

  return clean_sentences

In [34]:
# Extract specific sentences to clean
body = news_df["body"]

# Call the function to standardise the above text
clean_body = standardize_sentences(body)

In [36]:
news_df['clean_body'] = clean_body
news_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['clean_body'] = clean_body


Unnamed: 0,title,body,category,clean_body
0,Can China’s loan rule ‘trick’ help grease the ...,National Administration of Financial Regulatio...,business,national administration financial regulation l...
1,State power: Xi calls for firmer government ha...,To ensure stability and security in decisive s...,business,ensure stability security decisive sectors xi ...
2,How to fix China’s economy? Investors are unsu...,Investor sentiment around China’s prospects ha...,business,investor sentiment around china prospects rema...
3,China spins up new party finance organ as prop...,"New commission, which began operations last mo...",business,new commission began operations last month par...
4,Commercial fishing groups sue 13 US tire maker...,"TACOMA, Wash. (AP) - The 13 largest U.S. tire ...",business,tacoma wash ap largest u tire manufacturers fa...


## Labelling the news category

In [37]:
from sklearn.preprocessing import LabelEncoder

# Simplified dataframe for ML task
news_ML_df = news_df[["clean_body", "category"]]

# Extract categories
categories = news_ML_df['category']

# Call encoder for labelling the numeric classification
label_encoder = LabelEncoder()

# Covert categort into numeric categories
numeric_cat = label_encoder.fit_transform(categories)

# Mapping original and numeric categories
category_mapping = dict(zip(label_encoder.classes_,
                            label_encoder.transform(label_encoder.classes_)))
mapping_df = pd.DataFrame(list(category_mapping.items()),
                          columns = ['original category',
                                     'numeric category'])

# Replacing the original category
news_ML_df['category'] = numeric_cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_ML_df['category'] = numeric_cat


In [38]:
# show the mapping result of the category
print('Show the original categories and their cooresponding numeric \
value:')
print('')
print(mapping_df)

Show the original categories and their cooresponding numeric value:

  original category  numeric category
0          business                 0
1    climate change                 1
2          politics                 2
3        technology                 3
4             world                 4


In [39]:
# show the
news_ML_df.head(3)

Unnamed: 0,clean_body,category
0,national administration financial regulation l...,0
1,ensure stability security decisive sectors xi ...,0
2,investor sentiment around china prospects rema...,0


## Tokenization

Next, it will create two columns "tokenized_body" and "freq_tokens".

"tokenized_body" column tokenize all the words from the column "clean_body".

"freq_tokens" column tokenize the most frequent words from the column "clean_body".

In [40]:
# Tokenize the body
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

nltk.download('punkt')

# Tokenize the body
news_ML_df['tokenized_body'] = news_ML_df['clean_body'].apply(word_tokenize)


# Calculate the frequency of each token
all_tokens = [token for tokens in news_ML_df['tokenized_body'] for
              token in tokens]
freq_dist = FreqDist(all_tokens)

# Define the number of most frequent words for tokenization
n_freq_words = 1000

# Get the most frequent tokens
freq_tokens = set(token for token, freq in freq_dist.most_common(n_freq_words))

# Keep the most frequent tokens
news_ML_df['freq_tokens'] = news_ML_df['tokenized_body'].apply(lambda tokens:
                            [token for token in tokens if token in freq_tokens])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_ML_df['tokenized_body'] = news_ML_df['clean_body'].apply(word_tokenize)


In [41]:
news_ML_df.head(3)

Unnamed: 0,clean_body,category,tokenized_body,freq_tokens
0,national administration financial regulation l...,0,"[national, administration, financial, regulati...","[national, administration, financial, risk, ba..."
1,ensure stability security decisive sectors xi ...,0,"[ensure, stability, security, decisive, sector...","[ensure, security, xi, state, firms, market, a..."
2,investor sentiment around china prospects rema...,0,"[investor, sentiment, around, china, prospects...","[around, china, despite, significant, increase..."


In [42]:
tokenized_df = news_ML_df
tokenized_df['freq_tokens'] = tokenized_df['freq_tokens'].apply(' '.join)

In [43]:
tokenized_df["tokenized_body"] = tokenized_df['tokenized_body'].apply(' '.join)
tokenized_df["tokenized_body"]

0        national administration financial regulation l...
1        ensure stability security decisive sectors xi ...
2        investor sentiment around china prospects rema...
3        new commission began operations last month par...
4        tacoma wash ap largest u tire manufacturers fa...
                               ...                        
15773    chisinau moldovan voting november elect mayors...
15774    russia says one ship damaged ukrainian attack ...
15775    chisinau moldova ap moldovans casting ballots ...
15776    george orwell library set city ivanovo counter...
15777    russian attacks ukraine wounded least civilian...
Name: tokenized_body, Length: 15778, dtype: object

In [44]:
# Show the tokenized dataframe
tokenized_df.head()

Unnamed: 0,clean_body,category,tokenized_body,freq_tokens
0,national administration financial regulation l...,0,national administration financial regulation l...,national administration financial risk banks a...
1,ensure stability security decisive sectors xi ...,0,ensure stability security decisive sectors xi ...,ensure security xi state firms market access l...
2,investor sentiment around china prospects rema...,0,investor sentiment around china prospects rema...,around china despite significant increase poli...
3,new commission began operations last month par...,0,new commission began operations last month par...,new commission began operations last month par...
4,tacoma wash ap largest u tire manufacturers fa...,0,tacoma wash ap largest u tire manufacturers fa...,ap largest u facing california could force com...


## Vectorization

Preparing data for a machine learning classifier by converting the
tokenized body into a numeric value

## Create bag of words (BoW)

In [52]:
from sklearn.feature_extraction.text import CountVectorizer

# Extract tokenized text
tokenized_text = tokenized_df["tokenized_body"]

# Create a vectorizer
vectorizer = CountVectorizer()

# Fit and transfrom the tokenized text
X1_count = vectorizer.fit_transform(tokenized_text)

In [46]:
# Extract the most frequent tokens
freq_tokens_text = tokenized_df["freq_tokens"]

# Fit and transform the most frequent tokens
X2_count = vectorizer.fit_transform(freq_tokens_text)

## TF-IDF

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()


# Fit and transform the test
X1_tfidf = tfidf_vectorizer.fit_transform(tokenized_text)
X2_tfidf = tfidf_vectorizer.fit_transform(freq_tokens_text)

In [None]:
import pandas as pd
X1_count_df = pd.DataFrame(X1_count.toarray(), columns = vectorizer.get_feature_names_out())

In [None]:
vectorised_df = pd.cona