### Part 2: ML pipeline

In [8]:
import numpy as np
import pandas as pd
import re

from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [9]:
df = pd.read_csv('../input_data/news_paper_data.csv')

In [10]:
df.head()

Unnamed: 0,node_id,title,date,url,category,article
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro..."


In [11]:
# Data Cleaning 
def cleaning_text(text):
    text = text.lower().replace('\n', ' ').replace('\r', ' ').strip()
    
    # remove extra spaces
    text = re.sub(' +', ' ', text)
    
    # only took words
    text = re.sub(r'[^\w\s]', '', text)
    
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    
    clean_text = [w for w in word_tokens if w not in stop_words]
    
    return ' '.join(clean_text)

In [35]:
df['clean_data'] = df['article'].apply(cleaning_text)

df['remove_count']  = df.apply(lambda x: str(f'{round(((len(x.article) - len(x.clean_data)) * 100 / (len(x.article))), 2)} %'), axis = 1)

In [36]:
df.head()

Unnamed: 0,node_id,title,date,url,category,article,clean_data,remove_count
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...,directorate national consumer rights protectio...,28.58 %
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...,shares chittagong stock exchange cse witnessed...,29.55 %
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...,clients bangladesh development bank bdb use ad...,30.86 %
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...,leaders bangladesh textile mills association b...,20.92 %
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro...",bangladesh bank today unveiled tk 25000 crore ...,28.67 %


In [39]:
#checking
#print(len(df.article[0]), len(df.clean_data[0]))

In [40]:
df.tail()

Unnamed: 0,node_id,title,date,url,category,article,clean_data,remove_count
320,3075121,National dance-drama festival begins at BSA today,20-07-2022,/entertainment/theatre-arts/news/national-danc...,entertainment,A three-day-long national dance-drama festival...,threedaylong national dancedrama festival begu...,26.1 %
321,3075111,The Colors of Youth: celebrating art across co...,20-07-2022,/entertainment/theatre-arts/news/the-colors-yo...,entertainment,"""The Colors of Youth"", an exhibition run by Ab...",colors youth exhibition run abu subhan welfare...,25.24 %
322,3074206,Portraying the bucolic beauty of Bangladesh,21-07-2022,/entertainment/theatre-arts/news/portraying-th...,entertainment,Renowned artist Samar Majumder's third solo ex...,renowned artist samar majumders third solo exh...,22.93 %
323,3074006,Prachyanat to stage ‘Dumurkheko Manush’ today,19-07-2022,/entertainment/theatre-arts/news/prachyanat-st...,entertainment,The certificate distribution ceremony for the ...,certificate distribution ceremony 42nd batch p...,25.8 %
324,3066871,‘Beckoning Horizon’: Azmeer Hossain’s exhibit...,08-07-2022,/entertainment/theatre-arts/news/beckoning-hor...,entertainment,"Azmeer Hossain's fifth solo exhibition ""Beckon...",azmeer hossains fifth solo exhibition beckonin...,30.19 %


In [6]:
label_encoder = preprocessing.LabelEncoder()
df['category_label'] = label_encoder.fit_transform(df['category'])

In [7]:
df.head()

Unnamed: 0,node_id,title,date,url,category,article,clean_data,category_label
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...,directorate national consumer rights protectio...,0
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...,shares chittagong stock exchange cse witnessed...,0
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...,clients bangladesh development bank bdb use ad...,0
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...,leaders bangladesh textile mills association b...,0
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro...",bangladesh bank today unveiled tk 25000 crore ...,0


In [42]:
# categorywise count
df['category'].value_counts()

sports           132
business          98
entertainment     95
Name: category, dtype: int64

In [44]:
# test train split
X_train, X_test, Y_train, Y_test = train_test_split(df['clean_data'],
                                                   df['category'], 
                                                   test_size = 0.2,
                                                   random_state = 8)

In [45]:
print(f'Shape of X: {X_train.shape} and {X_test.shape}')
print(f'Shape of Y: {Y_train.shape} and {Y_test.shape}')

Shape of X: (260,) and (65,)
Shape of Y: (260,) and (65,)


In [47]:
ngram_range = (1, 2)
min_df = 10
max_df = 1.
max_features = 65

tfidf = TfidfVectorizer(encoding='utf-8',
                         ngram_range = ngram_range,
                         min_df = min_df, 
                         max_df = max_df,
                         norm = 'l2',
                         max_features = max_features,
                         stop_words = None,
                         lowercase = False,
                         sublinear_tf = True)

feature_train = tfidf.fit_transform(X_train).toarray()
label_train = Y_train

feature_test = tfidf.fit_transform(X_test).toarray()
label_test = Y_test

print(f'Size of features object: {feature_train.shape} and {feature_test.shape}')

Size of features object: (260, 65) and (65, 65)


In [48]:
# Make random forest classifier
model = RandomForestClassifier()

# fit the model
model.fit(feature_train, label_train)

# get prediction
model_prediction = model.predict(feature_test)

# calculate accuracy
accuracy_val = accuracy_score(label_test, model_prediction)

print(f'Accuracy of the algorithm: {accuracy_val*100} %')

Accuracy of the algorithm: 49.23076923076923 %




In [49]:
summary = classification_report(label_test, model_prediction)
print(f'Summary of the model prediction: \n{summary}')

Summary of the model prediction: 
               precision    recall  f1-score   support

     business       0.65      0.69      0.67        16
entertainment       0.44      0.30      0.36        23
       sports       0.44      0.54      0.48        26

    micro avg       0.49      0.49      0.49        65
    macro avg       0.51      0.51      0.50        65
 weighted avg       0.49      0.49      0.48        65

