In [109]:
import numpy as np
import pandas as pd
import re

from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [83]:
df = pd.read_csv('../input_data/news_paper_data.csv')

In [86]:
df.head()

Unnamed: 0,node_id,title,date,url,category,article
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro..."


In [87]:
# Data Cleaning 
def cleaning_text(text):
    text = text.lower().replace('\n', ' ').replace('\r', ' ').strip()
    
    # remove extra spaces
    text = re.sub(' +', ' ', text)
    
    # only took words
    text = re.sub(r'[^\w\s]', '', text)
    
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    
    clean_text = [w for w in word_tokens if w not in stop_words]
    
    return ' '.join(clean_text)

In [88]:
df['clean_data'] = df['article'].apply(cleaning_text)

df.head()

Unnamed: 0,node_id,title,date,url,category,article,clean_data
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...,directorate national consumer rights protectio...
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...,shares chittagong stock exchange cse witnessed...
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...,clients bangladesh development bank bdb use ad...
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...,leaders bangladesh textile mills association b...
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro...",bangladesh bank today unveiled tk 25000 crore ...


In [89]:
label_encoder = preprocessing.LabelEncoder()
df['category_label'] = label_encoder.fit_transform(df['category'])

In [90]:
df.head()

Unnamed: 0,node_id,title,date,url,category,article,clean_data,category_label
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...,directorate national consumer rights protectio...,0
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...,shares chittagong stock exchange cse witnessed...,0
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...,clients bangladesh development bank bdb use ad...,0
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...,leaders bangladesh textile mills association b...,0
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro...",bangladesh bank today unveiled tk 25000 crore ...,0


In [91]:
len(df)

164

In [95]:
# test train split
X_train, X_test, Y_train, Y_test = train_test_split(df['clean_data'],
                                                   df['category_label'], 
                                                   test_size = 0.2,
                                                   random_state = 8)

In [93]:
print(f'Shape of X: {X_train.shape} and {X_test.shape}')
print(f'Shape of Y: {Y_train.shape} and {Y_test.shape}')

Shape of X: (131,) and (33,)
Shape of Y: (131,) and (33,)


In [107]:
ngram_range = (1, 2)
min_df = 10
max_df = 1.
max_features = 300

tfidf = TfidfVectorizer(encoding='utf-8',
                         ngram_range = ngram_range,
                         min_df = min_df, 
                         max_df = max_df,
                         max_features = max_features,
                         norm = 'l2',
                         stop_words = None,
                         lowercase = False,
                         sublinear_tf = True)

feature_train = tfidf.fit_transform(X_train).toarray()
label_train = Y_train

feature_test = tfidf.fit_transform(X_test).toarray()
label_test = Y_test

In [108]:
print(f'Size of features object: {feature_train.shape} and {feature_test.shape}')

Size of features object: (131, 265) and (33, 11)


In [None]:
# Make random forest classifier
model = RandomForestClassifier()

# fit the model
model.fit(feature_train, label_train)

# get predition
model.prediction = model.predict(feature_test)

print()