##### Problem Statement

Create an automated system that accurately classifies news articles into relevant categories such as education, science, politics, etc., to aid in efficient content categorization and organization.

In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import re
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
!pip install lightgbm
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV



## Step 1: Read the data and clean it using regular expressions

In [3]:
# Load the news dataset from the Excel workbook and preview the first five rows.
df = pd.read_excel(io='news_category.xlsx')
df.head(5)

Unnamed: 0.1,Unnamed: 0,headline,short_description,category
0,0,These Lesbian Farmers Aren't Here To Take Over...,Rush Limbaugh seems to thinks queer farmers ar...,QUEER VOICES
1,1,Taking the 'I' Out of Volunteering,Organizations that help nonprofits understand ...,IMPACT
2,2,How Losing The Love Of My Life Made Me Distrus...,"""If your introduction to the world is full of ...",QUEER VOICES
3,3,Sam Brownback Declares War On Kansas: This Is ...,It’s not uncommon to see developments named af...,POLITICS
4,4,Brexit And The Long History Of English Propert...,"Thus in two centuries, the profile of the Brit...",WORLD NEWS


In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [5]:
df.isna().any()

Unnamed: 0           False
headline             False
short_description    False
category             False
dtype: bool

In [6]:
# Build one text field per article: the short description followed by the headline,
# separated by a single space.
description_str = df['short_description'].astype(str)
headline_str = df['headline'].astype(str)
df['Text'] = description_str.str.cat(headline_str, sep=' ')

In [7]:
# The two source columns are now merged into 'Text'; drop them by rebinding `df`
# rather than mutating the frame in place.
df = df.drop(columns=['short_description', 'headline'])

In [8]:
# Drop the leftover index column by name. A positional drop (df.columns[0]) would
# remove 'category' if this cell were executed a second time; errors='ignore'
# makes a re-run a harmless no-op.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
df.head()

Unnamed: 0,category,Text
0,QUEER VOICES,Rush Limbaugh seems to thinks queer farmers ar...
1,IMPACT,Organizations that help nonprofits understand ...
2,QUEER VOICES,"""If your introduction to the world is full of ..."
3,POLITICS,It’s not uncommon to see developments named af...
4,WORLD NEWS,"Thus in two centuries, the profile of the Brit..."


In [10]:
df["category"].value_counts()

POLITICS          29578
ENTERTAINMENT     11670
HEALTHY LIVING     5265
QUEER VOICES       4270
THE WORLDPOST      3664
PARENTS            3556
SPORTS             3507
BLACK VOICES       3452
BUSINESS           3394
COMEDY             3256
WOMEN              3102
MEDIA              2275
WEIRD NEWS         2209
IMPACT             2205
WORLD NEWS         2175
CRIME              2164
GREEN              2046
TASTE              1940
RELIGION           1857
TRAVEL             1678
STYLE              1567
ARTS & CULTURE     1339
WORLDPOST          1242
TECH               1231
FIFTY              1042
GOOD NEWS          1039
LATINO VOICES      1021
SCIENCE             978
COLLEGE             921
EDUCATION           892
ARTS                863
Name: category, dtype: int64

In [11]:
# Collapse closely related source categories into broader buckets.
# NOTE(review): 'THE WORLDPOST' and 'WORLDPOST' are not part of any group and stay
# as two separate labels — confirm whether they should be merged as well.
grouped_categories = {
    'EDUCATION': ['SCIENCE', 'ARTS & CULTURE', 'ARTS', 'COLLEGE'],
    'GLOBAL': ['MEDIA', 'CRIME', 'WEIRD NEWS', 'WORLD NEWS', 'GOOD NEWS'],
    'ENTERTAINMENT': ['SPORTS', 'COMEDY'],
    'MISCELLANEOUS': ['TASTE', 'PARENTS', 'FIFTY', 'STYLE', 'GREEN'],
    'SOCIAL JUSTICE': ['BLACK VOICES', 'QUEER VOICES', 'LATINO VOICES'],
}

# Invert to an {original label -> bucket} lookup and apply it in a single pass.
category_lookup = {
    original: bucket
    for bucket, originals in grouped_categories.items()
    for original in originals
}
df['category'] = df['category'].replace(category_lookup)



In [12]:
df["category"].value_counts()

POLITICS          29578
ENTERTAINMENT     18433
MISCELLANEOUS     10151
GLOBAL             9862
SOCIAL JUSTICE     8743
HEALTHY LIVING     5265
EDUCATION          4993
THE WORLDPOST      3664
BUSINESS           3394
WOMEN              3102
IMPACT             2205
RELIGION           1857
TRAVEL             1678
WORLDPOST          1242
TECH               1231
Name: category, dtype: int64

In [13]:
# Turn the string category labels into integer class ids.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df['category'])

# Keep the readable label in 'category' and store the numeric id in 'Category'.
df['Category'] = encoded_labels

# Preview the frame with the new encoded column.
df.head()

Unnamed: 0,category,Text,Category
0,SOCIAL JUSTICE,Rush Limbaugh seems to thinks queer farmers ar...,9
1,IMPACT,Organizations that help nonprofits understand ...,5
2,SOCIAL JUSTICE,"""If your introduction to the world is full of ...",9
3,POLITICS,It’s not uncommon to see developments named af...,7
4,GLOBAL,"Thus in two centuries, the profile of the Brit...",3


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105398 entries, 0 to 124988
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   category  105398 non-null  object
 1   Text      105398 non-null  object
 2   Category  105398 non-null  int32 
dtypes: int32(1), object(2)
memory usage: 2.8+ MB


In [15]:
df

Unnamed: 0,category,Text,Category
0,SOCIAL JUSTICE,Rush Limbaugh seems to thinks queer farmers ar...,9
1,IMPACT,Organizations that help nonprofits understand ...,5
2,SOCIAL JUSTICE,"""If your introduction to the world is full of ...",9
3,POLITICS,It’s not uncommon to see developments named af...,7
4,GLOBAL,"Thus in two centuries, the profile of the Brit...",3
...,...,...,...
124984,THE WORLDPOST,The fallout from Turkey's attempted coup conti...,11
124985,ENTERTAINMENT,Officials recommend he be deposed in the Unite...,2
124986,POLITICS,"It would ""diminish the guest experience of our...",7
124987,POLITICS,"""I have avoided doing that. I am trying to run...",7


In [16]:
# Lower-case the combined text into a new 'text' column using the vectorized
# string accessor ('Text' was assembled from str-cast columns).
df['text'] = df['Text'].astype(str).str.lower()
df.head()

Unnamed: 0,category,Text,Category,text
0,SOCIAL JUSTICE,Rush Limbaugh seems to thinks queer farmers ar...,9,rush limbaugh seems to thinks queer farmers ar...
1,IMPACT,Organizations that help nonprofits understand ...,5,organizations that help nonprofits understand ...
2,SOCIAL JUSTICE,"""If your introduction to the world is full of ...",9,"""if your introduction to the world is full of ..."
3,POLITICS,It’s not uncommon to see developments named af...,7,it’s not uncommon to see developments named af...
4,GLOBAL,"Thus in two centuries, the profile of the Brit...",3,"thus in two centuries, the profile of the brit..."


In [17]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Tokenize ``text`` with NLTK and remove English stop words.

    Each token is compared against ``stop_words`` case-insensitively. The
    surviving tokens (punctuation tokens included) are returned joined by
    single spaces.
    """
    kept_tokens = (
        tok for tok in word_tokenize(text) if tok.lower() not in stop_words
    )
    return ' '.join(kept_tokens)

# Apply the cleaner to every lower-cased document.
df['clean_text'] = df['text'].map(clean_text)

In [18]:
df.head()

Unnamed: 0,category,Text,Category,text,clean_text
0,SOCIAL JUSTICE,Rush Limbaugh seems to thinks queer farmers ar...,9,rush limbaugh seems to thinks queer farmers ar...,rush limbaugh seems thinks queer farmers threa...
1,IMPACT,Organizations that help nonprofits understand ...,5,organizations that help nonprofits understand ...,organizations help nonprofits understand use e...
2,SOCIAL JUSTICE,"""If your introduction to the world is full of ...",9,"""if your introduction to the world is full of ...",`` introduction world full men ’ want becomes ...
3,POLITICS,It’s not uncommon to see developments named af...,7,it’s not uncommon to see developments named af...,’ uncommon see developments named displace som...
4,GLOBAL,"Thus in two centuries, the profile of the Brit...",3,"thus in two centuries, the profile of the brit...","thus two centuries , profile british france co..."


In [19]:
df['clean_text']

0         rush limbaugh seems thinks queer farmers threa...
1         organizations help nonprofits understand use e...
2         `` introduction world full men ’ want becomes ...
3         ’ uncommon see developments named displace som...
4         thus two centuries , profile british france co...
                                ...                        
124984    fallout turkey 's attempted coup continues cau...
124985    officials recommend deposed united states . br...
124986    would `` diminish guest experience brand , '' ...
124987    `` avoided . trying run issue-oriented campaig...
124988    colin kaepernick 's former team , san francisc...
Name: clean_text, Length: 105398, dtype: object

In [20]:
# Features are the cleaned text; the target is the integer-encoded category.
X = df['clean_text']
y = df['Category']

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((84318,), (21080,), (84318,), (21080,))

## Step 2. Apply Count Vectorizer and Build Model

#### Using MultinomialNB with Count Vectorizer

In [22]:
# Bag-of-words features: learn the vocabulary on the training texts only,
# then encode the test texts with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

In [23]:
X_train_vec

<84318x56438 sparse matrix of type '<class 'numpy.int64'>'
	with 1379694 stored elements in Compressed Sparse Row format>

In [24]:
X_test_vec

<21080x56438 sparse matrix of type '<class 'numpy.int64'>'
	with 338126 stored elements in Compressed Sparse Row format>

In [25]:
# Fit a multinomial Naive Bayes classifier (smoothing alpha=0.1) on the count features.
model = MultinomialNB(alpha=0.1)
model.fit(X=X_train_vec, y=y_train)

MultinomialNB(alpha=0.1)

In [26]:
# Score the Naive Bayes model on the held-out test set.
y_pred = model.predict(X_test_vec)
accuracy_mNB_CV = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy_mNB_CV)

# Record for the results table built in the following cell.
accuracy = dict(
    Model="MultinomialNB with Count Vectorizer",
    Accuracy=accuracy_mNB_CV,
)

Accuracy: 0.6300284629981024


In [27]:
# Start the results table with the first model's record.
# DataFrame.append was removed in pandas 2.0, so the frame is built directly
# from the record instead of appending to an empty frame.
result = pd.DataFrame([accuracy], columns=['Model', 'Accuracy'])
result

Unnamed: 0,Model,Accuracy
0,MultinomialNB with Count Vectorizer,0.630028


#### Using LGBMClassifier with hyperparameter tuning on Count Vectorizer features

In [28]:
# Cast the sparse count matrix to float32 before handing it to LightGBM.
# The class labels are deliberately left as integers: casting y_train to float
# is unnecessary and would change the label dtype for every model trained later.
X_train_vec = X_train_vec.astype(np.float32)

# Base LightGBM classifier; the seed keeps the search repeatable.
lgb_classifier = lgb.LGBMClassifier(random_state=42)

# Hyperparameter grid: 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 300, 500],
}

# Exhaustive search with 5-fold cross-validation scored by accuracy
# (both spelled out explicitly so the setup is visible to the reader).
grid_search = GridSearchCV(
    estimator=lgb_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

# Perform the grid search (27 candidates x 5 folds — this is the slow cell).
grid_search.fit(X_train_vec, y_train)

# Best hyperparameters and the estimator refit on the full training set.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23337
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7675
[LightGBM] [Info] Start training from score -3.430746
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744279
[LightGBM] [Info] Start training from score -2.368835
[LightGBM] [Info] Start training from score -3.011180
[LightGBM] [Info] Start training from score -3.865023
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269431
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494769
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23331
[LightGBM] [Info] Number of data points in the train set: 67455, number of used features: 7646
[LightGBM] [Info] Start training from score -3.431219
[LightGBM] [Info] Start training from score -3.049561
[LightGBM] [Info] Start training from score -1.744209
[LightGBM] [Info] Start training from score -2.368691
[LightGBM] [Info] Start training from score -3.010894
[LightGBM] [Info] Start training from score -3.865746
[LightGBM] [Info] Start training from score -2.329099
[LightGBM] [Info] Start training from score -1.269393
[LightGBM] [Info] Start training from score -4.050193
[LightGBM] [Info] Start training from score -2.494964
[LightGBM] [Info] Start training from score -4.444655
[LightGBM] [Info] Start training from score -3.364734
[LightGBM] [Info] Start training from score -4.133574
[LightGBM] [Info] Start training from score -3.536987
[LightGBM] [Info] Start training from score -4.427132
Yo

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23232
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7627
[LightGBM] [Info] Start training from score -3.430746
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744279
[LightGBM] [Info] Start training from score -2.368835
[LightGBM] [Info] Start training from score -3.010879
[LightGBM] [Info] Start training from score -3.865023
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269431
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494769
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23326
[LightGBM] [Info] Number of data points in the train set: 67455, number of used features: 7659
[LightGBM] [Info] Start training from score -3.431219
[LightGBM] [Info] Start training from score -3.049561
[LightGBM] [Info] Start training from score -1.744209
[LightGBM] [Info] Start training from score -2.368691
[LightGBM] [Info] Start training from score -3.011195
[LightGBM] [Info] Start training from score -3.865038
[LightGBM] [Info] Start training from score -2.329251
[LightGBM] [Info] Start training from score -1.269446
[LightGBM] [Info] Start training from score -4.049342
[LightGBM] [Info] Start training from score -2.494784
[LightGBM] [Info] Start training from score -4.444655
[LightGBM] [Info] Start training from score -3.365163
[LightGBM] [Info] Start training from score -4.134500
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23310
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7654
[LightGBM] [Info] Start training from score -3.431204
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744194
[LightGBM] [Info] Start training from score -2.368676
[LightGBM] [Info] Start training from score -3.010879
[LightGBM] [Info] Start training from score -3.865731
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269378
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494949
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23337
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7675
[LightGBM] [Info] Start training from score -3.430746
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744279
[LightGBM] [Info] Start training from score -2.368835
[LightGBM] [Info] Start training from score -3.011180
[LightGBM] [Info] Start training from score -3.865023
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269431
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494769
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3.536972
[LightGBM] [Info] Start training from score -4.425877
Yo

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23331
[LightGBM] [Info] Number of data points in the train set: 67455, number of used features: 7646
[LightGBM] [Info] Start training from score -3.431219
[LightGBM] [Info] Start training from score -3.049561
[LightGBM] [Info] Start training from score -1.744209
[LightGBM] [Info] Start training from score -2.368691
[LightGBM] [Info] Start training from score -3.010894
[LightGBM] [Info] Start training from score -3.865746
[LightGBM] [Info] Start training from score -2.329099
[LightGBM] [Info] Start training from score -1.269393
[LightGBM] [Info] Start training from score -4.050193
[LightGBM] [Info] Start training from score -2.494964
[LightGBM] [Info] Start training from score -4.444655
[LightGBM] [Info] Start training from score -3.364734
[LightGBM] [Info] Start training from score -4.133574
[LightGBM] [Info] Start training from score -3.536987
[LightGBM] [Info] Start training from score -4.427132
Yo

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23232
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7627
[LightGBM] [Info] Start training from score -3.430746
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744279
[LightGBM] [Info] Start training from score -2.368835
[LightGBM] [Info] Start training from score -3.010879
[LightGBM] [Info] Start training from score -3.865023
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269431
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494769
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23326
[LightGBM] [Info] Number of data points in the train set: 67455, number of used features: 7659
[LightGBM] [Info] Start training from score -3.431219
[LightGBM] [Info] Start training from score -3.049561
[LightGBM] [Info] Start training from score -1.744209
[LightGBM] [Info] Start training from score -2.368691
[LightGBM] [Info] Start training from score -3.011195
[LightGBM] [Info] Start training from score -3.865038
[LightGBM] [Info] Start training from score -2.329251
[LightGBM] [Info] Start training from score -1.269446
[LightGBM] [Info] Start training from score -4.049342
[LightGBM] [Info] Start training from score -2.494784
[LightGBM] [Info] Start training from score -4.444655
[LightGBM] [Info] Start training from score -3.365163
[LightGBM] [Info] Start training from score -4.134500
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23310
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7654
[LightGBM] [Info] Start training from score -3.431204
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744194
[LightGBM] [Info] Start training from score -2.368676
[LightGBM] [Info] Start training from score -3.010879
[LightGBM] [Info] Start training from score -3.865731
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269378
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494949
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23337
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7675
[LightGBM] [Info] Start training from score -3.430746
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744279
[LightGBM] [Info] Start training from score -2.368835
[LightGBM] [Info] Start training from score -3.011180
[LightGBM] [Info] Start training from score -3.865023
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269431
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494769
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23331
[LightGBM] [Info] Number of data points in the train set: 67455, number of used features: 7646
[LightGBM] [Info] Start training from score -3.431219
[LightGBM] [Info] Start training from score -3.049561
[LightGBM] [Info] Start training from score -1.744209
[LightGBM] [Info] Start training from score -2.368691
[LightGBM] [Info] Start training from score -3.010894
[LightGBM] [Info] Start training from score -3.865746
[LightGBM] [Info] Start training from score -2.329099
[LightGBM] [Info] Start training from score -1.269393
[LightGBM] [Info] Start training from score -4.050193
[LightGBM] [Info] Start training from score -2.494964
[LightGBM] [Info] Start training from score -4.444655
[LightGBM] [Info] Start training from score -3.364734
[LightGBM] [Info] Start training from score -4.133574
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23232
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7627
[LightGBM] [Info] Start training from score -3.430746
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744279
[LightGBM] [Info] Start training from score -2.368835
[LightGBM] [Info] Start training from score -3.010879
[LightGBM] [Info] Start training from score -3.865023
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269431
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494769
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23326
[LightGBM] [Info] Number of data points in the train set: 67455, number of used features: 7659
[LightGBM] [Info] Start training from score -3.431219
[LightGBM] [Info] Start training from score -3.049561
[LightGBM] [Info] Start training from score -1.744209
[LightGBM] [Info] Start training from score -2.368691
[LightGBM] [Info] Start training from score -3.011195
[LightGBM] [Info] Start training from score -3.865038
[LightGBM] [Info] Start training from score -2.329251
[LightGBM] [Info] Start training from score -1.269446
[LightGBM] [Info] Start training from score -4.049342
[LightGBM] [Info] Start training from score -2.494784
[LightGBM] [Info] Start training from score -4.444655
[LightGBM] [Info] Start training from score -3.365163
[LightGBM] [Info] Start training from score -4.134500
[LightGBM] [Info] Start training from score -3.536477
[LightGBM] [Info] Start training from score -4.425892
Yo

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23310
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7654
[LightGBM] [Info] Start training from score -3.431204
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744194
[LightGBM] [Info] Start training from score -2.368676
[LightGBM] [Info] Start training from score -3.010879
[LightGBM] [Info] Start training from score -3.865731
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269378
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494949
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23337
[LightGBM] [Info] Number of data points in the train set: 67454, number of used features: 7675
[LightGBM] [Info] Start training from score -3.430746
[LightGBM] [Info] Start training from score -3.049546
[LightGBM] [Info] Start training from score -1.744279
[LightGBM] [Info] Start training from score -2.368835
[LightGBM] [Info] Start training from score -3.011180
[LightGBM] [Info] Start training from score -3.865023
[LightGBM] [Info] Start training from score -2.329084
[LightGBM] [Info] Start training from score -1.269431
[LightGBM] [Info] Start training from score -4.050178
[LightGBM] [Info] Start training from score -2.494769
[LightGBM] [Info] Start training from score -4.444640
[LightGBM] [Info] Start training from score -3.365149
[LightGBM] [Info] Start training from score -4.133559
[LightGBM] [Info] Start training from score -3

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23331
[LightGBM] [Info] Number of data points in the train set: 67455, number of used features: 7646
[LightGBM] [Info] Start training from score -3.431219
[LightGBM] [Info] Start training from score -3.049561
[LightGBM] [Info] Start training from score -1.744209
[LightGBM] [Info] Start training from score -2.368691
[LightGBM] [Info] Start training from score -3.010894
[LightGBM] [Info] Start training from score -3.865746
[LightGBM] [Info] Start training from score -2.329099
[LightGBM] [Info] Start training from score -1.269393
[LightGBM] [Info] Start training from score -4.050193
[LightGBM] [Info] Start training from score -2.494964
[LightGBM] [Info] Start training from score -4.444655
[LightGBM] [Info] Start training from score -3.364734
[LightGBM] [Info] Start training from score -4.133574
[LightGBM] [Info] Start training from score -3

In [29]:
best_params

{'learning_rate': 0.1, 'n_estimators': 500, 'num_leaves': 20}

In [30]:
# Ensure the training feature matrix is stored as float32, then show its summary.
X_train_vec = X_train_vec.astype(dtype=np.float32)
X_train_vec

<84318x56438 sparse matrix of type '<class 'numpy.float32'>'
	with 1379694 stored elements in Compressed Sparse Row format>

In [31]:
# Build the final LightGBM classifier from the hyperparameters selected by the
# grid search instead of hard-coding them, so this cell stays in sync whenever
# the search is re-run with a different grid or data.
lgb_classifier = lgb.LGBMClassifier(**best_params, random_state=42)
lgb_classifier.fit(X_train_vec, y_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26964
[LightGBM] [Info] Number of data points in the train set: 84318, number of used features: 8816
[LightGBM] [Info] Start training from score -3.431027
[LightGBM] [Info] Start training from score -3.049552
[LightGBM] [Info] Start training from score -1.744234
[LightGBM] [Info] Start training from score -2.368746
[LightGBM] [Info] Start training from score -3.011005
[LightGBM] [Info] Start training from score -3.865312
[LightGBM] [Info] Start training from score -2.329121
[LightGBM] [Info] Start training from score -1.269416
[LightGBM] [Info] Start training from score -4.050013
[LightGBM] [Info] Start training from score -2.494847
[LightGBM] [Info] Start training from score -4.444646
[LightGBM] [Info] Start training from score -3.365069
[LightGBM] [Info] Start training from score -4.133750
[LightGBM] [Info] Start training from score -3

LGBMClassifier(n_estimators=500, num_leaves=20, random_state=42)

In [32]:
# Match the training dtype: cast the test features to float32 as well.
X_test_vec = X_test_vec.astype(dtype=np.float32)

# Class predictions of the tuned LightGBM model on the held-out set.
y_pred = lgb_classifier.predict(X_test_vec)

In [33]:
# Test-set accuracy of the tuned LightGBM classifier.
accuracy_LGBM_CV = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy_LGBM_CV)

# Record for the results table.
accuracy = dict(
    Model="Using LGBMClassifier with hyperparameter tunning by count vectorizer",
    Accuracy=accuracy_LGBM_CV,
)
print(accuracy)

Accuracy: 0.6416034155597723
{'Model': 'Using LGBMClassifier with hyperparameter tunning by count vectorizer', 'Accuracy': 0.6416034155597723}


In [34]:
# Add the LightGBM record as a new row. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
result = pd.concat([result, pd.DataFrame([accuracy])], ignore_index=True)
result

Unnamed: 0,Model,Accuracy
0,MultinomialNB with Count Vectorizer,0.630028
1,Using LGBMClassifier with hyperparameter tunni...,0.641603


## Step 3: Apply Tf-Idf transformation and build a model

#### Using MultinomialNB with Tf-Idf transformation

In [35]:
# TF-IDF features: fit on the training texts only, then encode the test texts.
# NOTE(review): this rebinds `vectorizer`, which previously held the CountVectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [36]:
# Baseline multinomial Naive Bayes (default smoothing) on the TF-IDF features.
# NOTE(review): `model_tfidf` is re-created by the grid-search cell that follows,
# so this fit only serves as a quick check.
model_tfidf = MultinomialNB()
model_tfidf.fit(X=X_train_tfidf, y=y_train)

MultinomialNB()

In [39]:
# Candidate smoothing strengths for MultinomialNB.
param_grid = {'alpha': [0.1, 1.0, 2.0]}

# Fresh Naive Bayes estimator to be tuned.
model_tfidf = MultinomialNB()

# 5-fold cross-validated search over alpha on the TF-IDF features.
grid_search_tfidf = GridSearchCV(
    estimator=model_tfidf,
    param_grid=param_grid,
    cv=5,
)
grid_search_tfidf.fit(X_train_tfidf, y_train)

# Keep the estimator with the best cross-validated score.
best_model_tfidf = grid_search_tfidf.best_estimator_

In [40]:
best_model_tfidf

MultinomialNB(alpha=0.1)

In [41]:
# Score the tuned TF-IDF Naive Bayes model on the held-out test set.
y_pred_tfidf = best_model_tfidf.predict(X_test_tfidf)
confusion_mat_tfidf = confusion_matrix(y_test, y_pred_tfidf)
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)
print("Accuracy:", accuracy_tfidf)

Accuracy: 0.6053605313092979


#### Using GradientBoostingClassifier with hyperparameter tuning on Count Vectorizer features

In [42]:
# Gradient-boosting baseline trained on the count-vectorizer features.
# NOTE(review): this rebinds `model`, which previously held the MultinomialNB classifier.
model = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.1,
)
model.fit(X_train_vec, y_train)

GradientBoostingClassifier(n_estimators=300)

In [43]:
# Test-set accuracy of the gradient-boosting baseline.
y_pred = model.predict(X_test_vec)
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
ac

0.5958728652751423

In [44]:
# Candidate hyperparameter values for the GradientBoostingClassifier randomized search below.
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting stages (trees)
    'max_depth': [5, 10, 20],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
}

In [None]:
# Base gradient-boosting classifier to tune (seeded for reproducibility).
GB = GradientBoostingClassifier(random_state=42)

# Randomized search: sample 10 combinations from param_grid. random_state fixes
# which combinations are drawn so a re-run evaluates the same candidates.
random_search = RandomizedSearchCV(
    estimator=GB,
    param_distributions=param_grid,
    n_iter=10,
    random_state=42,
)
random_search.fit(X_train_vec, y_train)

# Get the best parameters found by randomized search
best_params_random = random_search.best_params_
print("Best Parameters (Randomized Search):", best_params_random)

# Retrain with the best parameters on the full training set and score on the
# test set. (The original fitted on the undefined name `X_train_ec`, which
# raises NameError; the training matrix is `X_train_vec`.)
best_gb = GradientBoostingClassifier(random_state=42, **best_params_random)
best_gb.fit(X_train_vec, y_train)
accuracy = best_gb.score(X_test_vec, y_test)
print("Accuracy:", accuracy)

#### Compare model performance

In [None]:
# Print every model's test accuracy from its own dedicated variable, so the
# summary covers all models trained above and each number is labelled with the
# model that produced it. `accuracy` holds the tuned gradient-boosting score
# set by the randomized-search cell.
print("\nModel Performance Comparison:")
print("MultinomialNB + Count Vectorizer Accuracy:", accuracy_mNB_CV)
print("LGBMClassifier (tuned) + Count Vectorizer Accuracy:", accuracy_LGBM_CV)
print("MultinomialNB (tuned) + Tf-Idf Accuracy:", accuracy_tfidf)
print("GradientBoostingClassifier (tuned) + Count Vectorizer Accuracy:", accuracy)

## Conclusion: the best accuracy (0.641) is obtained with LGBMClassifier on Count Vectorizer features