In [None]:
import re
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# reading dataset
dataset = pd.read_csv('./dataset/filtered_data.csv', sep='\t')

In [3]:
# exploring dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197208 entries, 0 to 197207
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Title     197207 non-null  object
 1   URL       197208 non-null  object
 2   Category  197208 non-null  object
dtypes: object(3)
memory usage: 4.5+ MB


In [4]:
# getting the description of the dataset
dataset.describe()

Unnamed: 0,Title,URL,Category
count,197207,197208,197208
unique,191000,197125,3
top,Sunday Roundup,http://www.japantimes.co.jp/news/2014/04/18/wo...,b
freq,90,5,115967


In [5]:
# checking the shape of the dataset
dataset.shape

(197208, 3)

In [6]:
dataset.head()

Unnamed: 0,Title,URL,Category
0,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,b
1,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,b
2,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,b
3,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,b


In [7]:
# shuffle the dataset for each category to have atleast 100 rows
dataset = dataset.groupby('Category').apply(lambda x: x.sample(n=100, random_state=42)).sample(frac=1, random_state=42).reset_index(drop=True)

  dataset = dataset.groupby('Category').apply(lambda x: x.sample(n=100, random_state=42)).sample(frac=1, random_state=42).reset_index(drop=True)


In [8]:
# check for row count based on category
dataset.groupby('Category').nunique()

Unnamed: 0_level_0,Title,URL
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
b,100,100
m,100,100
p,100,100


In [9]:
# replacing the non-aplhabetical characters with whitespace
dataset['Title'] = dataset['Title'].str.replace('[^a-zA-Z]',' ')

In [10]:
# converting the word into smaller words
dataset['Title'] = [word.lower() for word in dataset['Title']]

In [11]:
# applying tokenization
dataset['Title'] = dataset['Title'].apply(nltk.tokenize.WhitespaceTokenizer().tokenize)

In [12]:
# removing stopwords
dataset['Title'] = dataset['Title'].apply(lambda words: [word for word in words if not word in stopwords.words('english')])

In [13]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    # lemmatize the wors from text
    return(lemmatizer.lemmatize(word) for word in text)

# applying lemmatization
dataset['Title'] = dataset['Title'].apply(lemmatize_text)

In [14]:
dataset['lematized_title'] = 0
for i in range(0, len(dataset)):
    dataset['lematized_title'][i] = ' '.join(dataset['Title'][i])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataset['lematized_title'][i] = ' '.join(dataset['Title'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d

In [15]:
train_data = dataset['lematized_title'].values
target = dataset['Category'].values

In [16]:
# converting data into numerical values
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)
Y_train = target

In [17]:
# training data into MultinomialNB
naive_clf = MultinomialNB()
naive_clf.fit(X_train, Y_train)

In [20]:
# prediction:
input_text = input('Enter text to classify: ')

# function to tokenize and lemmatize the user input
def text_tokenize_lemmatize(text):
    text = re.sub('[^a-zA-Z]', ' ', text)  # Replace non-alphabetic characters with spaces
    lower_text = text.lower()
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    tokenized = tokenizer.tokenize(lower_text)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in tokenized])
    return lemmatized_output

# process user imput value for tokenization
processed_text = text_tokenize_lemmatize(input_text)

# extracting the features from the lemmatized text
vectorized_text = vectorizer.transform([processed_text])

# predict the classification
predicted_classification = naive_clf.predict(vectorized_text)

# function to classify the text
def text_classification(predicted):
    if predicted_classification == 'b':
        print('The given text represents to category: Business.')
    elif predicted_classification == 'm':
        print('The given text represents to category: Health.')
    elif predicted_classification == 'p':
        print('The given text represents to category: Politics.')

# get the classification
classified_text = text_classification(predicted_classification)

Enter text to classify:  Drug-resistant malaria has spread to critical border regions of South-east Asia


The given text represents to category: Health.
