# Tag Classification using ML Algorithms

In [46]:
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score

In [47]:
data = pd.read_csv("MediumDataAfterEDA.csv")

In [48]:
# Keep only the text and label columns, exposing the text column as "Blog".
df = data.loc[:, ["Paragraph", "Tag"]].rename(columns={"Paragraph": "Blog"})
df.head(2)

Unnamed: 0,Blog,Tag
0,"In October 2020, I was interviewed by DrivenDa...",Data Science
1,Talking is a lot like writing in that it force...,Data Science


In [7]:
df['Tag'].unique()

array(['Data Science', 'Personal Development', 'Technology',
       'Machine Learning', 'Programming', 'Education'], dtype=object)

- Since the dataset contains only 155 blogs, we create more training samples by dividing each blog into chunks of 80 words, with 20 words of overlap between consecutive chunks.

In [49]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
def create_overlapping_segments(text, chunk_size=80, overlap=20):
  """
    Split text into token chunks that share tokens with their neighbours.

    The text is tokenized with word_tokenize and a window of chunk_size
    tokens is slid over it, advancing by chunk_size - overlap tokens each
    time. The tokens of every window are joined with single spaces. The
    last window may hold fewer than chunk_size tokens.

    Parameters:
        text (str): The input text to be segmented.
        chunk_size (int): The desired chunk size in terms of number of tokens.
        overlap (int): Number of tokens shared by adjacent chunks; must be
            smaller than chunk_size.

    Returns:
        List of overlapping segments (empty when the text yields no tokens).

    Raises:
        ValueError: If overlap >= chunk_size, because the window would never
            advance.
  """
  step = chunk_size - overlap
  if step <= 0:
    # A non-positive step would leave the window start in place forever.
    raise ValueError("overlap must be smaller than chunk_size")
  tokens = word_tokenize(text)
  return [
    " ".join(tokens[start:start + chunk_size])
    for start in range(0, len(tokens), step)
  ]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### *This is Done to increase the Dataset Size*

In [50]:
segment = create_overlapping_segments("Sleep is important to a number of brain functions, including how nerve cells (neurons) communicate with each other. In fact, your brain and body stay remarkably active while you sleep. Recent findings suggest that sleep plays a housekeeping role that removes toxins in your brain that build up while you are awake.")

In [51]:
segment[0]

'Sleep is important to a number of brain functions , including how nerve cells ( neurons ) communicate with each other . In fact , your brain and body stay remarkably active while you sleep . Recent findings suggest that sleep plays a housekeeping role that removes toxins in your brain that build up while you are awake .'

In [52]:
# Expand every blog into overlapping segments; each segment inherits the tag
# of the blog it was cut from, so X and Y stay aligned element by element.
X = []
Y = []
# Iterate over the column values directly: this is positional, so it does not
# rely on df carrying a 0..n-1 index the way df["Blog"][i] does.
for blog, tag in zip(df["Blog"], df["Tag"]):
  segments = create_overlapping_segments(blog)
  X.extend(segments)
  Y.extend([tag] * len(segments))

In [53]:
len(X)

7191

- Increased the size of the dataset

In [54]:
# Length, in characters, of the longest segment in X.
# NOTE(review): neither max_length nor padded_strings is referenced by any
# later cell shown in this notebook; the models are trained on new_df["Blog"],
# not on these padded strings. Looks like a candidate for removal — confirm.
max_length = max(len(s) for s in X)
# Right-pad every segment with spaces up to max_length characters.
padded_strings = [s.ljust(max_length) for s in X]

In [55]:
len(Y)

7191

In [56]:
X[0],Y[0]

('In October 2020 , I was interviewed by DrivenData , an organization that hosts data science competitions for good , one of which I placed second in while teaching myself data science . My interview appeared condensed and edited on DrivenData ’ s blog and here are my full , unedited answers . Even if you don ’ t make it through this article — there ’ s a reason my responses were condensed— I ’ d recommend checking out',
 'Data Science')

In [57]:
new_df = pd.DataFrame({"Blog": X, "Tag": Y})
new_df.head()

Unnamed: 0,Blog,Tag
0,"In October 2020 , I was interviewed by DrivenD...",Data Science
1,it through this article — there ’ s a reason m...,Data Science
2,to use in the best interests of our world . Wh...,Data Science
3,"models providing continuous , real-time operat...",Data Science
4,field where the best decision for the environm...,Data Science


In [58]:
new_df["Tag"].value_counts()

Data Science            2523
Machine Learning        2520
Personal Development     770
Education                718
Programming              382
Technology               278
Name: Tag, dtype: int64

#### ***Since the data is imbalanced, I will use SMOTE for oversampling***

In [59]:
new_df.isnull().sum()

Blog    0
Tag     0
dtype: int64

In [60]:
# Drop rows whose (Blog, Tag) pair repeats. Reassigning instead of using
# inplace=True keeps the cell re-runnable, and the index is renumbered so it
# has no gaps where duplicates were removed.
new_df = new_df.drop_duplicates().reset_index(drop=True)

In [61]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

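- Preprocessing text data
- The text is lower-cased, every non-alphanumeric character (including punctuation) is replaced with a space, the result is split into words, English stopwords are removed, and the remaining words are lemmatized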

In [62]:
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
  """Normalize one text segment before vectorization.

  Lower-cases the text, replaces every character outside a-z / 0-9 with a
  space, splits on whitespace, drops the words found in STOPWORDS and
  lemmatizes the rest. Returns the surviving words joined by single spaces.
  """
  alnum_only = re.sub("[^a-z0-9]", ' ', text.lower())
  kept_words = (
    lemmatizer.lemmatize(token)
    for token in alnum_only.split()
    if token not in STOPWORDS
  )
  return " ".join(kept_words)

In [63]:
new_df["Blog"] = new_df["Blog"].apply(clean_text)

In [64]:
from sklearn.model_selection import train_test_split
# Features are the cleaned segment texts, targets are their tags.
x = new_df.Blog
y = new_df.Tag
# Hold out 10% of the segments, stratified by tag, with a fixed seed.
# NOTE(review): the rows are overlapping segments cut from the same blogs by
# create_overlapping_segments, and this split is stratified by tag only, so
# segments of one blog can land in both train and test. The reported scores
# may therefore be optimistic; consider splitting by source blog — TODO confirm.
x_train,x_test, y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=42,stratify=y)

In [65]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((6459,), (718,), (6459,), (718,))

In [66]:
from category_encoders import CountEncoder
# NOTE(review): y_train_encoded / y_test_encoded assigned here are reassigned
# by the LabelEncoder cell that follows, and count_enc is not used again in
# the cells shown, so this cell looks like leftover code — confirm before
# removing.
count_enc = CountEncoder(normalize=True)
y_train_encoded = count_enc.fit_transform(y_train)
y_test_encoded = count_enc.transform(y_test)

In [67]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [68]:
y_train_encoded

array([5, 4, 3, ..., 0, 0, 3])

In [69]:
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as ImbPipeline ## Pipeline to train the Model
# Naive Bayes pipeline: token counts -> TF-IDF weights -> SMOTE oversampling
# -> MultinomialNB, fitted on the training segments and then used to predict
# the held-out segments.
mulnb = ImbPipeline(steps=[
    ('vect', CountVectorizer()),        # token counts per segment
    ('tfidf', TfidfTransformer()),      # re-weight the counts as TF-IDF
    ('smote', SMOTE(random_state=42)),  # oversample with a fixed seed
    ('clf', MultinomialNB()),
])
y_pred = mulnb.fit(x_train, y_train_encoded).predict(x_test)

In [70]:
# Pass (y_true, y_pred) in the documented order, consistent with the
# classification_report call in this notebook.
accuracy_score(y_test_encoded, y_pred)

0.8774373259052924

In [71]:
report = classification_report(y_test_encoded, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.81      0.85       251
           1       0.92      0.92      0.92        72
           2       0.91      0.92      0.92       252
           3       0.82      0.91      0.86        77
           4       0.84      0.84      0.84        38
           5       0.62      0.93      0.74        28

    accuracy                           0.88       718
   macro avg       0.84      0.89      0.86       718
weighted avg       0.88      0.88      0.88       718



In [72]:
# Gradient-boosted trees on the same features: token counts -> TF-IDF ->
# SMOTE oversampling -> XGBClassifier.
xgboost = ImbPipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('clf', XGBClassifier()),
])
y_pred = xgboost.fit(x_train, y_train_encoded).predict(x_test)

In [73]:
# Pass (y_true, y_pred) in the documented order, consistent with the
# classification_report call in this notebook.
accuracy_score(y_test_encoded, y_pred)

0.8147632311977716

In [74]:
report = classification_report(y_test_encoded, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.81      0.79       251
           1       0.83      0.82      0.83        72
           2       0.86      0.91      0.89       252
           3       0.73      0.75      0.74        77
           4       1.00      0.63      0.77        38
           5       0.73      0.39      0.51        28

    accuracy                           0.81       718
   macro avg       0.82      0.72      0.76       718
weighted avg       0.82      0.81      0.81       718



In [75]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the same features: token counts -> TF-IDF -> SMOTE
# oversampling -> RandomForestClassifier (seeded for repeatability).
random_forest = ImbPipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(random_state=42)),
])
y_pred = random_forest.fit(x_train, y_train_encoded).predict(x_test)

In [76]:
# Pass (y_true, y_pred) in the documented order, consistent with the
# classification_report call in this notebook.
accuracy_score(y_test_encoded, y_pred)

0.8119777158774373

In [77]:
report = classification_report(y_test_encoded, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       251
           1       0.87      0.74      0.80        72
           2       0.85      0.91      0.88       252
           3       0.78      0.73      0.75        77
           4       1.00      0.66      0.79        38
           5       0.78      0.25      0.38        28

    accuracy                           0.81       718
   macro avg       0.84      0.69      0.73       718
weighted avg       0.82      0.81      0.81       718



In [78]:
from sklearn.svm import SVC
# Support-vector classifier on the same features: token counts -> TF-IDF ->
# SMOTE oversampling -> SVC (seeded).
svm_pipeline = ImbPipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('clf', SVC(random_state=42)),
])
y_pred = svm_pipeline.fit(x_train, y_train_encoded).predict(x_test)

In [79]:
# Pass (y_true, y_pred) in the documented order, consistent with the
# classification_report call in this notebook.
accuracy_score(y_test_encoded, y_pred)

0.8635097493036211

In [80]:
report = classification_report(y_test_encoded, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.93      0.85       251
           1       0.91      0.85      0.88        72
           2       0.93      0.91      0.92       252
           3       0.91      0.81      0.86        77
           4       1.00      0.66      0.79        38
           5       0.90      0.32      0.47        28

    accuracy                           0.86       718
   macro avg       0.90      0.75      0.79       718
weighted avg       0.87      0.86      0.86       718



In [83]:
from sklearn.ensemble import VotingClassifier
# Majority-vote ensemble built from the SVM and XGBoost pipelines.
# NOTE(review): with voting='hard' and only two estimators, every
# disagreement between them is a tie. scikit-learn documents that ties are
# resolved in favour of the class that sorts first — confirm, and consider an
# odd number of voters or soft voting.
hybrid_classifier = VotingClassifier(
    estimators=[('svm', svm_pipeline), ('xgboost', xgboost)],
    voting='hard'
)
# Fit the ensemble on the training segments, then predict the held-out ones.
hybrid_classifier.fit(x_train, y_train_encoded)
y_pred = hybrid_classifier.predict(x_test)

In [84]:
# Pass (y_true, y_pred) in the documented order, consistent with the
# classification_report call in this notebook.
accuracy_score(y_test_encoded, y_pred)

0.8398328690807799

In [85]:
report = classification_report(y_test_encoded, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.95      0.82       251
           1       0.87      0.85      0.86        72
           2       0.94      0.88      0.91       252
           3       0.98      0.69      0.81        77
           4       1.00      0.58      0.73        38
           5       0.86      0.21      0.34        28

    accuracy                           0.84       718
   macro avg       0.90      0.69      0.75       718
weighted avg       0.86      0.84      0.83       718



- Multinomial Naive Bayes performed best among the models tried, with the highest test accuracy (≈0.88) and the highest macro-averaged F1 (0.86)
