#Tag Classification Using Deep Learning Models

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re

In [2]:
data = pd.read_csv("MediumDataAfterEDA.csv")
df = data[["Paragraph","Tag"]].copy()

In [3]:
df = df.rename(columns={"Paragraph": "Blog"})

Since we have 155 Blogs only, I will devide the Blogs into a 80 words with 20 words overlapping in each sentence with their corresponding tags intact to increase the dataset

In [94]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
def create_overlapping_segments(text, chunk_size=80, overlap=20):
  """
    Function to create overlapping segments of text with specified chunk size and overlap.

    Parameters:
        text (str): The input text to be segmented.
        chunk_size (int): The desired chunk size in terms of number of words.
        overlap (int): The overlap between adjacent chunks in terms of number of words.

    Returns:
        List of overlapping segments.
  """
  tokens = word_tokenize(text)
  segments = []
  start = 0
  end = chunk_size
  while start < len(tokens):
    segment = tokens[start:end]
    segments.append(" ".join(segment))
    start += chunk_size - overlap
    end = start + chunk_size
  return segments


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [95]:
X = []
Y = []
for i in range(len(df["Blog"])):
  segment =  create_overlapping_segments(df["Blog"][i])
  for sentence in segment:
    X.append(sentence)
    Y.append(df["Tag"][i])

In [96]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Pre-processing steps :
1 ) First removing punctuation and html tags if any. note that the html tas may be present ast the data must be scraped from net.

2) Tokenize the reviews into tokens or words .

3) Next remove the stop words and shorter words as they cause noise.

4) Stem or lemmatize the words depending on what does better. Herer I have yse lemmatizer.

In [97]:
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
  sentence = text.lower()
  sentence = re.sub("[^a-z0-9]",' ',sentence)
  sentence = sentence.split()
  sentence = [lemmatizer.lemmatize(word) for word in sentence if word not in STOPWORDS]
  sentence = " ".join(sentence)
  return sentence

In [98]:
new_df = pd.DataFrame({"Blog": X, "Tag": Y})

In [99]:
new_df["Blog"] = new_df["Blog"].apply(clean_text)

In [100]:
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

Training Word2Vec Model on my Own Corpus

In [101]:
words = []
for doc in new_df["Blog"]:
  word = doc.split()
  words.append(word)

In [102]:
word2vec_model = Word2Vec(sentences=words, vector_size=300, window=5, min_count=1,epochs=15)

In [103]:
word2vec_model.corpus_count

7191

In [104]:
word2vec_model.wv.similar_by_word('sleep')

[('likelihood', 0.9140306115150452),
 ('observed', 0.8850325345993042),
 ('grows', 0.8681285977363586),
 ('span', 0.8631398677825928),
 ('asleep', 0.8579186201095581),
 ('estimated', 0.8543052077293396),
 ('measurement', 0.8522398471832275),
 ('bedroom', 0.8469898104667664),
 ('credible', 0.8426560759544373),
 ('variability', 0.8386530876159668)]

In [105]:
word2vec_model.wv['sleep']

array([-3.70772220e-02,  2.78305084e-01, -4.30310398e-01,  4.66126613e-02,
       -3.43960911e-01, -8.49517465e-01,  5.31706333e-01,  6.14339828e-01,
       -2.41224673e-02, -3.12504292e-01, -9.52668265e-02,  1.24070719e-01,
       -3.25106472e-01, -2.68386573e-01,  2.76851542e-02, -3.47611755e-02,
        2.17228811e-02,  2.23206028e-01,  4.60965097e-01, -1.12694707e-02,
        5.65774832e-03,  4.50185873e-02,  2.20357955e-01,  5.50502762e-02,
        3.62800658e-02, -2.00003847e-01,  1.39758781e-01, -1.33429185e-01,
       -2.28759214e-01, -3.70532632e-01, -6.75683783e-04, -1.25981599e-01,
        5.24161160e-02,  6.43315201e-04,  1.74346287e-03,  2.36083895e-01,
        1.48884296e-01, -2.81826258e-01,  5.13244048e-02, -4.98542078e-02,
        3.68328869e-01,  7.41518214e-02,  2.65826792e-01,  4.40521426e-02,
        3.34167659e-01,  2.54520535e-01,  5.68871647e-02,  2.31123775e-01,
        1.08772494e-01,  3.90704989e-01,  1.20209657e-01, -4.38411906e-02,
        5.52475899e-02,  

In [106]:
maxi=-1
for i,sen in enumerate(new_df['Blog']):
    tokens=sen.split()
    if(len(tokens)>maxi):
        maxi=len(tokens)

Now integer encode the words in the reviews using Keras tokenizer.

In [107]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

In [108]:
X_padded= pad_sequences(X_sequences, maxlen=maxi, padding='post')
X_padded.shape

(7191, 77)

Now we need to pass the w2v word embeddings to the embedding layer in Keras. For this lets create the embedding matrix and pass it as 'embedding_initializer' parameter to the layer.

In [109]:
vocab_size = len(tokenizer.word_index) + 1
embed_dim=300
vocab_size

13303

In [110]:
embedding_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word, i in tokenizer.word_index.items():
  if word in word2vec_model.wv:
    embedding_matrix[i] = word2vec_model.wv[word]

In [111]:
embedding_matrix[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [112]:
label_encoder = LabelEncoder()
encoded_tags = label_encoder.fit_transform(new_df['Tag'])
Y=to_categorical(encoded_tags)
Y[0]

array([1., 0., 0., 0., 0., 0.], dtype=float32)

In [113]:
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_padded, Y)

In [114]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)

In [115]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, LSTM, Dense, SpatialDropout1D
from keras.initializers import Constant
from keras.layers import Dropout

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=word2vec_model.vector_size,
                    weights=[embedding_matrix],
                    input_length=maxi,
                    trainable=False))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))

In [116]:
y_train.shape

(13624, 6)

In [117]:
X_train.shape

(13624, 77)

In [118]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 77, 300)           3990900   
                                                                 
 lstm_11 (LSTM)              (None, 64)                93440     
                                                                 
 dense_10 (Dense)            (None, 6)                 390       
                                                                 
Total params: 4084730 (15.58 MB)
Trainable params: 93830 (366.52 KB)
Non-trainable params: 3990900 (15.22 MB)
_________________________________________________________________


In [119]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.4907529652118683


In [120]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=word2vec_model.vector_size,
                    weights=[embedding_matrix],
                    input_length=maxi,
                    trainable=False))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(Y.shape[1], activation='softmax'))

In [121]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 77, 300)           3990900   
                                                                 
 conv1d_1 (Conv1D)           (None, 75, 64)            57664     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 37, 64)            0         
 g1D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 2368)              0         
                                                                 
 dense_11 (Dense)            (None, 64)                151616    
                                                                 
 dense_12 (Dense)            (None, 6)                 390       
                                                     

In [122]:
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.4550858736038208


In [123]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, GRU, Dense
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=word2vec_model.vector_size,
                    weights=[embedding_matrix],
                    input_length=maxi,
                    trainable=False))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(GRU(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)


Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 77, 300)           3990900   
                                                                 
 lstm_12 (LSTM)              (None, 77, 32)            42624     
                                                                 
 gru_1 (GRU)                 (None, 32)                6336      
                                                                 
 dense_13 (Dense)            (None, 6)                 198       
                                                                 
Total params: 4040058 (15.41 MB)
Trainable params: 49158 (192.02 KB)
Non-trainable params: 3990900 (15.22 MB)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.48084545135498047
