# Importing the Libraries

In [1]:
import numpy as np 
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from keras.callbacks import EarlyStopping

In [2]:
tf.__version__

'2.7.0'

# Loading the Data

In [3]:
dataset = pd.read_csv('Context.csv')

In [4]:
dataset.head()

Unnamed: 0,Text,Context
0,The eternal mystique of Goldman Sachs,Politics
1,Either you don't care enough to actually tell ...,Love
2,I am such an IDIOT.,Heavy Emotion
3,While lifting weights on Friday and doing bent...,Health
4,Something's watching me,Animals


In [5]:
dataset.shape

(31386, 2)

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31386 entries, 0 to 31385
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     31386 non-null  object
 1   Context  31386 non-null  object
dtypes: object(2)
memory usage: 490.5+ KB


In [7]:
dataset.Context.value_counts()

Heavy Emotion    3674
Religion         3466
Love             3229
Self             3105
Compliment       3061
Animals          2622
Health           2595
Education        2534
Joke             2476
Science          2428
Politics         2196
Name: Context, dtype: int64

# Cleaning the Texts

In [8]:
special_character_remover = re.compile('[/(){}\[\]\|@,;]')
extra_symbol_remover = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

In [9]:
def clean_text(text):
    text = text.lower()
    text = special_character_remover.sub(' ', text)
    text = extra_symbol_remover.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
    
dataset['Text'] = dataset['Text'].apply(clean_text)

In [10]:
print(dataset['Text'].apply(lambda x: len(x.split(' '))).sum())

405759


# Finding Maximum Sequence Length

In [11]:
g=[]

In [12]:
for i in dataset['Text']:
    g.append(i)

In [13]:
len(g)

31386

In [14]:
maxl = max([len(s) for s in g])
print ('Maximum sequence length in the list of sentences:', maxl)

Maximum sequence length in the list of sentences: 4624


In [15]:
MAX_NB_WORDS = 50000

MAX_SEQUENCE_LENGTH = 500

EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(dataset['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 39335 unique tokens.


# Padding

In [16]:
X = tokenizer.texts_to_sequences(dataset['Text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (31386, 500)


In [17]:
Y = pd.get_dummies(dataset['Context']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (31386, 11)


# Train and Test Split up

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(28247, 500) (28247, 11)
(3139, 500) (3139, 11)


# Building the ANN

Initializing the ANN

In [19]:
ann = tf.keras.models.Sequential()

In [20]:
ann.add(tf.keras.layers.Dense(units=66, activation ='relu'))

In [21]:
ann.add(tf.keras.layers.Dense(units=66, activation ='relu'))

In [22]:
ann.add(tf.keras.layers.Dense(units=11, activation ='softmax'))

# Training the ANN

In [23]:
ann.compile(optimizer= 'adam',loss='categorical_crossentropy', metrics = ['accuracy'])
ann.fit(X_train,Y_train, batch_size=256, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1e860cf5760>

# Model Summary

In [24]:
ann.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 66)                33066     
                                                                 
 dense_1 (Dense)             (None, 66)                4422      
                                                                 
 dense_2 (Dense)             (None, 11)                737       
                                                                 
Total params: 38,225
Trainable params: 38,225
Non-trainable params: 0
_________________________________________________________________


# Final Inference

Since the accuracy is very low compared to other models, the ANN model needs to be tuned in right way to get better results

# Relevant Hyperparameters to tune:

a. Number of nodes and hidden layers

b. Number of units in a Dense layer

c. Weight Initialization

d. Activation Functions

e. Learning Rate

f. Number of Epochs and Batch Size

