In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# **Load first dataset- Twitter**

In [39]:
# Load the dataset
data = pd.read_csv("Twitter_Data.csv")

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23941 entries, 0 to 23940
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   clean_text  23940 non-null  object 
 1   category    23940 non-null  float64
dtypes: float64(1), object(1)
memory usage: 374.2+ KB


In [41]:
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [42]:
# Remove empty rows
data.dropna(subset=['clean_text'], inplace=True)

# **Preprocessing steps**

In [43]:
# Download NLTK resources
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
# use NLTK tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [45]:
#function to preprocess the text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and punctuation
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    # Tokenize the text
    words = text.split()
    # Remove stopwords and apply stemming
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join words back into a single string
    return ' '.join(words)

In [46]:
# Apply the preprocessing function to the text column
data['clean_text'] = data['clean_text'].apply(preprocess_text)

In [47]:
# Split the data into training and testing sets
X = data['clean_text']
y = data['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# TF-IDF Vectorization - fit only on the training data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()  # Convert to array for TensorFlow
X_test_tfidf = vectorizer.transform(X_test).toarray()

# **Recheck for null values in dataset**

In [49]:
# Check for null values in the dataset
null_values = data.isnull().sum()

# Print the number of null values in each column
print("Null values in each column:\n", null_values)

# Alternatively, you can check if there are any null values in the dataset
if null_values.any():
    print("The dataset contains null values.")
else:
    print("The dataset does not contain any null values.")

Null values in each column:
 clean_text    0
category      1
dtype: int64
The dataset contains null values.


In [50]:
# Remove rows where 'category' is null
data.dropna(subset=['category'], inplace=True)

# Verify that there are no more null values
print("Null values after removal:\n", data.isnull().sum())

Null values after removal:
 clean_text    0
category      0
dtype: int64


# **Model training**

In [53]:
# Check for NaNs in the labels
print(y_train.isnull().sum())
print(y_test.isnull().sum())

# Remove NaNs or fill them with a valid category if necessary
y_train = y_train.fillna(0).astype(int)  # Replace NaNs with 0 or any valid category
y_test = y_test.fillna(0).astype(int)

1
0


In [54]:
# Convert labels to one-hot encoded format
y_train_categorical = to_categorical(y_train + 1)  # Shift labels to start from 0 (0, 1, 2)
y_test_categorical = to_categorical(y_test + 1)

In [55]:
# Define the Neural Network model
model = Sequential()
model.add(Dense(128, input_dim=X_train_tfidf.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))  # 3 classes for multi-class classification

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [57]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [69]:
# Train the model
model.fit(X_train_tfidf, y_train_categorical, epochs=15, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.9914 - loss: 0.0259 - val_accuracy: 0.7700 - val_loss: 1.4485
Epoch 2/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9893 - loss: 0.0300 - val_accuracy: 0.7703 - val_loss: 1.3905
Epoch 3/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.9922 - loss: 0.0225 - val_accuracy: 0.7698 - val_loss: 1.5251
Epoch 4/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.9930 - loss: 0.0201 - val_accuracy: 0.7721 - val_loss: 1.5041
Epoch 5/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.9930 - loss: 0.0242 - val_accuracy: 0.7724 - val_loss: 1.4244
Epoch 6/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.9926 - loss: 0.0236 - val_accuracy: 0.7732 - val_loss: 1.4983
Epoch 7/15
[1m479/479

<keras.src.callbacks.history.History at 0x7d9b5dee1e40>

In [70]:

y_pred = model.predict(X_test_tfidf)
y_pred_classes = np.argmax(y_pred, axis=1) - 1  # Convert back to original labels (-1, 0, 1)


[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [82]:
loss, accuracy = model.evaluate(X_test_tfidf, y_test_categorical)
recall = recall_score(y_test, y_pred_classes, average='weighted')
precision = precision_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')


print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7680 - loss: 1.8915
Accuracy: 77.51%
Recall: 77.51%
Precision: 77.50%
F1 Score: 77.28%


# **Reddit dataset Results**

In [83]:
# Load the dataset
data2 = pd.read_csv("Reddit_Data.csv")

In [84]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [85]:
data2.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [87]:
# Remove empty rows
data2.dropna(subset=['clean_comment'], inplace=True)

In [88]:
# use NLTK tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [89]:
#function to preprocess the text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and punctuation
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    # Tokenize the text
    words = text.split()
    # Remove stopwords and apply stemming
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join words back into a single string
    return ' '.join(words)

In [91]:
# Apply the preprocessing function to the text column
data2['clean_comment'] = data2['clean_comment'].apply(preprocess_text)

In [92]:
# Split the data into training and testing sets
X = data2['clean_comment']
y = data2['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [93]:
# TF-IDF Vectorization - fit only on the training data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()  # Convert to array for TensorFlow
X_test_tfidf = vectorizer.transform(X_test).toarray()

In [94]:
# Check for null values in the dataset
null_values = data2.isnull().sum()

# Print the number of null values in each column
print("Null values in each column:\n", null_values)

# Alternatively, you can check if there are any null values in the dataset
if null_values.any():
    print("The dataset contains null values.")
else:
    print("The dataset does not contain any null values.")

Null values in each column:
 clean_comment    0
category         0
dtype: int64
The dataset does not contain any null values.


In [98]:
# Convert labels to one-hot encoded format
y_train_categorical = to_categorical(y_train + 1)  # Shift labels to start from 0 (0, 1, 2)
y_test_categorical = to_categorical(y_test + 1)

In [99]:
# Define the Neural Network model
model = Sequential()
model.add(Dense(128, input_dim=X_train_tfidf.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))  # 3 classes for multi-class classification

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [100]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [101]:
# Train the model
model.fit(X_train_tfidf, y_train_categorical, epochs=20, batch_size=32, validation_split=0.3, verbose=1)

Epoch 1/20
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.5259 - loss: 0.9406 - val_accuracy: 0.7892 - val_loss: 0.5431
Epoch 2/20
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.8333 - loss: 0.4537 - val_accuracy: 0.8076 - val_loss: 0.5078
Epoch 3/20
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.8996 - loss: 0.3032 - val_accuracy: 0.8044 - val_loss: 0.5542
Epoch 4/20
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.9296 - loss: 0.2227 - val_accuracy: 0.8045 - val_loss: 0.5857
Epoch 5/20
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - accuracy: 0.9499 - loss: 0.1580 - val_accuracy: 0.7983 - val_loss: 0.6896
Epoch 6/20
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.9610 - loss: 0.1306 - val_accuracy: 0.7995 - val_loss: 0.7145
Epoch 7/20
[1m569/

<keras.src.callbacks.history.History at 0x7d9b5ae1b4f0>

In [102]:
y_pred = model.predict(X_test_tfidf)
y_pred_classes = np.argmax(y_pred, axis=1) - 1  # Convert back to original labels (-1, 0, 1)

[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [103]:
loss, accuracy = model.evaluate(X_test_tfidf, y_test_categorical)
recall = recall_score(y_test, y_pred_classes, average='weighted')
precision = precision_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')


print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7886 - loss: 1.3540
Accuracy: 78.70%
Recall: 78.70%
Precision: 78.66%
F1 Score: 78.60%
