In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply preprocessing to the 'text' column
df['description'] = df['short_description'] + df['headline']
df['cleaned_description'] = df['description'].apply(preprocess_text)

In [4]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

# Fit and transform the cleaned text
X = vectorizer.fit_transform(df['cleaned_description']).toarray()

# X is now a sparse matrix of TF-IDF features
print(X.shape)  # Display the shape of the matrix

(209527, 10000)


In [5]:
# Encode the target labels (categories)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])

In [6]:
# One-hot encode the target labels
y = to_categorical(y)

In [7]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Print the shape of the resulting matrix
print("Shape of TF-IDF matrix:", X.shape)

# Feature size (number of columns in the matrix)
feature_size = X.shape[1]
print("Feature size:", feature_size)

Shape of TF-IDF matrix: (209527, 10000)
Feature size: 10000


In [9]:
# Define the Neural Network model
model = Sequential()

# Add input layer and first hidden layer (dense layer)
model.add(Dense(512, input_dim=X.shape[1], activation='relu'))
model.add(Dropout(0.5))  # Dropout for regularization

# Add second hidden layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))

# Add output layer with softmax activation for multi-class classification
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer has as many neurons as categories

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 58ms/step - accuracy: 0.4039 - loss: 2.3717 - val_accuracy: 0.5704 - val_loss: 1.5623
Epoch 2/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 53ms/step - accuracy: 0.5953 - loss: 1.4578 - val_accuracy: 0.5800 - val_loss: 1.5154
Epoch 3/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 53ms/step - accuracy: 0.6471 - loss: 1.2216 - val_accuracy: 0.5814 - val_loss: 1.5180
Epoch 4/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 53ms/step - accuracy: 0.7007 - loss: 1.0157 - val_accuracy: 0.5791 - val_loss: 1.5646
Epoch 5/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 53ms/step - accuracy: 0.7455 - loss: 0.8388 - val_accuracy: 0.5752 - val_loss: 1.6484
Epoch 6/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 54ms/step - accuracy: 0.7858 - loss: 0.6947 - val_accuracy: 0.5717 - val_loss: 1.745

In [10]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m1310/1310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.5659 - loss: 2.2638
Test Accuracy: 0.5648


In [11]:
# Define the Neural Network model
model = Sequential()

# Add input layer and first hidden layer (dense layer)
model.add(Dense(512, input_dim=X.shape[1], activation='relu'))
model.add(Dropout(0.5))  # Dropout for regularization

# Add second hidden layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))

# Add third hidden layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

# Add output layer with softmax activation for multi-class classification
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer has as many neurons as categories

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 61ms/step - accuracy: 0.3499 - loss: 2.6107 - val_accuracy: 0.5305 - val_loss: 1.7442
Epoch 2/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 56ms/step - accuracy: 0.5440 - loss: 1.7219 - val_accuracy: 0.5541 - val_loss: 1.6515
Epoch 3/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 55ms/step - accuracy: 0.5991 - loss: 1.4586 - val_accuracy: 0.5577 - val_loss: 1.6371
Epoch 4/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 54ms/step - accuracy: 0.6450 - loss: 1.2463 - val_accuracy: 0.5619 - val_loss: 1.6771
Epoch 5/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 56ms/step - accuracy: 0.6860 - loss: 1.0800 - val_accuracy: 0.5590 - val_loss: 1.7369
Epoch 6/10
[1m2620/2620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 55ms/step - accuracy: 0.7217 - loss: 0.9383 - val_accuracy: 0.5582 - val_loss: 1.813

In [12]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m1310/1310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.5490 - loss: 2.1945
Test Accuracy: 0.5508


In [13]:
# Define the Neural Network model
model = Sequential()

# Add input layer and first hidden layer (dense layer)
model.add(Dense(512, input_dim=X.shape[1], activation='relu'))
model.add(Dropout(0.5))  # Dropout for regularization

# Add second hidden layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))

# Add output layer with softmax activation for multi-class classification
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer has as many neurons as categories

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m5239/5239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 52ms/step - accuracy: 0.4188 - loss: 2.2940 - val_accuracy: 0.5673 - val_loss: 1.5640
Epoch 2/10
[1m5239/5239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 49ms/step - accuracy: 0.6007 - loss: 1.4349 - val_accuracy: 0.5771 - val_loss: 1.5141
Epoch 3/10
[1m5239/5239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 53ms/step - accuracy: 0.6611 - loss: 1.1822 - val_accuracy: 0.5798 - val_loss: 1.5306
Epoch 4/10
[1m5239/5239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 48ms/step - accuracy: 0.7081 - loss: 0.9915 - val_accuracy: 0.5789 - val_loss: 1.5785
Epoch 5/10
[1m5239/5239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 44ms/step - accuracy: 0.7517 - loss: 0.8297 - val_accuracy: 0.5769 - val_loss: 1.6665
Epoch 6/10
[1m5239/5239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 43ms/step - accuracy: 0.7872 - loss: 0.6981 - val_accuracy: 0.5739 - val_loss: 1.780

In [14]:
model.summary()

In [15]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m1310/1310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - accuracy: 0.5652 - loss: 2.2750
Test Accuracy: 0.5636


In [16]:
# Define the Neural Network model
model = Sequential()

# Add input layer and first hidden layer (dense layer)
model.add(Dense(256, input_dim=X.shape[1], activation='relu'))
model.add(Dropout(0.5))  # Dropout for regularization

# Add output layer with softmax activation for multi-class classification
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer has as many neurons as categories

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))

Epoch 1/10
[1m1310/1310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.3566 - loss: 2.6315

KeyboardInterrupt: 

In [None]:
model.summary()

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Define the Neural Network model
model = Sequential()

# Add input layer and first hidden layer (dense layer)
model.add(Dense(512, input_dim=X.shape[1], activation='relu'))

model.add(Dense(256, activation='relu'))

# Add output layer with softmax activation for multi-class classification
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer has as many neurons as categories

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")