In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/news_category_dataset.csv')
df = df.loc[df['category']!='Travel']
df = df.dropna()
text_data = (df['title'] + " " + df['description']).tolist()
labels = df['category'].tolist()

In [3]:
for text in text_data:
  if type(text) != str:
    print(text)

In [4]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [5]:
max_words = 10000
max_sequence_length = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [25]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=50, input_length=max_sequence_length))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(64, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
# model.add(Conv1D(64, 5, activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

In [26]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [27]:
model.fit(X_train, y_train, epochs=3, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x78fda63b4160>

In [28]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}')

Test loss: 0.5910, Test accuracy: 0.8161


In [29]:
test_dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/LR Model Dataset new.csv")
test_dataset = test_dataset.loc[test_dataset['category']!='Health']
test_dataset

Unnamed: 0,title,description,category
0,"""Admire Lakshya Sen A Lot"": Indian Shuttler An...",Indian badminton player Anupama Upadhyaya who ...,Sports
1,"""Amazing Host"" Priyanka Chopra And Preity Zint...","""Last night I officially became a fan,"" writes...",Entertainment
2,"""Beyond People's Imagination"": Rajeev Shukla O...",Fiery centuries from Virat Kohli and KL Rahul ...,Sports
3,"""Don't Think Kohli Wants Power. He Just..."": E...",Virat Kohli just needs three more centuries to...,Sports
4,"""Good Workout"" For India: On Bangladesh Loss, ...","Bangladesh were reeling at 59/4 stage, before ...",Sports
...,...,...,...
731,YouTube's new games and UI changes might impro...,YouTube is adding playable games to its front ...,Technology
732,"'Yuvraj, Dhoni,...': Australia Legend Advices ...","The last time India hosted the ODI World Cup, ...",Sports
733,Zaggle Prepaid IPO: Zaggle IPO subscribed 31% ...,"Zaggle Prepaid IPO subscribed 31% on day 2, re...",Business
734,Zeenat Aman Recalls Norway Issuing Stamps In H...,Zeenat Aman opened up on a heartwarming surpri...,Entertainment


In [30]:
new_text_data = (test_dataset['title'] + " " + test_dataset['description']).tolist()
new_sequences = tokenizer.texts_to_sequences(new_text_data)
new_X = pad_sequences(new_sequences, maxlen=max_sequence_length)

In [31]:
predictions = model.predict(new_X)



In [32]:
decoded_predictions = label_encoder.inverse_transform(predictions.argmax(axis=1))
len(decoded_predictions)

715

In [33]:
for text, category in zip(new_text_data, decoded_predictions):
    print(f"Text: {text}\nPredicted Category: {category}\n")

Text: "Admire Lakshya Sen A Lot": Indian Shuttler Anupama Ahead Of Asian Games Indian badminton player Anupama Upadhyaya who is part of the Badminton team contingent for the Hangzhou Asian Games, said on Saturday that she admires Lakshya Sen
Predicted Category: Sports

Text: "Amazing Host" Priyanka Chopra And Preity Zinta Had This Much Fun At Jonas Brothers Concert "Last night I officially became a fan," writes Preity Zinta
Predicted Category: Entertainment

Text: "Beyond People's Imagination": Rajeev Shukla On India's Win vs Pakistan Fiery centuries from Virat Kohli and KL Rahul followed by Kuldeep Yadav's five-wicket haul powered India to 228 runs win over Pakistan
Predicted Category: Entertainment

Text: "Don't Think Kohli Wants Power. He Just...": Ex-India Star's Blunt Remark Virat Kohli just needs three more centuries to surpass Sachin Tendulkar in the list of most ODI tons
Predicted Category: Entertainment

Text: "Good Workout" For India: On Bangladesh Loss, Shastri's Big Observa

In [36]:
model.save("/content/drive/MyDrive/Colab Notebooks/category_predictor_cnn.h5")

  saving_api.save_model(
