In [29]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from unidecode import unidecode
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import fasttext
import numpy as np


In [28]:
!pip install fasttext
!pip install unidecode



In [30]:
data = fetch_20newsgroups(subset='all', shuffle=True, remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({'text': data.data, 'label': data.target})
df = df[df['text'].apply(lambda x: len(x.strip()) > 0)]

In [31]:
len(df['label'].value_counts())

20

In [32]:
#df['text']

In [33]:
pd.set_option('display.max_colwidth', None)
df['text'].head(15)


0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [34]:
def preprocess_text(df_column):
    # Remove HTML tags
    df_column = df_column.apply(BeautifulSoup, args=('html.parser',)).apply(lambda x: x.get_text())

    # Removing accented characters
    df_column = df_column.apply(unidecode)
    df_column = df_column.str.lower()
    df_column = df_column.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
    # Stemming
    #stemmer = PorterStemmer()
    #df_column = df_column.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
    lemmatizer = WordNetLemmatizer()
    df_column = df_column.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

    # Removing stop-words
    stop_words = set(stopwords.words('english'))
    df_column = df_column.apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))


    return df_column

In [35]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
df['text'] = preprocess_text(df['text'])

  return func(x, *args, **kwargs)


In [37]:
df = df[['label','text']]

In [38]:
df.head(6)

Unnamed: 0,label,text
0,10,sure bashers pen fan pretty confused lack kind post recent pen massacre devil actually bit puzzled bit relieved however going put end nonpittsburghers relief bit praise pen man killing devil worse thought jagr showed much better regular season stats also lot fo fun watch playoff bowman let jagr lot fun next couple game since pen going beat pulp jersey anyway wa disappointed see islander lose final regular season game pen rule
1,3,brother market highperformance video card support vesa local bus mb ram doe anyone suggestionsideas diamond stealth pro local bus orchid farenheit ati graphic ultra pro highperformance vlb card please post email thank matt
2,17,finally said dream mediterranean wa new area greater year like holocaust number ist july usa sweden april still cold changed calendar nothing mentioned true let say true shall azeri woman child going pay price raped killed tortured armenian hearded something called geneva convention facist ohhh forgot armenian fight nobody ha forgot killing rapings torture kurd turk upon time ohhhh swedish redcross worker lie ever say regional killer dont like person shoot thats policyl confused search turkish plane dont know talking turkey government ha announced giving weapon azerbadjan since armenia started attack azerbadjan self karabag province search plane weapon since content announced weapon one thats confused thats right give weapon azeri since armenian started fight azerbadjan shoot armenian bread butter arm personel russian army
3,3,think scsi card dma transfer disk scsi card dma transfer containing data scsi device attached want important feature scsi ability detach device free scsi bus device typically used multitasking start transfer several device device seeking data bus free command data transfer device ready transfer data aquire bus send data ide bus start transfer bus busy disk ha seeked data transfered typically second lock process wanting bus irrespective transfer time
4,4,old jasmine drive cannot use new system understanding upsate driver modern one order gain compatability system doe anyone know inexpensive program seen formatters buit idea work another ancient device one tape drive back utility freeze system try use drive jasmine direct tape bought used w tape techmar mechanism essentially question anyone know inexpensive beckup utility use system
5,12,back high school worked lab assistant bunch experimental psychologist bell lab visual perception memory experiment used vectortype display millisecond refresh rate common case th sec quite practical experimenter probably sure wa millisecond either steve


In [39]:
df.to_csv('cleaned_dataset.csv', index=False)

In [40]:
new_df = df

In [41]:
max_sequence_length = 60 #assuming this vlaue

In [42]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(new_df['text'],new_df['label'],test_size = 0.33, random_state = 96)

In [43]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train1)
train_sequences = tokenizer.texts_to_sequences(X_train1)
test_sequences = tokenizer.texts_to_sequences(X_test1)

In [44]:
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')
vocab_size = len(tokenizer.word_index) + 1

In [45]:
y_train1

13595     4
9803      2
16587     4
9742     18
1314     19
         ..
14182    10
9049     15
2483     12
9912      0
14834     4
Name: label, Length: 12281, dtype: int64

In [None]:
#from sklearn.preprocessing import LabelEncoder
#y_train_encoded = label_encoder.fit_transform(y_train)
#y_test_encoded = label_encoder.fit_transform(y_test)

In [25]:
early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=vocab_size, output_dim=200, input_length=max_sequence_length))
lstm_model.add(LSTM(100))
lstm_model.add(Dense(100, activation='relu'))
lstm_model.add(Dense(20, activation='softmax'))
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Use early stopping during model training
lstm_model.fit(train_padded, y_train1, validation_data=(test_padded, y_test1),epochs = 50, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


<keras.src.callbacks.History at 0x7e7a66691360>

In [26]:
y_pred = lstm_model.predict(test_padded)
y_pred_class = [np.argmax(element) for element in y_pred]
acc = accuracy_score(y_test1, y_pred_class)
print(f"Accuracy: {acc * 100:.2f}%")
print(f"Confusion Matrix: {confusion_matrix(y_test1,y_pred_class)}")
lstm_classification_report = classification_report(y_test1, y_pred_class, digits=4)
print(lstm_classification_report)
print(y_pred)

Accuracy: 55.69%
Confusion Matrix: [[ 89   0   2   0   0   0   3   4  14   0   0   5   1   2   4  59  13   4
   15  50]
 [  0 112  36  15   3  19  21   2   6   2   0   6  48   4  19   1   6   1
    0   1]
 [  3  23 110  25  25  87  12   0   5   0   0   0  24   0   5   0   4   0
    0   0]
 [  0  15  35 136  55  15  15  35   7   0   0   0  33   0   1   0   0   0
    1   0]
 [  1   6  17  39 156   5  21  28  15   0   0   1  24   2   1   0   1   0
    0   0]
 [  1  24  68   9   2 184  15   1   7   1   0   2  11   0   0   0   1   0
    0   0]
 [  0  15  20   3  20   2 193   8  17   0   0   4  14   0  12   0  12   1
    1   0]
 [  2   1   3   2   4   0  14 172  52   3   0   1  45   1   9   1  10   0
    3   2]
 [  1   1   0   0   2   0   2  47 210   4   0   1  13   7   5   3  21   0
    6   4]
 [  3   1   0   0   0   0   4   5  32 228   5   0   4   1   1   2  18   1
    3  10]
 [  5   0   0   0   0   1   3   0  14  57 205   2   0   1   3   1  41   0
    0   4]
 [  1   4   3   0   1   1   6 

In [27]:
# Evaluate LSTM Model
lstm_loss, lstm_accuracy = lstm_model.evaluate(test_padded, y_test1)

print("\nLSTM Model:")
print(f"Accuracy: {lstm_accuracy}")


LSTM Model:
Accuracy: 0.5568594932556152


In [46]:
early_stop = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=vocab_size, output_dim=120, input_length=max_sequence_length))
lstm_model.add(LSTM(120))
lstm_model.add(Dense(60, activation='relu'))
lstm_model.add(Dense(20, activation='softmax'))
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Use early stopping during model training
lstm_model.fit(train_padded, y_train1, validation_data=(test_padded, y_test1),epochs = 50, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


<keras.src.callbacks.History at 0x7e7a8b3a4850>

In [47]:
###for the smallest model
y_pred = lstm_model.predict(test_padded)
y_pred_class = [np.argmax(element) for element in y_pred]
acc = accuracy_score(y_test1, y_pred_class)
print(f"Accuracy: {acc * 100:.2f}%")
print(f"Confusion Matrix: {confusion_matrix(y_test1,y_pred_class)}")
lstm_classification_report = classification_report(y_test1, y_pred_class, digits=4)
print(lstm_classification_report)
print(y_pred)

Accuracy: 52.69%
Confusion Matrix: [[ 29   4   1   0   0   0   0   4   0   2   3  15   1   1   7  45   4  13
   29 107]
 [  0 172  20  19  13   1   8   0   0   0   0  17  39   1   4   1   2   2
    3   0]
 [  0  37 192  33   9  28   1   0   0   2   0   7   7   1   3   0   0   2
    1   0]
 [  0  95  36 139  33   4   8   1   0   0   0   5  22   0   3   0   0   2
    0   0]
 [  0  88  15 130  38   3  11   3   0   3   0   5  13   1   3   1   1   1
    1   0]
 [  0  26 191  13   5  66   4   0   0   1   0  13   3   0   2   0   1   1
    0   0]
 [  0  83   2   3   6   1 112   5   1   0   3  12  88   0   3   0   1   0
    0   2]
 [  0   5   2   0   0   0   1 199   8   0   0  16  75   1   9   0   2   1
    3   3]
 [  1   5   1   0   0   0   0  85 128   7   0  17  26   4  38   3   2   4
    2   4]
 [  0   3   1   0   0   0   0   0   3 238  20  20   4   8   5   1   3   5
    7   0]
 [  0   0   3   0   0   0   0   2   0  28 263  18   1   1   3   0   3   3
   12   0]
 [  0  17   2   0   0   0   0 

In [48]:
# Evaluate LSTM Model
lstm_loss, lstm_accuracy = lstm_model.evaluate(test_padded, y_test1)

print("\nLSTM Model:")
print(f"Accuracy: {lstm_accuracy}")


LSTM Model:
Accuracy: 0.5269421339035034
