<a href="https://colab.research.google.com/github/Exion007/Colab/blob/main/multiclass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
rawdf = pd.read_csv('file2.csv', engine="python", on_bad_lines="skip")
rawdf.dropna(inplace=True)

rawdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6345 entries, 0 to 8510
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6345 non-null   object
 1   label   6345 non-null   object
dtypes: object(2)
memory usage: 148.7+ KB


In [None]:
labels = ['AVI', 'MIS', 'OTH', 'WIN', 'WEA',
           'COC', 'PRP', 'MEC', 'ELC', 'LAG',
           'FLC', 'FFC', 'ECS', 'FLU']

def filter_by_label(df):
  filtered_df = rawdf[rawdf['label'].apply(lambda x: isinstance(x, str) and x in labels)]
  return filtered_df

def expand_contractions(text):
    contractions = {
        "n't": "not",
        "'ve": "have",
        "'ll": "will",
        "'d": "would",
        "'re": "are",
        "'s": "is",
        "'m": "am"
    }
    pattern = re.compile(r"\b(?:" + "|".join(contractions.keys()) + r")\b")
    return pattern.sub(lambda match: contractions[match.group(0)], text)


def remove_special_characters(text):
    special_characters_pattern = re.compile(r"[#%@\<>{}()=\[\]*\-\\|_½]")
    return special_characters_pattern.sub("", text.lower())

df = filter_by_label(rawdf)
new = df["text"].apply(expand_contractions)
df["text"] = new
new = df["text"].apply(remove_special_characters)
df["text"] = new

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20867 entries, 0 to 23667
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    20867 non-null  object
 1   label   20867 non-null  object
dtypes: object(2)
memory usage: 489.1+ KB


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

max_vocab_size = 15000
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


max_seq_length = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_seq_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_seq_length)

In [None]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
embedding_dim = 100
num_classes = len(df['label'].unique())

In [None]:
model = Sequential()
model.add(Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_seq_length))
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 32
num_epochs = 20
model.fit(X_train_padded, y_train_encoded, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_padded, y_test_encoded))

#y_pred_encoded = model.predict_classes(X_test_padded)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x794b8debb6a0>

In [None]:
y_pred = model.predict(X_test_padded)
y_pred_encoded = np.argmax(y_pred,axis=1)



In [None]:
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print("Accuracy:", accuracy)
print(classification_report(y_test_encoded, y_pred_encoded))

Accuracy: 0.4563967417345472
              precision    recall  f1-score   support

           0       0.51      0.63      0.56      1531
           1       0.30      0.20      0.24       376
           2       0.20      0.12      0.15         8
           3       0.00      0.00      0.00         5
           4       0.29      0.24      0.26        29
           5       0.36      0.34      0.35       103
           6       0.00      0.00      0.00         3
           7       0.29      0.18      0.22        39
           8       0.00      0.00      0.00        23
           9       0.41      0.40      0.40       824
          10       0.53      0.43      0.47       824
          11       0.26      0.27      0.26        73
          12       0.40      0.37      0.39       316
          13       0.67      0.20      0.31        20

    accuracy                           0.46      4174
   macro avg       0.30      0.24      0.26      4174
weighted avg       0.45      0.46      0.45      41