##### Deep Learing ###

1) Data Cleaning

In [5]:
from logic.processing import load_data, preproc

import re

from sklearn.model_selection import train_test_split

import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [10]:
data = load_data('Combined Data.csv')

In [11]:
data = preproc(data)

In [12]:
def remove_unicode(s):
    """Remove unicode characters from a string."""
    if isinstance(s, str):
        return re.sub(r'[^\x00-\x7F]+', '', s)
    return s

data['clean'] = data['clean'].apply(remove_unicode)

In [13]:
def clean_text(s):
    """Clean the input text by removing URLs, markdown links, mentions, and punctuation."""
    if isinstance(s, str):
        s = re.sub(r"http[s]?://\S+", "", s)
        s = re.sub(r"\[.*?\]\(.*?\)", "", s)
        s = re.sub(r"@\w+", "", s)  
        s = re.sub(r"[^\w\s]", "", s)
        s = s.strip() 
        return s
    return s



In [14]:
data['clean'] = data['clean'].apply(clean_text)

2) Split train/test

In [15]:
X = data['clean']
y = data['status']

In [16]:
X.shape,y.shape

((52681,), (52681,))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

3) Label encoding

In [18]:
y_train_encoded = pd.get_dummies(y_train).values
y_test_encoded = pd.get_dummies(y_test).values

4) Tokenisation

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

In [114]:
vocab_size = len(tokenizer.word_index)

5) Padding

In [83]:
max_len = 500

In [84]:
X_train_pad = pad_sequences(X_train_token, padding='post',maxlen=max_len)
X_test_pad = pad_sequences(X_test_token, padding='post',maxlen=max_len)

6.1) Model A

In [100]:
output_dim = y_train_encoded.shape[1]
output_dim

7

In [143]:
model_base = Sequential([
    tf.keras.Input(shape=(500,)),
    Embedding(input_dim=vocab_size+1, output_dim=50),
    LSTM(units=20),
    Dense(output_dim, activation='softmax')
])

# Compile the model
model_base.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_base.summary()

In [144]:
es = EarlyStopping(patience=4, restore_best_weights=True)

model_base.fit(X_train_pad, y_train_encoded, 
          epochs=20, 
          batch_size=64,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/20
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 175ms/step - accuracy: 0.2958 - loss: 1.6563 - val_accuracy: 0.3082 - val_loss: 1.6232
Epoch 2/20
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 173ms/step - accuracy: 0.3125 - loss: 1.6112 - val_accuracy: 0.3082 - val_loss: 1.6238
Epoch 3/20
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 171ms/step - accuracy: 0.3039 - loss: 1.6206 - val_accuracy: 0.3082 - val_loss: 1.6227
Epoch 4/20
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 177ms/step - accuracy: 0.3161 - loss: 1.6121 - val_accuracy: 0.2932 - val_loss: 1.6222
Epoch 5/20
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 173ms/step - accuracy: 0.2978 - loss: 1.6243 - val_accuracy: 0.3081 - val_loss: 1.6224
Epoch 6/20
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 173ms/step - accuracy: 0.3137 - loss: 1.6113 - val_accuracy: 0.3081 - val_loss: 1.6227
Epoch 7/20

<keras.src.callbacks.history.History at 0x145a99b10>

In [146]:
model_base.evaluate(X_test_pad,y_test_encoded)

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 35ms/step - accuracy: 0.2923 - loss: 1.6192


[1.6222203969955444, 0.2862294912338257]