# Deep learning

Due to time constraint we are gonna try 2 deep learning models

LTSM because LSTMs are particularly effective for tasks involving sequential or time-series data, such as natural language processing (NLP) and speech recognition. They can capture long-term dependencies in sequences.





In [31]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [32]:
X = pd.read_csv('data/x_train.csv')
y = pd.read_csv('data/y_train.csv')
X=X['text_lemmatized']
y=y['scoreSentiment']
na_indices = X[X.isna()].index  # Assuming you want to drop rows with missing values in X

# Drop rows from X and y based on na_indices
X = X.drop(na_indices)
y = y.drop(na_indices)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X[:10000], y[:10000], test_size=0.2, random_state=42)

In [33]:
print(y_train.dtype)
print(y_test.dtype)

int32
int32


In [34]:
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test= to_categorical(y_test)

def preprocess_text(X_train, max_words, max_sequence_length):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X_train)
    sequences = tokenizer.texts_to_sequences(X_train)
    sequences = pad_sequences(sequences, maxlen=max_sequence_length)
    return tokenizer, sequences

max_words = 10000 
max_sequence_length = 100 
embedding_dim = 32
lstm_units = 32

tokenizer, X_train= preprocess_text(X_train, max_words, max_sequence_length)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_sequence_length)

modelLSTM = keras.Sequential()
modelLSTM.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
modelLSTM.add(LSTM(units=lstm_units))
modelLSTM.add(Dense(2, activation='softmax'))  # Update output layer to have two nodes
modelLSTM.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor="loss", patience=3)
modelLSTM.fit(X_train, y_train, epochs=8, batch_size=2, callbacks=[early_stopping])

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    y_pred = np.argmax(y_pred, axis=1)
    accuracy = accuracy_score(np.argmax(y, axis=1), y_pred)
    return accuracy

lstm_accuracy = evaluate_model(modelLSTM, X_test, y_test)
print("LSTM Model Accuracy:", lstm_accuracy)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
LSTM Model Accuracy: 0.7575


Here our accuracy is very good but i cannot manage to plot a confusion amtrix and because of the shape of ytest and ypred

# Second Model : Bert

In [47]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.optimizers import Adam
import tensorflow as tf


X_train, X_test, y_train, y_test = train_test_split(X[:10000], y[:10000], test_size=0.2, random_state=42)
X_val = X[10001:12000]
y_val = y[10001:12000]

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Convert X_train, X_val, and X_test to lists of strings
X_train_texts = list(X_train)
X_val_texts = list(X_val)
X_test_texts = list(X_test)

# Tokenize and encode the training data
X_train_tokens = tokenizer(X_train_texts, padding=True, truncation=True, return_tensors='tf', max_length=max_sequence_length)
X_val_tokens = tokenizer(X_val_texts, padding=True, truncation=True, return_tensors='tf', max_length=max_sequence_length)
X_test_tokens = tokenizer(X_test_texts, padding=True, truncation=True, return_tensors='tf', max_length=max_sequence_length)


# Convert labels to categorical (if not already)
y_train_categorical = to_categorical(y_train, num_classes=2)
y_val_categorical = to_categorical(y_val, num_classes=2)

# Load pre-trained BERT model
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compile the model
optimizer = Adam(learning_rate=2e-5)
bert_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Convert BatchEncoding objects to tuples for hashability
X_train_tokens_hashable = (
    X_train_tokens['input_ids'],
    X_train_tokens['token_type_ids'],
    X_train_tokens['attention_mask']
)

X_val_tokens_hashable = (
    X_val_tokens['input_ids'],
    X_val_tokens['token_type_ids'],
    X_val_tokens['attention_mask']
)

X_test_tokens_hashable = (
    X_test_tokens['input_ids'],
    X_test_tokens['token_type_ids'],
    X_test_tokens['attention_mask']
)

# Now use the hashable versions when fitting the model
history = bert_model.fit(
    X_train_tokens_hashable,
    y_train_categorical,
    validation_data=(X_val_tokens_hashable, y_val_categorical),
    epochs=3,
    batch_size=8
)

# Evaluate the model
y_pred_proba = bert_model.predict(X_test_tokens)
y_pred = tf.argmax(y_pred_proba.logits, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print("BERT Model Accuracy:", accuracy)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
 103/1000 [==>...........................] - ETA: 45:52 - loss: 0.6903 - accuracy: 0.3544

KeyboardInterrupt: 