# LSTM

## Tokenise and Pad sequences (Optionel, deja fait)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_text'])

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Pad sequences
max_length = 100
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding='post')


NameError: name 'df' is not defined

## Build the LSTM Model

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

max_length = 100

model = Sequential([
    Embedding(input_dim=20000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])


#### Embedding Layer :
- Purpose: Converts word indices into dense vectors (embeddings).
- Parameters:
    - output_dim = 128 : Dimension of the embedding vectors. Each word will be represented as a 128-dimensional vector.
    - input_length = max_length : Length of input sequences. Required because the model needs to know the shape of its input.

#### LSTM Layer :
- Purpose: Processes the embedded sequences and captures temporal dependencies.
- Parameters:
    - 128 : Number of units (dimensionality of the output space). This is the number of LSTM cells in the layer.
    - dropout = 0.2 : Fraction of the input units to drop (regular dropout) to prevent overfitting.
    - recurrent_dropout = 0.2 : Fraction of the recurrent units to drop (dropout on the connections between the recurrent units).

#### Dense output layer :
- Purpose: Outputs a probability between 0 and 1, indicating the sentiment.
- Parameters:
    - 1 : Single neuron because it's a binary classification problem.
    - activation = 'sigmoid': Activation function that outputs values between 0 and 1.

## Compile the model

In [2]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

- loss = 'binary_crossentropy': Loss function suitable for binary classification tasks.
- optimizer = 'adam': Adam optimizer is an efficient stochastic gradient descent method.
- metrics = ['accuracy']: Specifies the metric to evaluate during training and testing.

## Train the model

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split

# Assuming train_neg_padded and train_pos_padded are already defined
filename_pos = 'data/twitter-datasets/train_pos_embedding.txt'
filename_neg = 'data/twitter-datasets/train_neg_embedding.txt'

# Function to load embeddings
def load_embeddings(filename):
    embeddings = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                str_values = line.split()
                float_values = [float(val) for val in str_values]
                embedding = np.array(float_values)
                embeddings.append(embedding)
    return embeddings

pos_embeddings = load_embeddings(filename_pos)
neg_embeddings = load_embeddings(filename_neg)

train_neg_padded = np.array(neg_embeddings)
train_pos_padded = np.array(pos_embeddings)

# Create labels
neg_labels = np.zeros(train_neg_padded.shape[0], dtype=int)
pos_labels = np.ones(train_pos_padded.shape[0], dtype=int)

# Combine data and labels
train_padded = np.concatenate((train_neg_padded, train_pos_padded), axis=0)
train_labels = np.concatenate((neg_labels, pos_labels), axis=0)

# Shuffle data
indices = np.arange(train_padded.shape[0])
np.random.shuffle(indices)
train_padded = train_padded[indices]
train_labels = train_labels[indices]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_padded, train_labels, test_size=0.1, random_state=42)

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32,
          validation_data=(X_val, y_val))


Epoch 1/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 22ms/step - accuracy: 0.5007 - loss: 0.6934 - val_accuracy: 0.5002 - val_loss: 0.6932
Epoch 2/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 24ms/step - accuracy: 0.4976 - loss: 0.6932 - val_accuracy: 0.5002 - val_loss: 0.6928
Epoch 3/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 17ms/step - accuracy: 0.5006 - loss: 0.6930 - val_accuracy: 0.5009 - val_loss: 0.6934
Epoch 4/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 24ms/step - accuracy: 0.4988 - loss: 0.6929 - val_accuracy: 0.5001 - val_loss: 0.6931
Epoch 5/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 24ms/step - accuracy: 0.5005 - loss: 0.6928 - val_accuracy: 0.5009 - val_loss: 0.6929


<keras.src.callbacks.history.History at 0x302b1d9a0>

- train_padded : Input data (padded sequences) for training.
- train_labels : Corresponding labels (0 or 1) for training data.
- epochs=5 : Number of times the model will cycle through the entire training dataset.
- batch_size = 32  Number of samples per gradient update. Smaller batch sizes can lead to more stable gradient estimates but increase computation time.
- validation_data=(val_padded, val_labels): Data on which to evaluate the model at the end of each epoch.

# Feedforward Neural Network

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split

# Assuming train_neg_padded and train_pos_padded are already defined
filename_pos = 'data/twitter-datasets/train_pos_embedding.txt'
filename_neg = 'data/twitter-datasets/train_neg_embedding.txt'

# Function to load embeddings
def load_embeddings(filename):
    embeddings = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                str_values = line.split()
                float_values = [float(val) for val in str_values]
                embedding = np.array(float_values)
                embeddings.append(embedding)
    return embeddings

pos_embeddings = load_embeddings(filename_pos)
neg_embeddings = load_embeddings(filename_neg)

train_neg_padded = np.array(neg_embeddings)
train_pos_padded = np.array(pos_embeddings)

# Create labels
neg_labels = np.zeros(train_neg_padded.shape[0], dtype=int)
pos_labels = np.ones(train_pos_padded.shape[0], dtype=int)

# Combine data and labels
train_padded = np.concatenate((train_neg_padded, train_pos_padded), axis=0)
train_labels = np.concatenate((neg_labels, pos_labels), axis=0)

# Shuffle data
indices = np.arange(train_padded.shape[0])
np.random.shuffle(indices)
train_padded = train_padded[indices]
train_labels = train_labels[indices]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_padded, train_labels, test_size=0.1, random_state=42)

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model architecture
model = Sequential([
    Dense(128, activation='relu', input_shape=(train_padded.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32,
          validation_data=(X_val, y_val))


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 495us/step - accuracy: 0.5752 - loss: 0.6755 - val_accuracy: 0.5991 - val_loss: 0.6574
Epoch 2/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 467us/step - accuracy: 0.6008 - loss: 0.6586 - val_accuracy: 0.6040 - val_loss: 0.6534
Epoch 3/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 458us/step - accuracy: 0.6027 - loss: 0.6550 - val_accuracy: 0.6090 - val_loss: 0.6505
Epoch 4/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 478us/step - accuracy: 0.6066 - loss: 0.6512 - val_accuracy: 0.6101 - val_loss: 0.6489
Epoch 5/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 460us/step - accuracy: 0.6062 - loss: 0.6497 - val_accuracy: 0.6091 - val_loss: 0.6485
Epoch 6/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 463us/step - accuracy: 0.6088 - loss: 0.6476 - val_accuracy: 0.6129 - val_loss: 0.6441
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x30b5c27f0>

# Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)

# Evaluate the model
y_pred = log_reg.predict(X_val)
print(f'Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}')
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.5706
              precision    recall  f1-score   support

           0       0.58      0.51      0.55     10004
           1       0.56      0.63      0.59      9996

    accuracy                           0.57     20000
   macro avg       0.57      0.57      0.57     20000
weighted avg       0.57      0.57      0.57     20000



# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Initialize the model
svm_model = SVC(kernel='linear', probability=True)

# Train the model
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_val)
print(f'Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}')
print(classification_report(y_val, y_pred))


# Gradient Boosting


In [1]:
import xgboost as xgb

# Initialize the model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Train the model
xgb_model.fit(X_train, y_train)

# Evaluate the model
y_pred = xgb_model.predict(X_val)
print(f'Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}')
print(classification_report(y_val, y_pred))


ModuleNotFoundError: No module named 'xgboost'

# LSTM 2

In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# 1. Load your vocabulary and embeddings
vocab = np.load('data/vocab.pkl', allow_pickle=True)
embeddings = np.load('data/embeddings.npy')

# 2. Load tweets and labels
with open('./data/twitter-datasets/train_pos.txt', 'r', encoding='utf-8') as f:
    pos_tweets = [line.strip() for line in f]

with open('./data/twitter-datasets/train_neg.txt', 'r', encoding='utf-8') as f:
    neg_tweets = [line.strip() for line in f]

tweets = pos_tweets + neg_tweets
labels = [1]*len(pos_tweets) + [0]*len(neg_tweets)

# 3. Tokenize tweets
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# 4. Create embedding matrix
embedding_dim = embeddings.shape[1]
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    vocab_index = vocab.get(word)
    if vocab_index is not None:
        embedding_vector = embeddings[vocab_index]
        embedding_matrix[i] = embedding_vector
    else:
        # Handle out-of-vocabulary words (optional)
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

# 5. Pad sequences
max_length = 50  # Adjust based on your data
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
labels = np.array(labels)

# 6. Split data
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.1, random_state=42)

# 7. Build the model
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_length,
              trainable=False),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# 8. Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 9. Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

# 10. Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss:.4f}')
print(f'Validation Accuracy: {accuracy:.4f}')

# 11. Predict on test data
with open('./data/twitter-datasets/test_data.txt', 'r', encoding='utf-8') as f:
    test_tweets = [line.strip() for line in f]

test_sequences = tokenizer.texts_to_sequences(test_tweets)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

predictions = model.predict(test_padded)
predicted_labels = (predictions > 0.5).astype(int).flatten()

# Prepare submission
test_ids = [i for i in range(1, len(test_tweets) + 1)]  # Adjust based on your test data IDs
submission = pd.DataFrame({
    'Id': test_ids,
    'Prediction': predicted_labels
})
submission.to_csv('submission.csv', index=False)


Epoch 1/5




[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 19ms/step - accuracy: 0.6215 - loss: 0.6223 - val_accuracy: 0.6445 - val_loss: 0.5851
Epoch 2/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 19ms/step - accuracy: 0.6371 - loss: 0.5985 - val_accuracy: 0.6446 - val_loss: 0.5842
Epoch 3/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 18ms/step - accuracy: 0.6459 - loss: 0.5897 - val_accuracy: 0.6486 - val_loss: 0.5801
Epoch 4/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 19ms/step - accuracy: 0.6441 - loss: 0.5859 - val_accuracy: 0.6467 - val_loss: 0.5754
Epoch 5/5
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 18ms/step - accuracy: 0.6431 - loss: 0.5832 - val_accuracy: 0.6569 - val_loss: 0.5711
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6597 - loss: 0.5707
Validation Loss: 0.5711
Validation Accuracy: 0.6569
[1m313/313[0m [3