### Just getting started ###

In [1]:
import numpy as np

# 1. Load the vocabulary (list of words)
with open("vocab_cut.txt", "r") as f:
    words = [line.strip() for line in f]

# Check the number of words in vocab.txt
print(f"Number of words in vocab.txt: {len(words)}")

# 2. Load the embedding matrix
embedding_matrix = np.load("embeddings_transfo.npy")

# Check the shape of the embedding matrix
#print(f"Embedding matrix shape: {embedding_matrix.shape}")  # Should be (101298, 20)

# 3. Create the word-to-embedding dictionary
glove_embeddings = {words[i]: embedding_matrix[i] for i in range(len(words))}

Number of words in vocab.txt: 101298


In [2]:
import pandas as pd

# Define file paths
data_path = "data/twitter-datasets/"
train_neg_path = f"{data_path}train_neg_full.txt"
train_pos_path = f"{data_path}train_pos_full.txt"
test_path = f"{data_path}test_data.txt"

# Load negative tweets and assign a label of -1
with open(train_neg_path, "r") as f:
    neg_tweets = [(line.strip(), -1) for line in f]

# Load positive tweets and assign a label of +1
with open(train_pos_path, "r") as f:
    pos_tweets = [(line.strip(), 1) for line in f]

with open(test_path, "r") as f:
    test_tweets = [(line.strip(), -1) for line in f]

# Combine the positive and negative tweets into a single list
tweets_with_labels = neg_tweets + pos_tweets

# Optional: Shuffle the dataset (important for training)
import random
random.shuffle(tweets_with_labels)

# Convert to a DataFrame for easy manipulation and viewing
df = pd.DataFrame(tweets_with_labels, columns=["tweet", "label"])
df_test = pd.DataFrame(test_tweets, columns=["tweet", "label"])
# Display the first few rows of the DataFrame
print(df.head())
print(df_test.head())


                                               tweet  label
0           stupid ice coffee gave me a stomach ache     -1
1  <user> ya ! seudah . plus decor is way prettie...      1
2  <user> just watched your jenga game with <user...      1
3  oh pandora , why do you have to be nice to me ...     -1
4  ass grabbing while kissing > > > #somethingshe...      1
                                               tweet  label
0  1,sea doo pro sea scooter ( sports with the po...     -1
1  2,<user> shucks well i work all week so now i ...     -1
2          3,i cant stay away from bug thats my baby     -1
3  4,<user> no ma'am ! ! ! lol im perfectly fine ...     -1
4  5,whenever i fall asleep watching the tv , i a...     -1


In [3]:
def get_average_embedding(tweet, glove_embeddings, embedding_dim=20):
    words = tweet.split()  # Tokenize the tweet by splitting on whitespace
    word_vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]

    if not word_vectors:
        # If no words in the tweet have embeddings, return a zero vector
        return np.zeros(embedding_dim)
    
    # Average the word vectors
    avg_vector = np.mean(word_vectors, axis=0)
    return avg_vector

In [4]:
embedding_dim = 384  # Based on your embedding vector dimension
df["feature"] = df["tweet"].apply(lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim))
df.head()

df_test["feature"] = df_test["tweet"].apply(lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim))
feature_matrix_test = np.vstack(df_test["feature"].values)
# Convert the list of arrays in "feature" to a feature matrix for ML algorithms
feature_matrix = np.vstack(df["feature"].values)
labels = df["label"].values

# Check the shape of the feature matrix and a sample of data
print("Feature matrix shape:", feature_matrix.shape)  # Should be (number_of_tweets, embedding_dim)
print("Sample features:", feature_matrix[:5])
print("Labels:", labels[:5])

Feature matrix shape: (2500000, 384)
Sample features: [[-0.04732432  0.01590263 -0.01478511 ...  0.02944557 -0.00037405
   0.02391698]
 [-0.06798897  0.0254344   0.01065353 ...  0.01780745  0.03044034
  -0.00491464]
 [-0.04491756  0.02407607 -0.00086558 ...  0.0276089   0.00562347
   0.0198721 ]
 [-0.04566717  0.02825467 -0.00125924 ...  0.03640893  0.02461299
   0.01075954]
 [-0.05077406  0.03024042  0.01428271 ...  0.03757446  0.01713703
   0.01155091]]
Labels: [-1  1  1 -1  1]


In [7]:
np.save("feature_matrix_allMiniLM", feature_matrix)
np.save("labels_allMiniLM",labels)


In [9]:
feature_matrix = np.load("feature_matrix_allMiniLM.npy")
labels=np.load("labels_allMiniLM.npy")

## Linear regression

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels from previous steps
# feature_matrix: the matrix of averaged embeddings for each tweet
# labels: the corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)

# Normalize the feature matrix

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Convert predictions to binary labels (+1 or -1) by rounding to nearest integer
y_pred_labels = np.where(y_pred >= 0, 1, -1)

# Evaluate the model's performance using accuracy
# accuracy = accuracy_score(y_test, y_pred_labels)
# print("Test set accuracy:", accuracy)

# # Evaluate the model's performance using F1 score
# f1 = f1_score(y_test, y_pred_labels)
# print("Test set F1 score:", f1)


In [6]:
#Evaluate the model's performance using accuracy
accuracy = accuracy_score(y_test, y_pred_labels)
print("Test set accuracy:", accuracy)

#Evaluate the model's performance using F1 score
f1 = f1_score(y_test, y_pred_labels)
print("Test set F1 score:", f1)


Test set accuracy: 0.78639
Test set F1 score: 0.7939118068727581


## MLP sklearn

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the MLP Classifier model with default parameters
mlp = MLPClassifier(random_state=42)

# Train the MLP Classifier
mlp.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)



Test set accuracy: 0.841316
Test set F1 score: 0.8438118860607492


## MLP torch (100 epochs)

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from tqdm import tqdm

# Vérifier si le GPU MPS est disponible
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)


# Division en train/test
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42
)

# Normalisation des données
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Conversion des données en tenseurs PyTorch
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).to(device)

# Définir un modèle MLP
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),  # Couche cachée
            nn.ReLU(),  # Fonction d'activation ReLU
            nn.Linear(hidden_dim, output_dim),  # Couche de sortie
            nn.Sigmoid()  # Fonction d'activation pour la classification binaire
        )

    def forward(self, x):
        return self.model(x)

# Initialiser le modèle, la fonction de perte et l'optimiseur
input_dim = X_train.shape[1]
hidden_dim = 50  # 50 neurones dans la couche cachée
output_dim = 1   # Sortie binaire (sigmoïde)

model = MLP(input_dim, hidden_dim, output_dim).to(device)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Entraînement du modèle
epochs = 100
for epoch in tqdm(range(epochs), desc="Training Progress", unit="epoch"):
    model.train()
    optimizer.zero_grad()
    
    # Prédiction et calcul de la perte
    outputs = model(X_train).squeeze()
    loss = criterion(outputs, (y_train + 1) / 2)  # Transformer -1/+1 en 0/1
    
    # Backpropagation et optimisation
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# Évaluation du modèle
model.eval()
with torch.no_grad():
    y_pred_proba = model(X_test).squeeze()
    y_pred = torch.where(y_pred_proba >= 0.5, 1.0, -1.0)  # Seuil à 0.5 pour binariser les prédictions

# Conversion en NumPy pour sklearn
y_test = y_test.cpu().numpy()
y_pred = y_pred.cpu().numpy()

# Évaluation avec accuracy et F1-score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Test set accuracy: {accuracy:.4f}")
print(f"Test set F1 score: {f1:.4f}")


Using device: mps
Epoch [10/100], Loss: 0.5626
Epoch [20/100], Loss: 0.5044
Epoch [30/100], Loss: 0.4732
Epoch [40/100], Loss: 0.4537
Epoch [50/100], Loss: 0.4409
Epoch [60/100], Loss: 0.4320
Epoch [70/100], Loss: 0.4254
Epoch [80/100], Loss: 0.4202
Epoch [90/100], Loss: 0.4159
Epoch [100/100], Loss: 0.4122
Test set accuracy: 0.8057
Test set F1 score: 0.8101


## Pour des essais réduction du nombre de tweets

In [10]:
import numpy as np



sample_size = 2500
selected_indices = np.random.choice(feature_matrix.shape[0], sample_size, replace=False)

# Réduire feature_matrix et labels en fonction des indices sélectionnés
reduced_feature_matrix = feature_matrix[selected_indices]
reduced_labels = labels[selected_indices]

# Afficher les formes des nouvelles matrices
print("Original feature matrix shape:", feature_matrix.shape)
print("Reduced feature matrix shape:", reduced_feature_matrix.shape)
print("Original labels shape:", labels.shape)
print("Reduced labels shape:", reduced_labels.shape)

Original feature matrix shape: (2500000, 384)
Reduced feature matrix shape: (2500, 384)
Original labels shape: (2500000,)
Reduced labels shape: (2500,)


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels from previous steps
# feature_matrix: the matrix of averaged embeddings for each tweet
# labels: the corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(reduced_feature_matrix, reduced_labels, test_size=0.2, random_state=42)

# Normalize the feature matrix

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Convert predictions to binary labels (+1 or -1) by rounding to nearest integer
y_pred_labels = np.where(y_pred >= 0, 1, -1)

# Evaluate the model's performance using accuracy
accuracy = accuracy_score(y_test, y_pred_labels)
print("Test set accuracy:", accuracy)

# Evaluate the model's performance using F1 score
f1 = f1_score(y_test, y_pred_labels)
print("Test set F1 score:", f1)


Test set accuracy: 0.78632
Test set F1 score: 0.7943882067664833


In [9]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm  # Barre de progression
import numpy as np


# Split des données
X_train, X_test, y_train, y_test = train_test_split(
    reduced_feature_matrix, reduced_labels, test_size=0.2, random_state=42
)

# Normalisation
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convertir en tenseurs torch et déplacer sur GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).to(device)

# Modèle de régression linéaire
class LinearRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.linear(x)

# Initialiser et entraîner le modèle
model = LinearRegressionModel(X_train.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Entraînement avec barre de progression
epochs = 1000
for epoch in tqdm(range(epochs), desc="Training Progress", unit="epoch"):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train).squeeze()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

# Prédiction
model.eval()
with torch.no_grad():
    y_pred = model(X_test).squeeze()
    y_pred_labels = torch.where(y_pred >= 0, torch.tensor(1.0).to(device), torch.tensor(-1.0).to(device))

# Évaluation
accuracy = (y_pred_labels == y_test).float().mean().item()
print("Test set accuracy:", accuracy)


Training Progress: 100%|██████████| 1000/1000 [00:08<00:00, 122.39epoch/s]

Test set accuracy: 0.7866399884223938





In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Vérifier si le GPU MPS est disponible
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)


# Division en train/test
X_train, X_test, y_train, y_test = train_test_split(
    reduced_feature_matrix, reduced_labels, test_size=0.2, random_state=42
)

# Normalisation des données
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Conversion des données en tenseurs PyTorch
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).to(device)

# Définir un modèle MLP
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),  # Couche cachée
            nn.ReLU(),  # Fonction d'activation ReLU
            nn.Linear(hidden_dim, output_dim),  # Couche de sortie
            nn.Sigmoid()  # Fonction d'activation pour la classification binaire
        )

    def forward(self, x):
        return self.model(x)

# Initialiser le modèle, la fonction de perte et l'optimiseur
input_dim = X_train.shape[1]
hidden_dim = 50  # 50 neurones dans la couche cachée
output_dim = 1   # Sortie binaire (sigmoïde)

model = MLP(input_dim, hidden_dim, output_dim).to(device)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Entraînement du modèle
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    # Prédiction et calcul de la perte
    outputs = model(X_train).squeeze()
    loss = criterion(outputs, (y_train + 1) / 2)  # Transformer -1/+1 en 0/1
    
    # Backpropagation et optimisation
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# Évaluation du modèle
model.eval()
with torch.no_grad():
    y_pred_proba = model(X_test).squeeze()
    y_pred = torch.where(y_pred_proba >= 0.5, 1.0, -1.0)  # Seuil à 0.5 pour binariser les prédictions

# Conversion en NumPy pour sklearn
y_test = y_test.cpu().numpy()
y_pred = y_pred.cpu().numpy()

# Évaluation avec accuracy et F1-score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Test set accuracy: {accuracy:.4f}")
print(f"Test set F1 score: {f1:.4f}")


Using device: mps
Epoch [10/100], Loss: 0.5595
Epoch [20/100], Loss: 0.4993
Epoch [30/100], Loss: 0.4694
Epoch [40/100], Loss: 0.4508
Epoch [50/100], Loss: 0.4387
Epoch [60/100], Loss: 0.4301
Epoch [70/100], Loss: 0.4236
Epoch [80/100], Loss: 0.4183
Epoch [90/100], Loss: 0.4137
Epoch [100/100], Loss: 0.4095
Test set accuracy: 0.8052
Test set F1 score: 0.8111


In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    reduced_feature_matrix, reduced_labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the MLP Classifier model with default parameters
mlp = MLPClassifier(random_state=42)

# Train the MLP Classifier
mlp.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)

Test set accuracy: 0.79286
Test set F1 score: 0.7968139995683989




In [27]:
ids = np.arange(1, len(y_pred_labels) + 1)

In [29]:
from helpers import create_csv_submission
create_csv_submission(ids, y_pred_labels, "submission_embed_transfo1.csv")