### Just getting started ###

In [1]:
import numpy as np

# 1. Load the vocabulary (list of words)
with open("vocab_cut.txt", "r") as f:
    words = [line.strip() for line in f]

# Check the number of words in vocab.txt
print(f"Number of words in vocab.txt: {len(words)}")

# 2. Load the embedding matrix
embedding_matrix = np.load("embeddings.npy")

# Check the shape of the embedding matrix
#print(f"Embedding matrix shape: {embedding_matrix.shape}")  # Should be (101298, 20)

# 3. Create the word-to-embedding dictionary
glove_embeddings = {words[i]: embedding_matrix[i] for i in range(len(words))}

Number of words in vocab.txt: 101298


In [2]:
import pandas as pd

# Define file paths
data_path = "data/twitter-datasets/"
train_neg_path = f"{data_path}train_neg_full.txt"
train_pos_path = f"{data_path}train_pos_full.txt"
test_path = f"{data_path}test_data.txt"

# Load negative tweets and assign a label of -1
with open(train_neg_path, "r") as f:
    neg_tweets = [(line.strip(), -1) for line in f]

# Load positive tweets and assign a label of +1
with open(train_pos_path, "r") as f:
    pos_tweets = [(line.strip(), 1) for line in f]

with open(test_path, "r") as f:
    test_tweets = [(line.strip(), -1) for line in f]

# Combine the positive and negative tweets into a single list
tweets_with_labels = neg_tweets + pos_tweets

# Optional: Shuffle the dataset (important for training)
import random
random.shuffle(tweets_with_labels)

# Convert to a DataFrame for easy manipulation and viewing
df = pd.DataFrame(tweets_with_labels, columns=["tweet", "label"])
df_test = pd.DataFrame(test_tweets, columns=["tweet", "label"])
# Display the first few rows of the DataFrame
print(df.head())
print(df_test.head())


                                               tweet  label
0  have you even dream that i'm not yours anymore...     -1
1  so sad , uno's leaving tomorrow , vacation's over     -1
2  <user> hey dw about it thanks anyways ! owe ya...      1
3  <user> ooo that's a shocker nd goood basketyba...      1
4  <user> you should considerrr deleting your twi...      1
                                               tweet  label
0  1,sea doo pro sea scooter ( sports with the po...     -1
1  2,<user> shucks well i work all week so now i ...     -1
2          3,i cant stay away from bug thats my baby     -1
3  4,<user> no ma'am ! ! ! lol im perfectly fine ...     -1
4  5,whenever i fall asleep watching the tv , i a...     -1


In [3]:
def get_average_embedding(tweet, glove_embeddings, embedding_dim=20):
    words = tweet.split()  # Tokenize the tweet by splitting on whitespace
    word_vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]

    if not word_vectors:
        # If no words in the tweet have embeddings, return a zero vector
        return np.zeros(embedding_dim)
    
    # Average the word vectors
    avg_vector = np.mean(word_vectors, axis=0)
    return avg_vector

In [None]:
embedding_dim = 20  # Based on your embedding vector dimension
df["feature"] = df["tweet"].apply(lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim))
df.head()

df_test["feature"] = df_test["tweet"].apply(lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim))
feature_matrix_test = np.vstack(df_test["feature"].values)
# Convert the list of arrays in "feature" to a feature matrix for ML algorithms
feature_matrix = np.vstack(df["feature"].values)
labels = df["label"].values

# Check the shape of the feature matrix and a sample of data
print("Feature matrix shape:", feature_matrix.shape)  # Should be (number_of_tweets, embedding_dim)
print("Sample features:", feature_matrix[:5])
print("Labels:", labels[:5])

Feature matrix shape: (2500000, 20)
Sample features: [[ 0.30075204  0.03078056 -0.33671506  0.59790264 -0.2940522   0.30887251
   0.18902891 -0.34908383  0.01907641  0.21592869  0.25099927  0.26876923
   0.54592424 -0.61302241  0.09055117  0.51168789  0.20999208 -0.10051028
   0.58153037 -0.24113424]
 [ 0.29901142 -0.01452917 -0.23043686  0.46373904 -0.20266416  0.16274825
   0.15461645 -0.37216849  0.07457324  0.130669    0.14910271  0.08946777
   0.45444745 -0.4830026   0.04326336  0.46095983  0.26453349 -0.05143715
   0.25681797 -0.00466864]
 [ 0.17069418  0.02189873 -0.06987763  0.39282254 -0.35960902  0.33947773
   0.19983961 -0.32279671 -0.11036849  0.18927896  0.23833146  0.29834712
   0.35724263 -0.50109079 -0.08810106  0.29771447  0.10619965 -0.18684458
   0.00088793  0.05229465]
 [ 0.21511581 -0.36253114 -0.13034872  0.29990089  0.18971807  0.24370075
  -0.30592113 -0.29622787  0.08150644  0.16852212  0.11948167  0.0265511
   0.45875313 -0.4263422  -0.17688048  0.39696826  0.

In [None]:
np.save("feature_matrix_glove", feature_matrix)

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels from previous steps
# feature_matrix: the matrix of averaged embeddings for each tweet
# labels: the corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets (e.g., 80% train, 20% test)
#X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.0, random_state=42)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42
)
# Normalize the feature matrix

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Linear Regression model
model = LinearRegression()
# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Convert predictions to binary labels (+1 or -1) by rounding to nearest integer
y_pred_labels = np.where(y_pred >= 0, 1, -1)

 #Evaluate the model's performance using accuracy
accuracy = accuracy_score(y_test, y_pred_labels)
print("Test set accuracy:", accuracy)

# Evaluate the model's performance using F1 score
f1 = f1_score(y_test, y_pred_labels)
print("Test set F1 score:", f1)


Test set accuracy: 0.572408
Test set F1 score: 0.5953653514217364


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the SVM model with default parameters
svm = SVC(random_state=42)

# Train the SVM Classifier
svm.fit(X_train, y_train)

# Predict on the test set
y_pred = svm.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)


## MLP with grid search

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the MLP Classifier model
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', 
              alpha=0.0001, learning_rate='constant', max_iter=200, tol=1e-4)

# Define hyperparameter grid for optimization
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  # Number of neurons per layer
    'activation': ['relu'],  # Activation functions
    'solver': ['sgd'],  # Optimization solvers
    'alpha': [0.0001, 0.001, 0.01],  # L2 regularization term
    'learning_rate': ['constant']  # Learning rate schedule
}

# Perform GridSearchCV for hyperparameter optimization
grid_search = GridSearchCV(
    estimator=mlp, param_grid=param_grid, 
    cv=5, scoring='f1', verbose=False, n_jobs=-1
)

# Train the MLP Classifier using GridSearch
# grid_search.fit(X_train, y_train)

mlp.fit(X_train, y_train)

# Get the best model and parameters
#best_mlp = grid_search.best_estimator_
#print("Best hyperparameters:", grid_search.best_params_)

# Predict on the test set
#y_pred = best_mlp.predict(X_test)
y_pred = mlp.predict(X_test)
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)




Test set accuracy: 0.615142
Test set F1 score: 0.6571915678984414


## MLP without grid search

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the MLP Classifier model with default parameters
mlp = MLPClassifier(random_state=42)

# Train the MLP Classifier
mlp.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)



Test set accuracy: 0.595908
Test set F1 score: 0.6444517180334561


In [14]:
import numpy as np



sample_size = 2500000
selected_indices = np.random.choice(feature_matrix.shape[0], sample_size, replace=False)

# Réduire feature_matrix et labels en fonction des indices sélectionnés
reduced_feature_matrix = feature_matrix[selected_indices]
reduced_labels = labels[selected_indices]

# Afficher les formes des nouvelles matrices
print("Original feature matrix shape:", feature_matrix.shape)
print("Reduced feature matrix shape:", reduced_feature_matrix.shape)
print("Original labels shape:", labels.shape)
print("Reduced labels shape:", reduced_labels.shape)


Original feature matrix shape: (2500000, 20)
Reduced feature matrix shape: (2500000, 20)
Original labels shape: (2500000,)
Reduced labels shape: (2500000,)


In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels from previous steps
# feature_matrix: the matrix of averaged embeddings for each tweet
# labels: the corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets (e.g., 80% train, 20% test)
#X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.0, random_state=42)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    reduced_feature_matrix, reduced_labels, test_size=0.2, random_state=42
)
# Normalize the feature matrix

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Convert predictions to binary labels (+1 or -1) by rounding to nearest integer
y_pred_labels = np.where(y_pred >= 0, 1, -1)

 #Evaluate the model's performance using accuracy
accuracy = accuracy_score(y_test, y_pred_labels)
print("Test set accuracy:", accuracy)

# Evaluate the model's performance using F1 score
f1 = f1_score(y_test, y_pred_labels)
print("Test set F1 score:", f1)


Test set accuracy: 0.57
Test set F1 score: 0.587173579109063


In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    reduced_feature_matrix, reduced_labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the SVM model with default parameters
svm = SVC(random_state=42)

# Train the SVM Classifier
svm.fit(X_train, y_train)

# Predict on the test set
y_pred = svm.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)


Test set accuracy: 0.6056
Test set F1 score: 0.665422463522226


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    reduced_feature_matrix, reduced_labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Logistic Regression model with default parameters
log_reg = LogisticRegression(random_state=42)

# Train the Logistic Regression Classifier
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)

Test set accuracy: 0.57295
Test set F1 score: 0.5947700138350385


In [27]:
ids = np.arange(1, len(y_pred_labels) + 1)

In [29]:
from helpers import create_csv_submission
create_csv_submission(ids, y_pred_labels, "submission_embed_transfo1.csv")