### Just getting started ###

In [2]:
import numpy as np

# 1. Load the vocabulary (list of words)
with open("vocab_cut.txt", "r") as f:
    words = [line.strip() for line in f]

# Check the number of words in vocab.txt
print(f"Number of words in vocab.txt: {len(words)}")

# 2. Load the embedding matrix
embedding_matrix = np.load("embeddings_twitter_roberta.npy")

# Check the shape of the embedding matrix
#print(f"Embedding matrix shape: {embedding_matrix.shape}")  # Should be (101298, 20)

# 3. Create the word-to-embedding dictionary
glove_embeddings = {words[i]: embedding_matrix[i] for i in range(len(words))}

Number of words in vocab.txt: 101298


In [3]:
import pandas as pd

# Define file paths
data_path = "data/twitter-datasets/"
train_neg_path = f"{data_path}train_neg_full.txt"
train_pos_path = f"{data_path}train_pos_full.txt"
test_path = f"{data_path}test_data.txt"

# Load negative tweets and assign a label of -1
with open(train_neg_path, "r") as f:
    neg_tweets = [(line.strip(), -1) for line in f]

# Load positive tweets and assign a label of +1
with open(train_pos_path, "r") as f:
    pos_tweets = [(line.strip(), 1) for line in f]

with open(test_path, "r") as f:
    test_tweets = [(line.strip(), -1) for line in f]

# Combine the positive and negative tweets into a single list
tweets_with_labels = neg_tweets + pos_tweets

# Optional: Shuffle the dataset (important for training)
import random
random.shuffle(tweets_with_labels)

# Convert to a DataFrame for easy manipulation and viewing
df = pd.DataFrame(tweets_with_labels, columns=["tweet", "label"])
df_test = pd.DataFrame(test_tweets, columns=["tweet", "label"])
# Display the first few rows of the DataFrame
print(df.head())
print(df_test.head())


                                               tweet  label
0                          <user> now there is ! ! !      1
1  motorola razr v3r minisync - complete essentia...     -1
2  off for a shower ! will be back in an hour or ...      1
3  actual picture of me , n0 edittt ! ! ! 1 ! ! 1...      1
4  i don't feel good at all ! i'm not going to sc...     -1
                                               tweet  label
0  1,sea doo pro sea scooter ( sports with the po...     -1
1  2,<user> shucks well i work all week so now i ...     -1
2          3,i cant stay away from bug thats my baby     -1
3  4,<user> no ma'am ! ! ! lol im perfectly fine ...     -1
4  5,whenever i fall asleep watching the tv , i a...     -1


In [4]:
def get_average_embedding(tweet, glove_embeddings, embedding_dim=20):
    words = tweet.split()  # Tokenize the tweet by splitting on whitespace
    word_vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]

    if not word_vectors:
        # If no words in the tweet have embeddings, return a zero vector
        return np.zeros(embedding_dim)
    
    # Average the word vectors
    avg_vector = np.mean(word_vectors, axis=0)
    return avg_vector

In [5]:
embedding_dim = 768  # Based on your embedding vector dimension
df["feature"] = df["tweet"].apply(lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim))
df.head()

df_test["feature"] = df_test["tweet"].apply(lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim))
feature_matrix_test = np.vstack(df_test["feature"].values)
# Convert the list of arrays in "feature" to a feature matrix for ML algorithms
feature_matrix = np.vstack(df["feature"].values)
labels = df["label"].values

# Check the shape of the feature matrix and a sample of data
print("Feature matrix shape:", feature_matrix.shape)  # Should be (number_of_tweets, embedding_dim)
print("Sample features:", feature_matrix[:5])
print("Labels:", labels[:5])

Feature matrix shape: (2500000, 768)
Sample features: [[-0.01253124  0.00145689 -0.02160601 ... -0.12208135 -0.02768009
   0.02173384]
 [ 0.06072925  0.06751245 -0.01328603 ... -0.08775819  0.01282722
   0.03254552]
 [ 0.01193622  0.03922918 -0.00839926 ... -0.0674857  -0.02968713
  -0.01105892]
 [ 0.01861416  0.01742793 -0.0232915  ... -0.04378478 -0.01142052
   0.00456361]
 [ 0.03111274  0.0680578  -0.00779726 ... -0.06426999 -0.02902201
  -0.00197017]]
Labels: [ 1 -1  1  1 -1]


In [6]:
np.save("labels_twitter_roberta",labels)

In [6]:
np.save("feature_matrix_twitter_roberta", feature_matrix)

In [2]:
import numpy as np
feature_matrix = np.load("feature_matrix_twitter_roberta.npy")

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels from previous steps
# feature_matrix: the matrix of averaged embeddings for each tweet
# labels: the corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)

# Normalize the feature matrix

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Convert predictions to binary labels (+1 or -1) by rounding to nearest integer
y_pred_labels = np.where(y_pred >= 0, 1, -1)

# Evaluate the model's performance using accuracy
# accuracy = accuracy_score(y_test, y_pred_labels)
# print("Test set accuracy:", accuracy)

# # Evaluate the model's performance using F1 score
# f1 = f1_score(y_test, y_pred_labels)
# print("Test set F1 score:", f1)


: 

In [8]:
#Evaluate the model's performance using accuracy
accuracy = accuracy_score(y_test, y_pred_labels)
print("Test set accuracy:", accuracy)

#Evaluate the model's performance using F1 score
f1 = f1_score(y_test, y_pred_labels)
print("Test set F1 score:", f1)


Test set accuracy: 0.785822
Test set F1 score: 0.7935765050126932


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the MLP Classifier model with default parameters
mlp = MLPClassifier(random_state=42)

# Train the MLP Classifier
mlp.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)



Test set accuracy: 0.841316
Test set F1 score: 0.8438118860607492


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels (replace with actual variables)
# feature_matrix: averaged embeddings for each tweet
# labels: corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42
)

# Normalize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize Logistic Regression model
log_reg = LogisticRegression()

# Define hyperparameter grid to optimize
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0, 15, 0.2, 0.3],  # Regularization strength
    'solver': ['saga', 'liblinear', 'lbfgs'],  # Added solvers
    'max_iter': [100, 200, 300],  # Maximum number of iterations
    'tol': [1e-4, 1e-3, 1e-2]  # Tolerance for stopping criteria
}

# Perform GridSearchCV for hyperparameter optimization
grid_search = GridSearchCV(
    estimator=log_reg, param_grid=param_grid, 
    cv=5, scoring='f1', verbose=False, n_jobs=-1
)

# Train the model using GridSearch
grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test set accuracy:", accuracy)
print("Test set F1 score:", f1)

NameError: name 'feature_matrix' is not defined

In [27]:
ids = np.arange(1, len(y_pred_labels) + 1)

In [29]:
from helpers import create_csv_submission
create_csv_submission(ids, y_pred_labels, "submission_embed_transfo1.csv")