### Load model ###

In [None]:
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel

#embedding_matrix = np.load("embeddings_transfo.npy")
from sentence_transformers import SentenceTransformer

import torch

model_name = 'sentence-transformers/all-MiniLM-L6-v2'

# Choose the compute device at runtime instead of assuming Apple-silicon MPS,
# so the notebook also runs on CUDA machines and on CPU-only machines.
# `torch.backends.mps` does not exist on older torch builds, hence the getattr.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the all-MiniLM-L6-v2 model directly onto the selected device; passing
# `device=` makes the model itself aware of where it should run.
model = SentenceTransformer(model_name, device=device)

# Last expression of the cell: moves the model (a no-op if already there) and
# displays the module structure as the cell output.
model.to(device)


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [9]:
import pandas as pd

import random

# Define file paths
data_path = "data/twitter-datasets/"
train_neg_path = f"{data_path}train_neg_full.txt"
train_pos_path = f"{data_path}train_pos_full.txt"
test_path = f"{data_path}test_data.txt"

# Fixed seed so the shuffled row order is identical on every run.
SHUFFLE_SEED = 42


def _read_lines(path):
    """Return every line of the text file at ``path``, stripped of surrounding whitespace.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


def _strip_id_prefix(line):
    """Remove a leading ``<id>,`` from a test-set line and return the tweet text.

    The previous fixed two-character slice was only right for single-digit
    ids; for longer ids it left a comma and/or digits in the tweet text.
    Lines without a numeric prefix are returned unchanged.
    NOTE(review): assumes each test line has the form ``<id>,<tweet>`` —
    confirm against the data file.
    """
    prefix, sep, rest = line.partition(",")
    if sep and prefix.strip().isdigit():
        return rest
    return line


# Load negative tweets and assign a label of -1
neg_tweets = [(text, -1) for text in _read_lines(train_neg_path)]
print(len(neg_tweets))

# Load positive tweets and assign a label of +1
pos_tweets = [(text, 1) for text in _read_lines(train_pos_path)]
print(len(pos_tweets))

# Test tweets have no known label; -1 is only a placeholder so that both
# DataFrames share the same columns.
test_tweets = [(_strip_id_prefix(text), -1) for text in _read_lines(test_path)]
print(len(test_tweets))

# Combine the positive and negative tweets into a single list
tweets_with_labels = neg_tweets + pos_tweets

# Shuffle with a private, seeded generator: reproducible, and it leaves the
# global `random` state untouched.
random.Random(SHUFFLE_SEED).shuffle(tweets_with_labels)

# Convert to DataFrames and add the whitespace-token count of each tweet
df = pd.DataFrame(tweets_with_labels, columns=["tweet", "label"])
df["word_count"] = df["tweet"].str.split().str.len()
df_test = pd.DataFrame(test_tweets, columns=["tweet", "label"])
df_test["word_count"] = df_test["tweet"].str.split().str.len()

# Display the first few rows of each DataFrame
print(df.head())
print(df_test.head())


1250000
1250000
10000
                                               tweet  label  word_count
0  replacement battery for dell inspiron 300m ( 4...     -1          24
1                      <user> to bad i have no plans     -1           7
2  <user> but my house there the sky is just norm...     -1          11
3  please follow back rt <user> #teamfollowback #...     -1          13
4    2moro myself , <user> and <user> hit vc midrand      1           9
                                               tweet  label  word_count
0  sea doo pro sea scooter ( sports with the port...     -1          22
1  <user> shucks well i work all week so now i ca...     -1          27
2            i cant stay away from bug thats my baby     -1           9
3  <user> no ma'am ! ! ! lol im perfectly fine an...     -1          15
4  whenever i fall asleep watching the tv , i alw...     -1          15


In [None]:
import torch

# Sentence embeddings for all training tweets, returned as a single tensor.
train_sentences = df["tweet"].values
features_matrix_train = model.encode(train_sentences, convert_to_tensor=True)

# Word counts become one extra float32 column of shape (num_samples, 1),
# moved to `device` so it can be concatenated with the embeddings.
train_word_counts = torch.tensor(df["word_count"].values, dtype=torch.float32)
word_count_tensor = train_word_counts.unsqueeze(1).to(device)

# Final training features: (num_samples, embedding_dim + 1)
features_all_train = torch.cat((features_matrix_train, word_count_tensor), dim=1)


KeyboardInterrupt: 

In [None]:
import torch

# Embed the test tweets with the same sentence encoder used for training.
test_sentences = df_test["tweet"].values
features_matrix_test = model.encode(test_sentences, convert_to_tensor=True)

# Append the per-tweet word count as an additional float32 column
# of shape (num_samples, 1), placed on `device`.
test_word_counts = torch.tensor(df_test["word_count"].values, dtype=torch.float32)
word_count_tensor_test = test_word_counts.unsqueeze(1).to(device)

# Final test features: (num_samples, embedding_dim + 1)
features_all_test = torch.cat((features_matrix_test, word_count_tensor_test), dim=1)

In [17]:
# Sanity check: rows = number of test tweets, columns = embedding size plus
# the one appended word-count column.
features_all_test.shape

torch.Size([10000, 385])

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
# Load your feature matrix and labels from previous steps
# feature_matrix: the matrix of averaged embeddings for each tweet
# labels: the corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets (e.g., 80% train, 20% test)
#X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.0, random_state=42)

def _as_numpy(features):
    """Return ``features`` as a NumPy array backed by CPU memory.

    The feature matrices are torch tensors that may live on an accelerator
    (the word-count column is explicitly moved to ``device``). A tensor can
    only be converted to NumPy from the CPU, so tensors are detached and
    copied to the CPU before being handed to scikit-learn. Inputs that are
    not tensors are passed through ``np.asarray`` unchanged.
    """
    if hasattr(features, "detach"):
        features = features.detach().cpu().numpy()
    return np.asarray(features)


X_train = _as_numpy(features_all_train)
y_train = df["label"].values

X_test = _as_numpy(features_all_test)

# Standardize the features: statistics are fitted on the training set only
# and then applied unchanged to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the SGDClassifier. A fixed random_state makes the shuffling done
# during fitting, and therefore the predictions, reproducible.
# NOTE: this rebinds `model`, which previously held the sentence encoder;
# the name is kept because the prediction cell below relies on it. Re-run the
# model-loading cell before re-running any embedding cell.
model = SGDClassifier(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test set. No ground-truth test labels (`y_test`) are defined
# in this notebook, so accuracy/F1 cannot be computed here.
y_pred = model.predict(X_test)



In [6]:

# Predict a label for every row of the scaled test feature matrix using the
# fitted classifier.
y_pred = model.predict(X_test)

# NOTE(review): no `y_test` is defined in the visible cells, so accuracy / F1
# on this split cannot be computed here; the stale commented-out evaluation
# code that referenced it has been dropped.


In [10]:
# Number of test tweets predicted as negative (-1).
negative_mask = y_pred == -1
print(int(negative_mask.sum()))

4833


In [8]:
# One-based ids, one per prediction: 1, 2, ..., len(y_pred).
num_predictions = len(y_pred)
ids = np.arange(num_predictions) + 1

In [9]:
from helpers import create_csv_submission

# Hand the ids and predictions to the project helper together with the target
# file name. The helper lives in helpers.py (not shown here); it is expected
# to write the submission CSV — confirm the exact format there.
submission_path = "submission_embed_transfo4_length.csv"
create_csv_submission(ids, y_pred, submission_path)