### Just getting started ###

In [12]:
import numpy as np

# 1. Load the vocabulary (list of words)
with open("vocab_cut.txt", "r") as f:
    words = [line.strip() for line in f]

# Check the number of words in vocab.txt
print(f"Number of words in vocab.txt: {len(words)}")

# 2. Load the embedding matrix
embedding_matrix = np.load("embeddings_transfo.npy")

# Check the shape of the embedding matrix
#print(f"Embedding matrix shape: {embedding_matrix.shape}")  # Should be (101298, 20)

# 3. Create the word-to-embedding dictionary
glove_embeddings = {words[i]: embedding_matrix[i] for i in range(len(words))}

Number of words in vocab.txt: 101298


In [23]:
import pandas as pd

# Define file paths
data_path = "data/twitter-datasets/"
train_neg_path = f"{data_path}train_neg_full.txt"
train_pos_path = f"{data_path}train_pos_full.txt"
test_path = f"{data_path}test_data.txt"

# Load negative tweets and assign a label of -1
with open(train_neg_path, "r") as f:
    neg_tweets = [(line.strip(), -1) for line in f]

# Load positive tweets and assign a label of +1
with open(train_pos_path, "r") as f:
    pos_tweets = [(line.strip(), 1) for line in f]

with open(test_path, "r") as f:
    test_tweets = [(line.strip(), -1) for line in f]

# Combine the positive and negative tweets into a single list
tweets_with_labels = neg_tweets + pos_tweets

# Optional: Shuffle the dataset (important for training)
import random
random.shuffle(tweets_with_labels)

# Convert to a DataFrame for easy manipulation and viewing
df = pd.DataFrame(tweets_with_labels, columns=["tweet", "label"])
df_test = pd.DataFrame(test_tweets, columns=["tweet", "label"])
# Display the first few rows of the DataFrame
print(df.head())
print(df_test.head())


                                               tweet  label
0                  last ever shower as a 15 year old      1
1                           just let me hold it down      1
2  <user> so glad we ran into each other last nig...      1
3  <user> if i only could mate , i'd do it . foll...      1
4  <user> yeah right i changed it for him to noti...      1
                                               tweet  label
0  1,sea doo pro sea scooter ( sports with the po...     -1
1  2,<user> shucks well i work all week so now i ...     -1
2          3,i cant stay away from bug thats my baby     -1
3  4,<user> no ma'am ! ! ! lol im perfectly fine ...     -1
4  5,whenever i fall asleep watching the tv , i a...     -1


In [18]:
def get_average_embedding(tweet, glove_embeddings, embedding_dim=20):
    words = tweet.split()  # Tokenize the tweet by splitting on whitespace
    word_vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]

    if not word_vectors:
        # If no words in the tweet have embeddings, return a zero vector
        return np.zeros(embedding_dim)
    
    # Average the word vectors
    avg_vector = np.mean(word_vectors, axis=0)
    return avg_vector

In [24]:
embedding_dim = 384  # Based on your embedding vector dimension
df["feature"] = df["tweet"].apply(lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim))
df.head()

df_test["feature"] = df_test["tweet"].apply(lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim))
feature_matrix_test = np.vstack(df_test["feature"].values)
# Convert the list of arrays in "feature" to a feature matrix for ML algorithms
feature_matrix = np.vstack(df["feature"].values)
labels = df["label"].values

# Check the shape of the feature matrix and a sample of data
print("Feature matrix shape:", feature_matrix.shape)  # Should be (number_of_tweets, embedding_dim)
print("Sample features:", feature_matrix[:5])
print("Labels:", labels[:5])

Feature matrix shape: (2500000, 384)
Sample features: [[-0.04410629  0.04913031 -0.01160404 ...  0.01251471  0.00039741
   0.02593069]
 [-0.05677286  0.02824929  0.00212249 ... -0.00254374  0.03906221
   0.03127772]
 [-0.05476969  0.01981606 -0.02334924 ...  0.01745407  0.02919325
   0.02059066]
 [-0.07227229  0.01744828  0.00086285 ...  0.00961973  0.01885824
  -0.01378786]
 [-0.05895717  0.02410818  0.00617399 ...  0.03079126  0.01713443
  -0.00498506]]
Labels: [1 1 1 1 1]


In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load your feature matrix and labels from previous steps
# feature_matrix: the matrix of averaged embeddings for each tweet
# labels: the corresponding labels (+1 for positive, -1 for negative)

# Split the data into training and test sets (e.g., 80% train, 20% test)
#X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.0, random_state=42)

X_train = feature_matrix
y_train = labels

X_test = feature_matrix_test
# Normalize the feature matrix

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Convert predictions to binary labels (+1 or -1) by rounding to nearest integer
y_pred_labels = np.where(y_pred >= 0, 1, -1)

# Evaluate the model's performance using accuracy
# accuracy = accuracy_score(y_test, y_pred_labels)
# print("Test set accuracy:", accuracy)

# # Evaluate the model's performance using F1 score
# f1 = f1_score(y_test, y_pred_labels)
# print("Test set F1 score:", f1)


In [27]:
ids = np.arange(1, len(y_pred_labels) + 1)

In [29]:
from helpers import create_csv_submission
create_csv_submission(ids, y_pred_labels, "submission_embed_transfo1.csv")