### Just getting started ###

In [6]:
import numpy as np

# 1. Load the vocabulary: one token per line. Line i of the file is paired
#    with row i of the embedding matrix further down.
with open("vocab_cut.txt", "r") as f:
    words = [line.strip() for line in f]

print(f"Number of words in vocab.txt: {len(words)}")

# 2. Load the embedding matrix (one row per vocabulary entry)
embedding_matrix = np.load("embeddings.npy")

print(f"Embedding matrix shape: {embedding_matrix.shape}")  # (number_of_words, embedding_dim)

# The two files are paired purely by position, so a length mismatch means
# they come from different runs. Fail loudly here instead of hitting an
# IndexError mid-comprehension or silently leaving matrix rows unmapped.
if len(words) != embedding_matrix.shape[0]:
    raise ValueError(
        f"Vocabulary has {len(words)} entries but the embedding matrix has "
        f"{embedding_matrix.shape[0]} rows; the files do not belong together."
    )

# 3. Create the word-to-embedding dictionary.
#    Iterating a 2-D array yields its rows, so zip pairs word i with row i.
glove_embeddings = dict(zip(words, embedding_matrix))

# Example: Access the embedding for the word "dog"
word_embedding = glove_embeddings.get("dog")  # Replace "dog" with any word
if word_embedding is not None:
    print(f"Embedding for 'dog': {word_embedding}")
else:
    print("Word not found in vocabulary")

Number of words in vocab.txt: 101298
Embedding matrix shape: (101298, 20)
Embedding for 'dog': [ 0.08723993  0.198662   -0.60317519  0.42002618  0.09527151  0.44971682
 -0.16778793 -0.24183066  0.53596791 -0.32699965  1.05951738  0.68060991
  0.45617437 -0.47227478 -0.39513353  0.35734509 -0.34283911  0.17890524
  0.33702634 -0.10559697]


In [12]:
import random

import pandas as pd

# Define file paths
data_path = "data/twitter-datasets/"
train_neg_path = f"{data_path}train_neg.txt"
train_pos_path = f"{data_path}train_pos.txt"

# Load negative tweets (one per line) and assign a label of -1
with open(train_neg_path, "r") as f:
    neg_tweets = [(line.strip(), -1) for line in f]

# Load positive tweets (one per line) and assign a label of +1
with open(train_pos_path, "r") as f:
    pos_tweets = [(line.strip(), 1) for line in f]

# Combine the positive and negative tweets into a single list
tweets_with_labels = neg_tweets + pos_tweets

# Shuffle so the two classes are interleaved. A dedicated, seeded generator
# makes the row order (and every output derived from it) identical on each
# run, without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(tweets_with_labels)

# Convert to a DataFrame for easy manipulation and viewing
df = pd.DataFrame(tweets_with_labels, columns=["tweet", "label"])

# Display the first few rows of the DataFrame
print(df.head())


                                               tweet  label
0  i wonder who are you talking about ... it's de...     -1
1                 glad to b off dnt feel good at all     -1
2  loving my peeps on twitter protesting the fuck...      1
3  i love you <user> wait for tonight's clip of b...      1
4  <user> you and <user> will make the cutest cou...      1


In [13]:
def get_average_embedding(tweet, glove_embeddings, embedding_dim=20):
    """Return the mean embedding of a tweet's in-vocabulary tokens.

    The tweet is tokenized by splitting on whitespace. Tokens that are not
    keys of ``glove_embeddings`` are ignored.

    Args:
        tweet: Text to embed.
        glove_embeddings: Mapping from token to its embedding vector.
        embedding_dim: Length of the zero vector returned when none of the
            tweet's tokens has an embedding.

    Returns:
        The element-wise mean of the vectors that were found, or
        ``np.zeros(embedding_dim)`` when no token was found.
    """
    found_vectors = []
    for token in tweet.split():
        if token in glove_embeddings:
            found_vectors.append(glove_embeddings[token])

    # Nothing to average: fall back to an all-zero vector.
    if len(found_vectors) == 0:
        return np.zeros(embedding_dim)

    return np.mean(found_vectors, axis=0)

In [15]:
# Take the embedding width from the loaded matrix instead of hard-coding it,
# so the zero-vector fallback always has the same length as real embeddings.
embedding_dim = embedding_matrix.shape[1]

# One averaged-embedding vector per tweet, stored alongside the tweet text
df["feature"] = df["tweet"].apply(
    lambda tweet: get_average_embedding(tweet, glove_embeddings, embedding_dim)
)

# Convert the column of per-tweet arrays to a 2-D feature matrix for ML algorithms
feature_matrix = np.vstack(df["feature"].values)
labels = df["label"].values

# Invariant: exactly one row per tweet and one column per embedding dimension
assert feature_matrix.shape == (len(df), embedding_dim), (
    f"Unexpected feature matrix shape {feature_matrix.shape}; "
    f"expected {(len(df), embedding_dim)}"
)

# Check the shape of the feature matrix and a sample of data
print("Feature matrix shape:", feature_matrix.shape)  # (number_of_tweets, embedding_dim)
print("Sample features:", feature_matrix[:5])
print("Labels:", labels[:5])

Feature matrix shape: (200000, 20)
Sample features: [[ 3.46951193e-01 -5.39721850e-02 -2.85347753e-01  4.15248704e-01
  -3.09205731e-01  2.89634001e-01  1.10817865e-01 -3.14039857e-01
  -8.43951729e-02  1.26812239e-01  8.54011491e-02  2.25696590e-01
   4.67463946e-01 -4.39469099e-01  8.31066792e-02  5.91286910e-01
   1.81714505e-01 -9.58502088e-02  3.28682083e-01 -1.04378108e-02]
 [ 4.42388857e-01  2.16337682e-01 -2.50358366e-01  5.02090672e-01
  -1.27068211e-01  2.12625968e-01  2.76913617e-02 -4.81798412e-01
   5.33096252e-02  1.19584558e-01  1.85556101e-01  3.08845606e-01
   5.26943232e-01 -5.60846487e-01  4.27428091e-02  4.92746303e-01
   3.04177767e-01 -1.02890440e-01  4.73283879e-01 -1.35745022e-01]
 [ 1.42379915e-01  2.19552371e-02 -1.68202581e-01  2.07059272e-01
  -1.24985767e-01  1.21580990e-01 -9.75649833e-02 -1.92837410e-01
  -8.10305916e-02 -3.14825180e-02  5.76040921e-02  5.02743517e-02
   2.93349135e-01 -3.24897987e-01 -1.11397981e-01  4.56748483e-01
   1.39910613e-01  3.3

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Inputs produced by the earlier cells:
#   feature_matrix - one averaged-embedding row per tweet
#   labels         - matching sentiment labels (+1 positive, -1 negative)

# Hold out 20% of the tweets for testing; the fixed random_state keeps the split repeatable
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    feature_matrix,
    labels,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Least-squares linear model fitted directly on the -1/+1 labels
model = LinearRegression().fit(X_train, y_train)

# Continuous scores for the held-out tweets
y_pred = model.predict(X_test)

# Turn the scores into class labels by thresholding at zero:
# score >= 0 -> +1, otherwise -1
y_pred_labels = np.where(y_pred >= 0, 1, -1)

# Fraction of held-out tweets classified correctly
accuracy = accuracy_score(y_test, y_pred_labels)
print("Test set accuracy:", accuracy)

# F1 score on the held-out tweets
f1 = f1_score(y_test, y_pred_labels)
print("Test set F1 score:", f1)


Test set accuracy: 0.574325
Test set F1 score: 0.5966217336713179
