In [2]:
# Imports
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
import pickle

In [4]:
# Seeting up embeddings
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Loading RoBERTa base model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model.to(device)
model.eval()

# Paths
input_file = '/Users/diegolemos/Masters/Theses/code/data/processed/customer_data_final.pkl'
output_folder = '/Users/diegolemos/Masters/Theses/code/data/processed/embeddings/'
os.makedirs(output_folder, exist_ok=True)

Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Defining the embedding function
def get_roberta_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()


In [6]:
# Loading labeled data
df = pd.read_pickle(input_file)
df['clean_text'] = df['clean_text'].astype(str)

# Chunks
chunk_size = 20000
num_chunks = (len(df) // chunk_size) + 1

for i in range(num_chunks):
    chunk = df.iloc[i * chunk_size:(i + 1) * chunk_size].copy()
    if chunk.empty:
        continue
    
    print(f"Processing chunk {i+1}/{num_chunks} - rows {chunk.index[0]} to {chunk.index[-1]}")
    
    embeddings = []
    for text in tqdm(chunk['clean_text'], desc="Encoding", leave=False):
        embeddings.append(get_roberta_embedding(text))
    
    # Storing embedding as new column
    chunk['embedding'] = embeddings

    # Save chunk
    output_path = os.path.join(output_folder, f'embeddings_chunk_{i+1}.pkl')
    chunk.to_pickle(output_path)
    print(f"Saved to {output_path}")
    
    # Cleaning up
    del chunk, embeddings
    torch.cuda.empty_cache() if torch.cuda.is_available() else None


Processing chunk 1/68 - rows 0 to 19999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_1.pkl
Processing chunk 2/68 - rows 20000 to 39999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_2.pkl
Processing chunk 3/68 - rows 40000 to 59999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_3.pkl
Processing chunk 4/68 - rows 60000 to 79999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_4.pkl
Processing chunk 5/68 - rows 80000 to 99999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_5.pkl
Processing chunk 6/68 - rows 100000 to 119999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_6.pkl
Processing chunk 7/68 - rows 120000 to 139999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_7.pkl
Processing chunk 8/68 - rows 140000 to 159999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_8.pkl
Processing chunk 9/68 - rows 160000 to 179999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_9.pkl
Processing chunk 10/68 - rows 180000 to 199999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_10.pkl
Processing chunk 11/68 - rows 200000 to 219999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_11.pkl
Processing chunk 12/68 - rows 220000 to 239999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_12.pkl
Processing chunk 13/68 - rows 240000 to 259999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_13.pkl
Processing chunk 14/68 - rows 260000 to 279999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_14.pkl
Processing chunk 15/68 - rows 280000 to 299999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_15.pkl
Processing chunk 16/68 - rows 300000 to 319999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_16.pkl
Processing chunk 17/68 - rows 320000 to 339999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_17.pkl
Processing chunk 18/68 - rows 340000 to 359999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_18.pkl
Processing chunk 19/68 - rows 360000 to 379999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_19.pkl
Processing chunk 20/68 - rows 380000 to 399999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_20.pkl
Processing chunk 21/68 - rows 400000 to 419999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_21.pkl
Processing chunk 22/68 - rows 420000 to 439999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_22.pkl
Processing chunk 23/68 - rows 440000 to 459999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_23.pkl
Processing chunk 24/68 - rows 460000 to 479999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_24.pkl
Processing chunk 25/68 - rows 480000 to 499999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_25.pkl
Processing chunk 26/68 - rows 500000 to 519999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_26.pkl
Processing chunk 27/68 - rows 520000 to 539999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_27.pkl
Processing chunk 28/68 - rows 540000 to 559999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_28.pkl
Processing chunk 29/68 - rows 560000 to 579999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_29.pkl
Processing chunk 30/68 - rows 580000 to 599999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_30.pkl
Processing chunk 31/68 - rows 600000 to 619999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_31.pkl
Processing chunk 32/68 - rows 620000 to 639999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_32.pkl
Processing chunk 33/68 - rows 640000 to 659999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_33.pkl
Processing chunk 34/68 - rows 660000 to 679999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_34.pkl
Processing chunk 35/68 - rows 680000 to 699999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_35.pkl
Processing chunk 36/68 - rows 700000 to 719999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_36.pkl
Processing chunk 37/68 - rows 720000 to 739999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_37.pkl
Processing chunk 38/68 - rows 740000 to 759999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_38.pkl
Processing chunk 39/68 - rows 760000 to 779999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_39.pkl
Processing chunk 40/68 - rows 780000 to 799999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_40.pkl
Processing chunk 41/68 - rows 800000 to 819999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_41.pkl
Processing chunk 42/68 - rows 820000 to 839999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_42.pkl
Processing chunk 43/68 - rows 840000 to 859999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_43.pkl
Processing chunk 44/68 - rows 860000 to 879999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_44.pkl
Processing chunk 45/68 - rows 880000 to 899999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_45.pkl
Processing chunk 46/68 - rows 900000 to 919999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_46.pkl
Processing chunk 47/68 - rows 920000 to 939999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_47.pkl
Processing chunk 48/68 - rows 940000 to 959999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_48.pkl
Processing chunk 49/68 - rows 960000 to 979999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_49.pkl
Processing chunk 50/68 - rows 980000 to 999999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_50.pkl
Processing chunk 51/68 - rows 1000000 to 1019999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_51.pkl
Processing chunk 52/68 - rows 1020000 to 1039999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_52.pkl
Processing chunk 53/68 - rows 1040000 to 1059999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_53.pkl
Processing chunk 54/68 - rows 1060000 to 1079999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_54.pkl
Processing chunk 55/68 - rows 1080000 to 1099999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_55.pkl
Processing chunk 56/68 - rows 1100000 to 1119999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_56.pkl
Processing chunk 57/68 - rows 1120000 to 1139999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_57.pkl
Processing chunk 58/68 - rows 1140000 to 1159999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_58.pkl
Processing chunk 59/68 - rows 1160000 to 1179999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_59.pkl
Processing chunk 60/68 - rows 1180000 to 1199999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_60.pkl
Processing chunk 61/68 - rows 1200000 to 1219999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_61.pkl
Processing chunk 62/68 - rows 1220000 to 1239999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_62.pkl
Processing chunk 63/68 - rows 1240000 to 1259999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_63.pkl
Processing chunk 64/68 - rows 1260000 to 1279999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_64.pkl
Processing chunk 65/68 - rows 1280000 to 1299999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_65.pkl
Processing chunk 66/68 - rows 1300000 to 1319999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_66.pkl
Processing chunk 67/68 - rows 1320000 to 1339999


                                                               

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_67.pkl
Processing chunk 68/68 - rows 1340000 to 1345726


                                                             

Saved to /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_68.pkl


In [9]:
# Loading all embedding Chunks
embeddings_folder = '/Users/diegolemos/Masters/Theses/code/data/processed/embeddings'

# Getting list of chunk files in order
chunk_files = sorted([
    os.path.join(embeddings_folder, f)
    for f in os.listdir(embeddings_folder)
    if f.startswith('embeddings_chunk_') and f.endswith('.pkl')
])

# Loading all chunks
embedding_chunks = []
for idx, file in enumerate(chunk_files, 1):
    print(f'Loading chunk {idx}/{len(chunk_files)}: {file}')
    chunk = pd.read_pickle(file)
    embedding_chunks.append(chunk)

# Combining all into one big DataFrame
roberta_embeddings_df = pd.concat(embedding_chunks, ignore_index=True)
print(f"\n All chunks loaded. Final shape: {roberta_embeddings_df.shape}")

Loading chunk 1/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_1.pkl
Loading chunk 2/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_10.pkl
Loading chunk 3/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_11.pkl
Loading chunk 4/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_12.pkl
Loading chunk 5/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_13.pkl
Loading chunk 6/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_14.pkl
Loading chunk 7/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_15.pkl
Loading chunk 8/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_16.pkl
Loading chunk 9/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_17.pkl
Loading chunk 10/68: /Users/d

In [10]:
# Loading the file that contains the clean_text and final_sentiment
final_labels_path = '/Users/diegolemos/Masters/Theses/code/data/processed/customer_data_final.pkl'

final_labels_df = pd.read_pickle(final_labels_path)
final_labels_df['clean_text'] = final_labels_df['clean_text'].astype(str)

print(f"Final labels loaded: {final_labels_df.shape}")


Final labels loaded: (1345727, 6)


In [11]:
# Checking if files have the same size
assert len(roberta_embeddings_df) == len(final_labels_df), "Row count mismatch"

# Dropping duplicated if already exists in roberta_embeddings_df
if 'final_sentiment' in roberta_embeddings_df.columns:
    roberta_embeddings_df = roberta_embeddings_df.drop(columns=['final_sentiment'])

# Merging side by side by index
model_df = pd.concat([roberta_embeddings_df, final_labels_df['final_sentiment']], axis=1)

print(f"Final dataset for training: {model_df.shape}")
model_df.head()

Final dataset for training: (1345727, 7)


Unnamed: 0,clean_text,vader_sentiment,distilbert_sentiment,roberta_sentiment,agreement,embedding,final_sentiment
0,and how do you propose we do that,neutral,positive,neutral,False,"[-0.35439885, -0.23273325, 0.016789742, -0.558...",neutral
1,i have sent several private messages and no on...,negative,negative,negative,True,"[-0.39328283, -0.39745688, 0.17642555, -1.0906...",negative
2,is the worst customer service,negative,negative,negative,True,"[-0.0460682, -0.1983459, -0.061124038, -0.8759...",negative
3,you gonna magically change your connectivity f...,negative,positive,neutral,False,"[-0.6767313, -0.32149655, 0.029280744, -0.7963...",neutral
4,since i signed up with you....since day 1,neutral,positive,neutral,False,"[-0.33590388, -0.30803612, -0.38103887, -0.617...",neutral


In [12]:
# Saving final dataset for model training
model_df.to_pickle('/Users/diegolemos/Masters/Theses/code/data/processed/final_model_dataset.pkl')
print("Saved final_model_dataset.pkl")

Saved final_model_dataset.pkl


### PCA (Principal Component Analysis)

In [6]:
# Loading saved dataset in chunks
embedding_folder = '/Users/diegolemos/Masters/Theses/code/data/processed/embeddings'
chunk_files = sorted([os.path.join(embedding_folder, f) 
                      for f in os.listdir(embedding_folder) if f.endswith('.pkl')])

print(f"Found {len(chunk_files)} chunks.")


Found 68 chunks.


In [14]:
# Initializing PCA
n_components = 100
batch_size = 10000

ipca = IncrementalPCA(n_components=n_components)

for i, path in enumerate(chunk_files):
    print(f"Fitting on chunk {i+1}/{len(chunk_files)}: {path}")
    chunk = pd.read_pickle(path)

    # Convert to matrix
    embeddings = np.vstack(chunk['embedding'].values)

    # Split into smaller batches
    for j in range(0, len(embeddings), batch_size):
        batch = embeddings[j:j+batch_size]
        ipca.partial_fit(batch)


Fitting on chunk 1/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_1.pkl
Fitting on chunk 2/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_10.pkl
Fitting on chunk 3/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_11.pkl
Fitting on chunk 4/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_12.pkl
Fitting on chunk 5/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_13.pkl
Fitting on chunk 6/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_14.pkl
Fitting on chunk 7/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_15.pkl
Fitting on chunk 8/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_16.pkl
Fitting on chunk 9/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_17.pkl
Fi

In [15]:
# Transforming and saving reduced embbedings
output_folder = '/Users/diegolemos/Masters/Theses/code/data/processed/pca_embeddings_chunks/'
os.makedirs(output_folder, exist_ok=True)

for i, path in enumerate(chunk_files):
    print(f"Transforming chunk {i+1}/{len(chunk_files)}: {path}")
    chunk = pd.read_pickle(path)

    embeddings = np.vstack(chunk['embedding'].values)
    reduced = ipca.transform(embeddings)

    # Saving
    np.save(os.path.join(output_folder, f'pca_chunk_{i+1}.npy'), reduced)


Transforming chunk 1/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_1.pkl
Transforming chunk 2/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_10.pkl
Transforming chunk 3/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_11.pkl
Transforming chunk 4/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_12.pkl
Transforming chunk 5/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_13.pkl
Transforming chunk 6/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_14.pkl
Transforming chunk 7/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_15.pkl
Transforming chunk 8/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddings_chunk_16.pkl
Transforming chunk 9/68: /Users/diegolemos/Masters/Theses/code/data/processed/embeddings/embeddin