# Data Preprocessing

In [1]:
import pandas as pd

In [2]:
# NOTE(review): absolute, machine-specific location; point this at your local copy of the CSV.
# A raw string already keeps backslashes literal, so each separator is written once
# (doubling them inside r"..." puts two backslashes into the actual path).
file_path = r"C:\Users\boufo\Start\2A\P2\CSC_4MI01_TA\projet3\data\training.1600000.processed.noemoticon.csv"
# Column names are supplied explicitly via `names`, so the first line of the file is read as data.
column_names = ["target", "ids", "date", "flag", "user", "text"]
data = pd.read_csv(file_path, encoding="latin1", names=column_names)

In [3]:
# Preview the first rows; a bare last expression renders the DataFrame as a rich table
# instead of the wrapped plain-text dump that print() produces.
data.head()

   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [4]:
# Confirm the column labels supplied to read_csv were applied.
column_labels = data.columns
print(column_labels)

Index(['target', 'ids', 'date', 'flag', 'user', 'text'], dtype='object')


In [5]:
# Keep only the tweet text and its `target` label for the rest of the notebook.
# .copy() makes `data` an independent frame, so adding columns to it in later cells
# does not trigger pandas' SettingWithCopyWarning.
data = data[['text', 'target']].copy()
# Bare last expression: rendered as a rich table.
data.head()

                                                text  target
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...       0
1  is upset that he can't update his Facebook by ...       0
2  @Kenichan I dived many times for the ball. Man...       0
3    my whole body feels itchy and like its on fire        0
4  @nationwideclass no, it's not behaving at all....       0


In [6]:
# Sanity checks: missing values per column, followed by the label distribution.
print(data.isna().sum(), data['target'].value_counts(), sep="\n")

text      0
target    0
dtype: int64
target
0    800000
4    800000
Name: count, dtype: int64


In [7]:
# TODO: handle metadata

## Cleaning data

In [8]:
import re

# Function to clean text
def clean_text(text):
    """Normalize a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order: decode HTML entities, drop @mentions, drop URLs
    (``http://``, ``https://`` and bare ``www.`` links), remove every character
    that is not an ASCII letter or whitespace, collapse whitespace runs and
    trim the ends, then lowercase.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string. It is empty when the input held nothing but
        mentions, URLs, digits, punctuation or whitespace.
    """
    import html  # local import so the function works without touching the import cell

    # Decode entities first: otherwise "&amp;" loses its "&" and ";" and leaves the word "amp".
    text = html.unescape(text)
    text = re.sub(r'@[\w]*', '', text)  # Remove mentions
    # Remove URLs. \b keeps words such as "awww..." from being mistaken for a "www." link.
    text = re.sub(r'(?:https?://|\bwww\.)\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabet characters
    # The removals above leave leading/trailing/doubled whitespace behind; squeeze it out.
    text = ' '.join(text.split())
    return text.lower()  # Convert to lowercase

# Build the cleaned column by running clean_text over every tweet
data['clean_text'] = [clean_text(tweet) for tweet in data['text']]

In [9]:
# Checkpoint the frame, including the new clean_text column, to disk
data.to_pickle(path="cleaned_data.pkl")

In [10]:
# Show the first five rows to compare the raw text with its cleaned version
data.head(n=5)

Unnamed: 0,text,target,clean_text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,awww thats a bummer you shoulda got david ...
1,is upset that he can't update his Facebook by ...,0,is upset that he cant update his facebook by t...
2,@Kenichan I dived many times for the ball. Man...,0,i dived many times for the ball managed to sa...
3,my whole body feels itchy and like its on fire,0,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all....",0,no its not behaving at all im mad why am i he...


In [26]:
# Missing values per column after cleaning
print(data.isna().sum())

text          0
target        0
clean_text    0
dtype: int64


In [27]:
# TODO: tokenization

# Embedding with LLM

In [28]:
from transformers import BertTokenizer, BertModel
import torch

In [29]:
# Load the pretrained uncased BERT tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing inside model.to("cuda") on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
def get_embeddings_batch(texts):
    """Embed a batch of texts with the module-level BERT model.

    Each text is represented by the average of its last-layer hidden states,
    taken over the text's real tokens only (attention-masked mean pooling).

    Args:
        texts: List of strings to embed. Uses the module-level ``tokenizer``
            and ``model``.

    Returns:
        A numpy array with one row per input text.
    """
    # Tokenize the input texts in a batch
    inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # padding=True pads every text to the longest one in the batch. A plain mean over
        # dim=1 would mix the pad positions into shorter texts and make an embedding depend
        # on which other texts share its batch, so weight each position by the attention mask.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        embeddings = (hidden * mask).sum(dim=1) / token_counts  # Batch of sentence embeddings

    return embeddings.cpu().numpy()

from tqdm import tqdm

# Embed the cleaned tweets one fixed-size batch at a time
batch_size = 32
embeddings = []

batch_starts = range(0, len(data), batch_size)
for start in tqdm(batch_starts, desc="Processing Batches"):
    stop = start + batch_size
    chunk = data['clean_text'][start:stop].tolist()
    # extend() appends the rows of the returned array individually: one vector per tweet
    embeddings.extend(get_embeddings_batch(chunk))

# Attach the per-row vectors to the frame as a new column
data['embedding'] = embeddings

Processing Batches:   0%|          | 0/50000 [00:00<?, ?it/s]

Processing Batches: 100%|██████████| 50000/50000 [20:31<00:00, 40.58it/s]


In [37]:
# Count rows whose embedding is missing
print(data['embedding'].isna().sum())

0


In [38]:
# Length of each stored vector (None where the entry is None), then how often each length occurs
embedding_lengths = data['embedding'].apply(
    lambda vec: None if vec is None else len(vec)
)
print(embedding_lengths.value_counts())

embedding
768    1600000
Name: count, dtype: int64


In [39]:
# Checkpoint the frame, now including the embedding column, to disk
data.to_pickle(path="data_with_embeddings.pkl")

## Train ML Models on LLM Embeddings