## Traditional Text Vectorization

In [2]:
import pandas as pd
import numpy as np

In [5]:
train_df = pd.read_pickle("clean_train_data.pkl")
test_df = pd.read_pickle("clean_test_data.pkl")
print(train_df.head())
print(test_df.head())

    Lemmatized_text  sentiment_class
0  I d respond I go                1
1          sooo sad                0
2             bully                0
3       leave alone                0
4               son                0
                                     Lemmatized_text  sentiment_class
0                                 last session day                  1
1    shanghai also really exciting precisely   sk...                2
2  recession hit veronique branquinho quit compan...                0
3                                         happy bday                2
4                                             I like                2


In [7]:
# TF-IDf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(train_df['Lemmatized_text'])

In [8]:
tfidf_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 114780 stored elements and shape (27480, 15452)>

In [9]:
# PCA to reduce dimension
from sklearn.decomposition import PCA

tfidf_dense = tfidf_features.toarray()

pca = PCA(n_components=5000)
pca_features = pca.fit_transform(tfidf_dense)

print(f'Reduced features shape: {pca_features.shape}')
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')

Reduced features shape: (27480, 5000)
Explained variance ratio: [1.85213901e-02 1.77483296e-02 1.63784080e-02 ... 2.28594305e-05
 2.28290326e-05 2.28103713e-05]


In [10]:
total_explained_variance = pca.explained_variance_ratio_.sum()
print(f'Explained variance ratio: {total_explained_variance}')

Explained variance ratio: 0.9175500384772406


In [22]:
tfidf_test_features = tfidf_vectorizer.transform(test_df['Lemmatized_text'])
tfidf_test_dense = tfidf_test_features.toarray()

In [23]:
X_train = pca_features
y_train = train_df["sentiment_class"]
X_test = pca.transform(tfidf_test_dense)
y_test = test_df["sentiment_class"]

In [26]:
print(len(X_train))
print(len(X_train[0]))
print(len(X_test))

27480
5000
3534


## Modèle NN

In [19]:
import numpy as np
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_train)
y_test = np.array(y_train)

5000

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [28]:
device = "cuda"

In [29]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)

In [30]:
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)

In [36]:
class NN(nn.Module):
    def __init__(self, input_dim):
        super(NN, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, 4096)   
        self.fc2 = nn.Linear(4096, 2048)        
        self.fc3 = nn.Linear(2048, 1024)        
        self.fc4 = nn.Linear(1024, 512)         
        self.fc5 = nn.Linear(512, 256)          
        self.fc6 = nn.Linear(256, 128)          
        self.fc7 = nn.Linear(128, 64)           
        self.fc8 = nn.Linear(64, 32)            
        self.fc9 = nn.Linear(32, 16)            
        self.fc10 = nn.Linear(16, 3)         
        
        self.gelu = nn.GELU()  
    
    def forward(self, x):
        x = self.gelu(self.fc1(x))   
        x = self.gelu(self.fc2(x))   
        x = self.gelu(self.fc3(x))   
        x = self.gelu(self.fc4(x))   
        x = self.gelu(self.fc5(x))   
        x = self.gelu(self.fc6(x))   
        x = self.gelu(self.fc7(x))   
        x = self.gelu(self.fc8(x))   
        x = self.gelu(self.fc9(x))   
        x = self.fc10(x)             
        return x

# Check the input dimension size
print(len(X_train[0]))

5000


In [37]:
model = NN(input_dim=len(X_train[0])).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [38]:
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        X_batch, y_batch = batch
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_function(outputs.squeeze(), y_batch)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.6014074683189392
Epoch 2, Loss: 0.3968317210674286
Epoch 3, Loss: 0.5196322798728943
Epoch 4, Loss: 0.3231646716594696
Epoch 5, Loss: 0.2775110602378845
Epoch 6, Loss: 0.376080721616745
Epoch 7, Loss: 0.3188728392124176
Epoch 8, Loss: 0.3699393570423126
Epoch 9, Loss: 0.21034950017929077
Epoch 10, Loss: 0.2720819413661957
Epoch 11, Loss: 0.23757171630859375
Epoch 12, Loss: 0.149859219789505
Epoch 13, Loss: 0.2018383890390396
Epoch 14, Loss: 0.2821100056171417
Epoch 15, Loss: 0.18712888658046722
Epoch 16, Loss: 0.07489898055791855
Epoch 17, Loss: 0.22413448989391327
Epoch 18, Loss: 0.16914302110671997
Epoch 19, Loss: 0.08383259922266006
Epoch 20, Loss: 0.0723608210682869
Epoch 21, Loss: 0.12552812695503235
Epoch 22, Loss: 0.10859191417694092
Epoch 23, Loss: 0.09189684689044952
Epoch 24, Loss: 0.06477409601211548
Epoch 25, Loss: 0.20204010605812073
Epoch 26, Loss: 0.21280993521213531
Epoch 27, Loss: 0.1957506686449051
Epoch 28, Loss: 0.027795903384685516
Epoch 29, Loss: 

In [None]:
import torch
from sklearn.metrics import accuracy_score

# Ensure labels are in the correct format for multi-class classification
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

batch_size = 64

y_true = []
y_pred = []

num_samples = len(X_test_tensor)  # Number of samples, not features
for i in range(0, num_samples, batch_size):
    # Get the batch
    X_batch = X_test_tensor[i:i+batch_size].to(device)
    y_batch = y_test_tensor[i:i+batch_size].to(device)
    
    # Forward pass to get predictions
    with torch.no_grad():
        outputs = model(X_batch)
        
    # Convert outputs to predicted class indices (class with the highest probability)
    predicted = torch.argmax(outputs, dim=1)  # Get the class with the highest probability
    
    # Append true labels and predicted labels
    y_true.extend(y_batch.cpu().numpy())  
    y_pred.extend(predicted.cpu().numpy()) 

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)

# Print accuracy
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 57.58%
