## Traditional Text Vectorization

In [2]:
import pandas as pd
import numpy as np

In [5]:
train_df = pd.read_pickle("clean_train_data.pkl")
test_df = pd.read_pickle("clean_test_data.pkl")
print(train_df.head())
print(test_df.head())

    Lemmatized_text  sentiment_class
0  I d respond I go                1
1          sooo sad                0
2             bully                0
3       leave alone                0
4               son                0
                                     Lemmatized_text  sentiment_class
0                                 last session day                  1
1    shanghai also really exciting precisely   sk...                2
2  recession hit veronique branquinho quit compan...                0
3                                         happy bday                2
4                                             I like                2


In [7]:
# TF-IDf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(train_df['Lemmatized_text'])

In [8]:
tfidf_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 114780 stored elements and shape (27480, 15452)>

In [9]:
# PCA to reduce dimension
from sklearn.decomposition import PCA

tfidf_dense = tfidf_features.toarray()

pca = PCA(n_components=5000)
pca_features = pca.fit_transform(tfidf_dense)

print(f'Reduced features shape: {pca_features.shape}')
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')

Reduced features shape: (27480, 5000)
Explained variance ratio: [1.85213901e-02 1.77483296e-02 1.63784080e-02 ... 2.28594305e-05
 2.28290326e-05 2.28103713e-05]


In [10]:
total_explained_variance = pca.explained_variance_ratio_.sum()
print(f'Explained variance ratio: {total_explained_variance}')

Explained variance ratio: 0.9175500384772406


In [22]:
tfidf_test_features = tfidf_vectorizer.transform(test_df['Lemmatized_text'])
tfidf_test_dense = tfidf_test_features.toarray()

In [23]:
X_train = pca_features
y_train = train_df["sentiment_class"]
X_test = pca.transform(tfidf_test_dense)
y_test = test_df["sentiment_class"]

In [26]:
print(len(X_train))
print(len(X_train[0]))
print(len(X_test))

27480
5000
3534


## Mod√®le NN

In [19]:
import numpy as np
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_train)
y_test = np.array(y_train)

5000

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [28]:
device = "cuda"

In [29]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)

In [30]:
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)

In [31]:
class NN(nn.Module):
    def __init__(self, input_dim):
        super(NN, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, 2048)  
        self.fc2 = nn.Linear(2048, 1024)       
        self.fc3 = nn.Linear(1024, 512)        
        self.fc4 = nn.Linear(512, 256)         
        self.fc5 = nn.Linear(256, 128)         
        self.fc6 = nn.Linear(128, 64)          
        self.fc7 = nn.Linear(64, 3)            
        
        self.gelu = nn.GELU()  
    
    def forward(self, x):
        x = self.gelu(self.fc1(x))  
        x = self.gelu(self.fc2(x))  
        x = self.gelu(self.fc3(x))  
        x = self.gelu(self.fc4(x))  
        x = self.gelu(self.fc5(x))  
        x = self.gelu(self.fc6(x))  
        x = self.fc7(x)             
        return x

# Check the input dimension size
print(len(X_train[0]))

5000


In [33]:
model = NN(input_dim=len(X_train[0])).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [34]:
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        X_batch, y_batch = batch
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_function(outputs.squeeze(), y_batch)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.43523934483528137
Epoch 2, Loss: 0.4202541410923004
Epoch 3, Loss: 0.4968375265598297
Epoch 4, Loss: 0.3690131902694702
Epoch 5, Loss: 0.3731940984725952
Epoch 6, Loss: 0.33698198199272156
Epoch 7, Loss: 0.3597392141819
Epoch 8, Loss: 0.43736952543258667
Epoch 9, Loss: 0.19738934934139252
Epoch 10, Loss: 0.21385253965854645
Epoch 11, Loss: 0.2678065598011017
Epoch 12, Loss: 0.1631861925125122
Epoch 13, Loss: 0.22807294130325317
Epoch 14, Loss: 0.11542239785194397
Epoch 15, Loss: 0.09125786274671555
Epoch 16, Loss: 0.26698118448257446
Epoch 17, Loss: 0.29427334666252136
Epoch 18, Loss: 0.0758037269115448
Epoch 19, Loss: 0.15562915802001953
Epoch 20, Loss: 0.16036346554756165
Epoch 21, Loss: 0.1259487271308899
Epoch 22, Loss: 0.1480122208595276
Epoch 23, Loss: 0.15441717207431793
Epoch 24, Loss: 0.16064126789569855
Epoch 25, Loss: 0.07942800223827362
Epoch 26, Loss: 0.1316019743680954
Epoch 27, Loss: 0.1478373408317566
Epoch 28, Loss: 0.0728161409497261
Epoch 29, Loss: 0

In [35]:
import torch
from sklearn.metrics import accuracy_score

# Ensure labels are in the correct format for multi-class classification
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)  # Use torch.long for multi-class labels

batch_size = 64

y_true = []
y_pred = []

num_samples = len(X_test_tensor)  # Number of samples, not features
for i in range(0, num_samples, batch_size):
    # Get the batch
    X_batch = X_test_tensor[i:i+batch_size].to(device)
    y_batch = y_test_tensor[i:i+batch_size].to(device)
    
    # Forward pass to get predictions
    with torch.no_grad():
        outputs = model(X_batch)
        
    # Convert outputs to predicted class indices (class with the highest probability)
    predicted = torch.argmax(outputs, dim=1)  # Get the class with the highest probability
    
    # Append true labels and predicted labels
    y_true.extend(y_batch.cpu().numpy())  # Move to CPU if necessary
    y_pred.extend(predicted.cpu().numpy())  # Move to CPU if necessary

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)

# Print accuracy
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 58.04%
