# Goal: implement a sentiment classifier using a recurrent neural network (RNN)

In [None]:
# Import pandas and the PyTorch modules used to build and train the model
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# load gensim google vectors
import gensim.downloader as api
word_vectors = api.load('word2vec-google-news-300')



In [None]:
def get_default_device():
    """Return the CUDA device when one is usable, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)


# Resolve the target device once for the rest of the notebook and show it.
device = get_default_device()
print(device)


def to_device(data, device):
    """Transfer a tensor, or every tensor in a list/tuple, onto ``device``.

    Lists and tuples are walked recursively and always come back as a list;
    anything else must expose ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

# preprocessing

In [None]:
from tqdm import tqdm
# data loading
from nltk.tokenize import TreebankWordTokenizer


# # load gensim google vectors
# word_vectors = api.load('word2vec-google-news-300')


def preprocess_data(filepath):
    """
    Read the CSV at ``filepath`` and encode its "sentiment" column as numbers.

    "positive" becomes 1 and "negative" becomes 0; any other value is left as
    NaN by ``Series.map``. The modified DataFrame is returned.
    """
    label_to_id = {"positive": 1, "negative": 0}
    frame = pd.read_csv(filepath, skiprows=0)
    frame["sentiment"] = frame["sentiment"].map(label_to_id)
    return frame


def tokenize_and_vectorize_sample(sample, max_len=400):
    """
    Tokenize ``sample`` and return the word vectors of its known tokens.

    The text is split with ``TreebankWordTokenizer`` and each token is looked
    up in the module-level ``word_vectors``. Tokens missing from that
    vocabulary are skipped, so the result may be shorter than the token count.

    Args:
        sample: Raw text to vectorize.
        max_len: Maximum number of vectors to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of at most ``max_len`` word vectors, in token order.
    """
    sample_vecs = []
    if max_len <= 0:
        # Honour the cap strictly instead of returning one vector anyway.
        return sample_vecs

    tokenizer = TreebankWordTokenizer()
    for token in tokenizer.tokenize(sample):
        # Keep the try body to the lookup alone so only a vocabulary miss is
        # treated as "skip this token".
        try:
            vector = word_vectors[token]
        except KeyError:
            continue  # No matching token in the Google w2v vocab
        sample_vecs.append(vector)
        if len(sample_vecs) >= max_len:
            break

    return sample_vecs


# Load the IMDB reviews CSV once at notebook level; later cells read `dataset`.
dataset = preprocess_data("data/IMDB_Dataset.csv")


# test train split

In [None]:

# Use the first 80% of rows for training and the remaining 20% for testing.
split_point = int(len(dataset) * .8)

# Column 0 is taken as the review text and column 1 as the numeric sentiment
# label (same positions the per-row indexing used before).
# Slicing starts at row 0: read_csv has already consumed the header line, so
# starting at 1 would silently drop the first review from the training set.
# Selecting whole columns positionally also avoids the slow iterrows() loop
# and integer indexing into a string-labelled row Series.
x_train = dataset.iloc[:split_point, 0].tolist()
y_train = dataset.iloc[:split_point, 1].tolist()

x_test = dataset.iloc[split_point:, 0].tolist()
y_test = dataset.iloc[split_point:, 1].tolist()


In [None]:
# tokenize_and_vectorize_sample(x_train[0])
# Sanity check: whitespace-delimited word count of the first training review.
print(len(x_train[0].split()))

In [None]:
def generate_batch(x_train, y_train, batch_size, drop_last=True):
    """Yield ``(inputs, labels)`` batches of paired samples with a progress bar.

    Args:
        x_train: Sequence of input samples; its ``len()`` sets the
            progress-bar total.
        y_train: Iterable of labels, paired with ``x_train`` via ``zip``.
        batch_size: Number of samples per batch; must be at least 1.
        drop_last: When True (the default, matching the previous behaviour)
            a trailing batch smaller than ``batch_size`` is discarded. Pass
            False to also receive that final, shorter batch.

    Yields:
        Tuples ``(x_batch, y_batch)`` of two equally long lists.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if batch_size < 1:
        # Previously this case consumed all the data and yielded nothing.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    next_x_batch, next_y_batch = [], []
    with tqdm(total=len(x_train), position=0, leave=True) as pbar:
        for ip, output in zip(x_train, y_train):
            next_x_batch.append(ip)
            next_y_batch.append(output)
            if len(next_x_batch) >= batch_size:
                yield next_x_batch, next_y_batch
                # Start fresh lists so the batch just handed out is not mutated.
                next_x_batch, next_y_batch = [], []
                pbar.update(batch_size)

        # Leftover samples that did not fill a whole batch.
        if next_x_batch and not drop_last:
            yield next_x_batch, next_y_batch
            pbar.update(len(next_x_batch))

In [None]:
class RNN(nn.Module):
    """Binary classifier: an ``nn.RNN`` encoder followed by a sigmoid unit.

    The output of the last time step of the top RNN layer is passed through a
    single-output linear layer and a sigmoid, giving one value in [0, 1] per
    sequence.

    Args:
        embedding_dims: Size of each input vector (RNN ``input_size``).
        hidden_dims: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        batch_first: If True the input is ``(batch, seq, feature)``, otherwise
            ``(seq, batch, feature)``.
    """

    def __init__(self, embedding_dims=300, hidden_dims=100, num_layers=1, batch_first=True):
        super().__init__()
        # Honour the constructor arguments; they were previously accepted but
        # ignored in favour of hard-coded values.
        self.rnn = nn.RNN(
            input_size=embedding_dims,
            hidden_size=hidden_dims,
            num_layers=num_layers,
            batch_first=batch_first,
        )
        self.out = nn.Linear(hidden_dims, 1)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of sigmoid scores for batch ``x``."""
        # Passing None lets nn.RNN start from a zero initial hidden state.
        r_out, _ = self.rnn(x, None)
        # Pick the last time step along whichever axis holds the sequence.
        last_step = r_out[:, -1, :] if self.rnn.batch_first else r_out[-1]
        return self.activation(self.out(last_step))



In [None]:
# Hyperparameters shared by the training, saving, loading and evaluation cells.
embedding_dims=300  # width of each word vector (word2vec-google-news-300)
hidden_dims= 512  # RNN hidden-state size
num_layers=2  # number of stacked RNN layers
batch_first=True
max_len=400  # cap on the number of word vectors kept per review
epochs = 5  # passes over the training data
# NOTE(review): reviews are not padded to a common length, so batches larger
# than 1 presumably cannot be stacked into a single tensor — confirm.
batch_size=1

In [None]:
# Instantiate the model from the shared hyperparameters so that the trained
# weights match the architecture encoded in the checkpoint file name and
# rebuilt when the checkpoint is loaded.
rnn_model = RNN(
    embedding_dims=embedding_dims,
    hidden_dims=hidden_dims,
    num_layers=num_layers,
    batch_first=batch_first,
)
rnn_model = to_device(rnn_model, device)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(rnn_model.parameters())


# Training loop
num_epochs = epochs
for epoch in tqdm(range(num_epochs)):
    rnn_model.train()
    loss_val = 0  # summed loss over this epoch's batches
    for x_batch, y_batch in generate_batch(x_train, y_train, batch_size=batch_size):
        # Use the same truncation length as evaluation.
        x_batch = [tokenize_and_vectorize_sample(sample, max_len=max_len) for sample in x_batch]
        if any(len(vectors) == 0 for vectors in x_batch):
            # A review with no in-vocabulary tokens gives the RNN nothing to read.
            continue

        # Plain tensors suffice; autograd.Variable is a deprecated no-op wrapper.
        x_batch = to_device(torch.FloatTensor(x_batch), device)
        # reshape(-1, 1) follows the actual batch length rather than assuming
        # every batch holds exactly `batch_size` labels.
        y_batch = to_device(torch.FloatTensor(y_batch), device).reshape(-1, 1)

        outputs = rnn_model(x_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_val += loss.item()
    print(f"epoch({epoch}): total_loss={loss_val}")

In [None]:
# save pytorch model
# Only the learned weights (state_dict) are written; the file name encodes the
# hyperparameters max_len, embedding_dims, hidden_dims and num_layers.
model_path = f"imdb_rnn_model_{max_len}_{embedding_dims}_{hidden_dims}_{num_layers}.pth"
torch.save(rnn_model.state_dict(), model_path)

In [None]:
# Load the model from the file
model_path = f"imdb_rnn_model_{max_len}_{embedding_dims}_{hidden_dims}_{num_layers}.pth"
# Rebuild the architecture with the same hyperparameters that name the checkpoint.
loaded_model = RNN(embedding_dims=embedding_dims, hidden_dims=hidden_dims, num_layers=num_layers)
# map_location remaps tensors saved on another device (e.g. a GPU) onto the
# device available here, so a CUDA-trained checkpoint loads on a CPU-only box.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
loaded_model.load_state_dict(torch.load(model_path, map_location=device))
loaded_model = to_device(loaded_model, device)
loaded_model.eval()  # Set the model to evaluation mode
# Now 'loaded_model' contains the model loaded from the saved file


# evaluation

In [None]:
# evaluation
import torch
from sklearn.metrics import accuracy_score

# Uses `loaded_model`, `x_test` and `y_test` prepared in the cells above.

def evaluate(x_test, y_test, batch_size=1):
    """Score ``loaded_model`` on the given reviews and report accuracy.

    Each review is tokenized and vectorized (truncated to the module-level
    ``max_len``), run through ``loaded_model`` without gradient tracking, and
    thresholded at 0.5 to obtain a 0/1 prediction.

    Args:
        x_test: List of raw review texts.
        y_test: List of 0/1 labels aligned with ``x_test``.
        batch_size: Number of reviews per forward pass.

    Returns:
        The accuracy computed by ``sklearn.metrics.accuracy_score`` over the
        reviews that could be scored (it is also printed).
    """
    print(f"len(x_test) == len(y_test): {len(x_test)} == {len(y_test)}")
    threshold = 0.5
    total = min(len(x_test), len(y_test))
    predictions, labels = [], []

    # Step by batch_size so consecutive windows do not overlap.
    for start in tqdm(range(0, total, batch_size)):
        stop = min(start + batch_size, total)
        x_batch = [
            tokenize_and_vectorize_sample(sample, max_len=max_len)
            for sample in x_test[start:stop]
        ]
        if any(len(vectors) == 0 for vectors in x_batch):
            # No in-vocabulary tokens: nothing to feed the RNN. The labels are
            # skipped too so predictions and labels stay aligned.
            continue

        x_batch = to_device(torch.FloatTensor(x_batch), device)

        # Perform inference on the test data
        with torch.no_grad():
            batch_predictions = loaded_model(x_batch)

        # Flatten (batch, 1) scores into plain Python ints for sklearn.
        binary_predictions = (batch_predictions > threshold).int().flatten().cpu().tolist()
        predictions.extend(binary_predictions)
        labels.extend(y_test[start:stop])

    accuracy = accuracy_score(labels, predictions)

    print("Accuracy:", accuracy)
    return accuracy

evaluate(x_test, y_test)
# print(type(x_test))

# inference

In [None]:
def inference(text):
    """Classify a single piece of text with ``loaded_model``.

    The text is tokenized and vectorized exactly as during evaluation
    (truncated to the module-level ``max_len``, no padding) and passed through
    the model as a batch of one.

    Args:
        text: Raw review text.

    Returns:
        A ``(1, 1)`` float tensor holding 1.0 when the model's score exceeds
        0.5 and 0.0 otherwise.

    Raises:
        ValueError: If none of the tokens in ``text`` has a word vector.
    """
    # Truncation happens inside the tokenizer helper; a single-sample batch
    # needs no padding, which also matches how evaluation feeds the model.
    vectors = tokenize_and_vectorize_sample(text, max_len=max_len)
    if not vectors:
        raise ValueError("text contains no tokens known to the word-vector vocabulary")

    x_batch = to_device(torch.FloatTensor([vectors]), device)

    # Perform inference
    with torch.no_grad():
        predictions = loaded_model(x_batch)

    # Threshold the sigmoid score to get the final class.
    threshold = 0.5
    return (predictions > threshold).float()

review = inference("""Movie was too expensive but worth it"s weight in gold.""")
print(review)

# print(x_train[1])
