In [1]:
import re
import json
import torch
import random
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel, GPT2ForSequenceClassification
import wandb
from datasets import load_dataset
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [2]:
wandb.init(project='testanalytics', name='sentiment-analysis-1')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maamjad[0m ([33mtab-llm-finetuning[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [4]:
train_df

Unnamed: 0,review,sentiment
0,SAPS AT SEA Aspect ratio . Sound format Mono (...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here Let me tell you. It's the prese...,negative
...,...,...
29995,I was really looking forward to this show give...,negative
29996,"I searched for this movie for years, apparentl...",positive
29997,This is a story of the Winchester Rifle Model ...,positive
29998,this film is in the MANDINGO DRUM type they we...,negative


In [5]:
id2label = {0: "negative", 1: "positive"}
label2id = {"negative": 0, "positive": 1}
#train_df['sentiment'] = train_df['sentiment'].map(label2id)
#test_df['sentiment'] = test_df['sentiment'].map(label2id)

In [6]:
def cleaning(s):
    s = str(s)
    s = re.sub(r'<br />',' ',s)
    s = re.sub('\W,\s',' ',s)
    s = re.sub(r'[^A-Za-z0-9.,\'"()-]',' ',s)
    s = re.sub("\d+", "", s)
    s = re.sub('\s+',' ',s)
    s = re.sub('[!@#$_]', '', s)
    s = s.replace("co","")
    s = s.replace("https","")
    s = s.replace("[\w*"," ")
    return s

In [7]:
train_df['review'] = train_df['review'].apply(cleaning)
test_df['review'] = test_df['review'].apply(cleaning)

In [8]:
test_df

Unnamed: 0,review,sentiment
0,Steven Rea plays a forensic scientist thrust o...,positive
1,As the first of the TV specials offered on the...,positive
2,There may something poetically right in seeing...,negative
3,all i can say about this film is to read the b...,negative
4,I thought it was a pretty good movie and shoul...,positive
...,...,...
19995,Well-done ghost story that will give you the c...,positive
19996,I'm at a loss for words. This movie is beyond ...,negative
19997,"First off, I had my doubts just looking at the...",negative
19998,"In an early scene, Luca (David Pasquesi) and J...",positive


In [9]:
train_df.to_csv("data/train.csv", index=False)
test_df.to_csv("data/test.csv", index=False)

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [4]:
train_df

Unnamed: 0,review,sentiment
0,SAPS AT SEA Aspect ratio . Sound format Mono (...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here Let me tell you. It's the prese...,negative
...,...,...
29995,I was really looking forward to this show give...,negative
29996,"I searched for this movie for years, apparentl...",positive
29997,This is a story of the Winchester Rifle Model ...,positive
29998,this film is in the MANDINGO DRUM type they we...,negative


In [5]:
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)

In [6]:
train_data['review']

346      First of three Aztec Mummies film is the only ...
13028    OK, so I rented this clown-like-Chainsaw-Massa...
8821     Personally, I didn't really gain a whole lot f...
25676    Pecker is a hilariously funny yet twisted film...
7534     What can one say about Elvira that hasn't alre...
                               ...                        
29802    I must admit, out of the EROS MOVIE COLLECTION...
5390     Despite being a huge fan of Fred Astaire and G...
860      When a hardworking entrepreneur is rejected fr...
15795    When I saw this movie I was stunned by what a ...
23654    Even longtime Shirley fans may be surprised by...
Name: review, Length: 27000, dtype: object

In [7]:
model_name = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(model_name, pad_token='<|pad|>', num_labels=2)
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)


# print memory footprint
print("Memory footprint of gpt2: ", model.num_parameters() * 4 / 1024 / 1024, "MB")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memory footprint of gpt2:  474.7060546875 MB


In [8]:
#def tokenize(data):
#    return tokenizer(data["review"].tolist(), padding=True, truncation=True, max_length=512), \
#           torch.tensor([1 if sentiment == "positive" else 0 for sentiment in data["sentiment"]])

#train_inputs, train_labels = tokenize(train_data)
#val_inputs, val_labels = tokenize(val_data)
#test_inputs, test_labels = tokenize(test_df)


In [9]:
# Convert inputs to tensors with a specified maximum sequence length
max_sequence_length = 1024

train_inputs = tokenizer(
    train_data["review"].tolist(),
    padding=True,
    truncation=True,
    max_length=max_sequence_length,
    return_tensors="pt",
)

val_inputs = tokenizer(
    val_data["review"].tolist(),
    padding=True,
    truncation=True,
    max_length=max_sequence_length,
    return_tensors="pt",
)

test_inputs = tokenizer(
    test_df["review"].tolist(),
    padding=True,
    truncation=True,
    max_length=max_sequence_length,
    return_tensors="pt",
)

train_labels = torch.tensor([1 if sentiment == "positive" else 0 for sentiment in train_data["sentiment"]])
val_labels = torch.tensor([1 if sentiment == "positive" else 0 for sentiment in val_data["sentiment"]])
test_labels = torch.tensor([1 if sentiment == "positive" else 0 for sentiment in test_df["sentiment"]])

In [10]:
len(train_inputs["input_ids"])

27000

In [11]:
len(train_inputs["attention_mask"])

27000

In [12]:
len(train_labels)

27000

In [13]:
# Convert inputs to tensors
#max_sequence_length = 1024
#train_inputs = tokenizer(train_inputs["input_ids"], padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt")
#val_inputs = tokenizer(val_inputs["input_ids"], padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt")
#test_inputs = tokenizer(test_inputs["input_ids"], padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt")


#train_inputs = torch.tensor(train_inputs["input_ids"]), torch.tensor(train_inputs["attention_mask"])
#val_inputs = torch.tensor(val_inputs["input_ids"]), torch.tensor(val_inputs["attention_mask"])

In [14]:


#def tokenize(sample):
#    return tokenizer(sample['review'], padding='max_length')

#tokenized_datasets = dataset.map(tokenize, batched=True)

#tokenize_train = dataset["train"].map(tokenize_function, batched=True)
#tokenize_test = dataset["test"].map(tokenize_function, batched=True)


In [26]:
from transformers import default_data_collator

In [27]:
train_dataset = TensorDataset(input_ids, attention_masks, labels)

In [None]:
val

In [15]:
print("Loading training arguments...")
training_args = TrainingArguments(output_dir='data/result', 
                                  num_train_epochs=4,
                                  logging_steps=10, 
                                  load_best_model_at_end=True,
                                  evaluation_strategy="steps", 
                                  per_device_train_batch_size=4, 
                                  per_device_eval_batch_size=4,
                                  warmup_steps=100, 
                                  weight_decay=0.01, 
                                  logging_dir='data/logs',
                                  save_steps=500,
                                  save_total_limit=2,
                                  report_to='wandb')

# Define Trainer with optimizer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=TensorDataset(train_inputs["input_ids"],
                                                train_inputs["attention_mask"],
                                                train_labels),
    eval_dataset=TensorDataset(val_inputs["input_ids"],
                                               val_inputs["attention_mask"],
                                               val_labels)
)


print("Start training...")
trainer.train()
    
model.save_pretrained("data/result/sentiment_gpt2")
tokenizer.save_pretrained("data/result/sentiment_gpt2")
    
# test
print("Start testing...")
# eval mode on model
trainer.evaluate()
model.eval()


 

Loading training arguments...
Start training...


TypeError: vars() argument must have __dict__ attribute

In [None]:
test = pd.read_csv('data/test.csv')

In [None]:
model_name = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# print memory footprint
print("Memory footprint of gpt2: ", model.num_parameters() * 4 / 1024 / 1024, "MB")


In [None]:
test_texts = test_data["review"].values
test_dataset = SentimentDataset(test_texts, [0] * len(test_texts), tokenizer)  # Labels are not used for testing

test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
predicted_labels = ["positive" if pred == 1 else "negative" for pred in predictions]


In [None]:
test_data["predicted_sentiment"] = predicted_labels


In [None]:
test_data.to_csv("test_results.csv", index=False)