In [None]:
#pip install wandb

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import os
from tqdm import tqdm
import re
import wandb
import torch

In [2]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [3]:
wandb.init(project='testanalytics', name='sentiment-analysis-1')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maamjad[0m ([33mtab-llm-finetuning[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [5]:
train_df

Unnamed: 0,review,sentiment
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here ?! Let me tell you. It's the pr...,negative
...,...,...
29995,I was really looking forward to this show give...,negative
29996,"I searched for this movie for years, apparentl...",positive
29997,This is a story of the Winchester Rifle Model ...,positive
29998,this film is in the MANDINGO & DRUM type<br />...,negative


In [None]:
model_name = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
config = GPT2Config.from_pretrained('gpt2', num_labels=2)
model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=config)


In [6]:
id2label = {0: "negative", 1: "positive"}
label2id = {"negative": 0, "positive": 1}
train_df['sentiment'] = train_df['sentiment'].map(label2id)
test_df['sentiment'] = test_df['sentiment'].map(label2id)

In [7]:
def cleaning(s):
    s = str(s)
    s = re.sub(r'<br />',' ',s)
    s = re.sub('\W,\s',' ',s)
    s = re.sub(r'[^A-Za-z0-9.,\'"()-]',' ',s)
    s = re.sub("\d+", "", s)
    s = re.sub('\s+',' ',s)
    s = re.sub('[!@#$_]', '', s)
    s = s.replace("co","")
    s = s.replace("https","")
    s = s.replace("[\w*"," ")
    return s

In [8]:
train_df['review'] = train_df['review'].apply(cleaning)
test_df['review'] = test_df['review'].apply(cleaning)

In [9]:
train_df

Unnamed: 0,review,sentiment
0,SAPS AT SEA Aspect ratio . Sound format Mono (...,0
1,"If you want mindless action, hot chicks and a ...",1
2,"""The Woman in Black"" is easily one of the cree...",1
3,I can barely find the words to describe how mu...,0
4,What's in here Let me tell you. It's the prese...,0
...,...,...
29995,I was really looking forward to this show give...,0
29996,"I searched for this movie for years, apparentl...",1
29997,This is a story of the Winchester Rifle Model ...,1
29998,this film is in the MANDINGO DRUM type they we...,0


In [10]:
test_df

Unnamed: 0,review,sentiment
0,Steven Rea plays a forensic scientist thrust o...,1
1,As the first of the TV specials offered on the...,1
2,There may something poetically right in seeing...,0
3,all i can say about this film is to read the b...,0
4,I thought it was a pretty good movie and shoul...,1
...,...,...
19995,Well-done ghost story that will give you the c...,1
19996,I'm at a loss for words. This movie is beyond ...,0
19997,"First off, I had my doubts just looking at the...",0
19998,"In an early scene, Luca (David Pasquesi) and J...",1


In [11]:
train_reviews, train_sentiment = train_df['review'].tolist(), train_df['sentiment'].tolist()
test_reviews, test_sentiment = test_df['review'].tolist(), test_df['sentiment'].tolist()

In [12]:
train_reviews, val_review, train_sentiment, val_sentiment = train_test_split(train_reviews, train_sentiment, test_size=0.1, random_state=42)

In [13]:
model_name = 'gpt2'

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# print memory footprint
print("Memory footprint of gpt2_model: ", gpt2_model.num_parameters() * 4 / 1024 / 1024, "MB")

#tokenizer = GPT2Tokenizer.from_pretrained(model_name)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Assuming 2 classes for sentiment analysis


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Tokenize the data
train_enc = tokenizer(train_reviews, truncation=True, padding=True, max_length=128, return_tensors='pt')
val_enc = tokenizer(val_review, truncation=True, padding=True, max_length=128, return_tensors='pt')
test_enc = tokenizer(test_reviews, truncation=True, padding=True, max_length=128, return_tensors='pt')

In [None]:
def data_collator(samples):
    input_ids = tokenized_datasets["train"]['input_ids']
    attention_mask = tokenized_datasets["train"]['attention_mask']
    #labels = tokenized_datasets["train"]['input_ids']

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        #'labels': labels
    }

In [16]:
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['sentiment'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [17]:
train_dataset = SentimentDataset(train_enc, train_sentiment)
val_dataset = SentimentDataset(val_enc, val_sentiment)
test_dataset = SentimentDataset(test_enc, test_sentiment)

In [None]:
#train_dataset = TextDataset(train_enc, torch.tensor(train_sentiment), block_size=128)
#val_dataset = TextDataset(val_enc, torch.tensor(val_sentiment), block_size=128)
#test_dataset = TextDataset(test_enc, torch.tensor(test_sentiment), block_size=128)

In [20]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [22]:
output_dir = 'data/result'
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    num_train_epochs=4,
    save_steps=500,  # Adjust as needed
    save_total_limit=2,
    fp16=False,
    report_to='wandb'
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
results = trainer.evaluate(test_dataset)
accuracy = results['eval_accuracy']
wandb.log({"Test Accuracy": accuracy})
wandb.finish()  # Finish W&B run



In [None]:
print(f'Accuracy on test set: {accuracy}')

In [None]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
test = pd.read_csv("data/test.csv")
test

In [None]:
def generate_text(sequence, max_length):
    model_path = 'data/result'
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))