In [2]:
import pandas as pd
from datasets import Dataset

In [6]:
# Load the CSV file (adjust the filename as needed)
data = pd.read_csv("imdb_subset.csv")


In [7]:
# Convert the DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(data)

# Check the dataset
print(hf_dataset)


Dataset({
    features: ['review', 'sentiment'],
    num_rows: 1000
})


In [8]:
from transformers import AutoTokenizer

In [9]:
## Loading GPT-2 tokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [20]:
# Define tokenizing function
# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokenized = tokenizer(examples['review'], truncation=True, padding="max_length", max_length=128)
    tokenized['labels'] = tokenized['input_ids'].copy()  # Set labels as a copy of input_ids
    return tokenized
## Applying tokeinization

tokenized_dataset = hf_dataset.map(tokenize_function,batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [21]:
## Setting format for pytorch

tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [22]:
## Splitting the dataset to train and test

train_test_split = tokenized_dataset.train_test_split(test_size=0.2,seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [23]:
## Defining Training Arguments

from transformers import TrainingArguments


In [24]:
training_arguments = TrainingArguments(
    output_dir = './results',
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)

In [25]:
from transformers import AutoModelForCausalLM, Trainer

In [26]:
## Load GPT-2

model = AutoModelForCausalLM.from_pretrained("gpt2")

#Initialize the Trainer

trainer = Trainer(
    model = model,
    args = training_arguments,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

## Train the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,3.547,3.530348
2,3.0654,3.552714
3,3.1881,3.577069


Attempted to log scalar metric loss:
4.109
Attempted to log scalar metric grad_norm:
10.864067077636719
Attempted to log scalar metric learning_rate:
4.9166666666666665e-05
Attempted to log scalar metric epoch:
0.05
Attempted to log scalar metric loss:
3.7817
Attempted to log scalar metric grad_norm:
10.540285110473633
Attempted to log scalar metric learning_rate:
4.8333333333333334e-05
Attempted to log scalar metric epoch:
0.1
Attempted to log scalar metric loss:
3.7492
Attempted to log scalar metric grad_norm:
10.254212379455566
Attempted to log scalar metric learning_rate:
4.75e-05
Attempted to log scalar metric epoch:
0.15
Attempted to log scalar metric loss:
3.8009
Attempted to log scalar metric grad_norm:
9.807520866394043
Attempted to log scalar metric learning_rate:
4.666666666666667e-05
Attempted to log scalar metric epoch:
0.2
Attempted to log scalar metric loss:
3.739
Attempted to log scalar metric grad_norm:
8.99795150756836
Attempted to log scalar metric learning_rate:
4.5

TrainOutput(global_step=600, training_loss=3.442647460301717, metrics={'train_runtime': 7883.8345, 'train_samples_per_second': 0.304, 'train_steps_per_second': 0.076, 'total_flos': 156775219200000.0, 'train_loss': 3.442647460301717, 'epoch': 3.0})

In [27]:
model.save_pretrained("./imdb_gpt_model")
tokenizer.save_pretrained("./imdb_gpt_model")


('./imdb_gpt_model\\tokenizer_config.json',
 './imdb_gpt_model\\special_tokens_map.json',
 './imdb_gpt_model\\vocab.json',
 './imdb_gpt_model\\merges.txt',
 './imdb_gpt_model\\added_tokens.json',
 './imdb_gpt_model\\tokenizer.json')

In [28]:
from transformers import pipeline

# Load the fine-tuned model
generator = pipeline("text-generation", model="./imdb_gpt_model", tokenizer=tokenizer)

# Test with a prompt
result = generator("The movie was", max_length=50, num_return_sequences=2)
print(result)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': "The movie was an interesting attempt at a different style; all three films were entertaining, but none really added any interesting plot and character arcs. The director simply felt that it wasn't quite as enjoyable as an enjoyable one, which is why I'd say"}, {'generated_text': 'The movie was awful I guess. It didn\'t have a lot going for it. I don\'t know if I could feel pity for Mr. K. K. or if he was a nice person. I think he looked bored for example in "'}]


In [1]:
## Validation and testing



new_review_raw = [
"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",
"Best movie ever! I loved wasting 3 hours of my life on this masterpiece.",
"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",
"Definitely recommend this movie... if you want to bore yourself to death.",
"The movie was about two friends who embark on a journey. It has a runtime of two hours.",
"It is a typical superhero movie with action scenes and some emotional moments.",
"The cinematography was colorful, and the soundtrack was loud.",
"The second half of the movie was longer than the first.",
"It was okay, I guess, but I wouldn’t watch it again.",
"Not bad, but not great either.",
"I laughed, I cried, but I still don’t know if I liked it or not.",
"The second half was much better than the first, though the ending was questionable.",
"It was very good, super, fantastic.",
"It was good until the second half ",
"It was second half ",
"Second half was good ",
"Movie is amazing, especially in the Second half",
"Terrible movie, Second half was hilarious",
"It was okay, Second half was hilarious",
"Best movie if you are looking for a headache",
"Lots of fun"

]

In [4]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load fine-tuned GPT-2 model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./imdb_gpt_model")
tokenizer = AutoTokenizer.from_pretrained("./imdb_gpt_model")

# Ensure padding token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Initialize results DataFrame
validationResults = pd.DataFrame()
validationResults['Test cases'] = new_review_raw

# Perform sentiment analysis
predictions = []
model.eval()
for review in new_review_raw:
    # Tokenize the input review
    inputs = tokenizer(review, return_tensors="pt", padding=True, truncation=True, max_length=100)
    
    # Generate response
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,  # Control the number of tokens to generate
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(generated_text)
    
    # Simple heuristic to classify sentiment
    if "positive" in generated_text.lower():
        predictions.append("Positive")
    elif "negative" in generated_text.lower():
        predictions.append("Negative")
    else:
        predictions.append("Neutral")  # Fallback if sentiment is unclear

# Add predictions to the DataFrame
# validationResults['GPT-2 Prediction'] = predictions

# # Display results
# print(validationResults)


Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling! I was hooked. I was hooked. I was hooked. I was hooked. I was hooked.
Best movie ever! I loved wasting 3 hours of my life on this masterpiece. I was so disappointed. I was so disappointed in the movie. I was so disappointed in the acting
Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls. But I was wrong. The acting was so 'natural' I almost believed the actors were wooden dolls
Definitely recommend this movie... if you want to bore yourself to death. It's a very good movie. It's a very good movie. It's a very good movie
The movie was about two friends who embark on a journey. It has a runtime of two hours. The first is a very good movie. The second is a very bad movie. The movie is a
It is a typical superhero movie with action scenes and some emotional moments. The movie is a bit of a disappointment, but it is a good movie. The acting is good
The cinematography was colorf