## Fine-tuning the Lightweight Gemma-2B and Comparing the Baseline vs. the Fine-tuned Model for Humor Detection

### 1. Setting Up The Data

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import torch

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

import transformers
from trl import SFTTrainer
from peft import LoraConfig

import re

In [12]:
import os
import warnings

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

# Process-wide environment settings, applied in one call:
#   CUDA_VISIBLE_DEVICES   -> restrict CUDA to device index 0
#   TOKENIZERS_PARALLELISM -> "false"
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})

# Torch device handle for the GPU.
device = torch.device("cuda")

In [13]:
# Hugging Face access token for the gated Gemma weights.
# Read it from the environment instead of embedding the secret in the notebook
# (a token committed to a notebook is leaked to everyone who can read the file
# and should be revoked). Export HF_TOKEN before starting the kernel; when the
# variable is unset this is None and the library falls back to its own
# credential lookup.
access_token = os.environ.get("HF_TOKEN")

In [14]:
# Read the complete humor dataset from disk.
df_complete = pd.read_csv('Data/humour/dataset.csv')

# Seed NumPy's global RNG right before sampling so the 256-row subsample
# is the same on every run.
np.random.seed(12345)
df = df_complete.sample(n=256)

# Hold out 20% of the subsample for evaluation; the rest is used for training.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap the training frame as a `datasets.Dataset` for the trainer.
train_set = Dataset.from_pandas(train_df)

In [5]:
# Load the tokenizer and the model for 2B.
# The same access token is passed to both calls so that the tokenizer download
# is authenticated exactly like the weight download (previously only the model
# call received the token).
tokenizer_2b = AutoTokenizer.from_pretrained("google/gemma-2b", token=access_token)
model_2b = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto", token=access_token)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


In [6]:
def check_string(input_string):
    """Extract a 'TRUE' / 'FALSE' verdict from free-form model output.

    The last three sentences are inspected first, starting from the end of the
    text; the first sentence that mentions exactly one of the two words decides
    the result. If none of them is decisive, the whole text is inspected once.

    Matching is case-insensitive and restricted to whole words, so the verdict
    is still found when the text has been lowercased before the call, and words
    that merely contain the letters (e.g. "untrue", "falsely") are ignored.

    Args:
        input_string: Text to scan. ``None`` or an empty string is accepted.

    Returns:
        'TRUE', 'FALSE', or None when the text is empty, contains neither word,
        or is ambiguous (both words present).
    """
    if not input_string:
        return None

    def _verdict(text):
        # Decide on a single piece of text; None means "absent or ambiguous".
        has_true = re.search(r'\bTRUE\b', text, flags=re.IGNORECASE) is not None
        has_false = re.search(r'\bFALSE\b', text, flags=re.IGNORECASE) is not None
        if has_true and not has_false:
            return 'TRUE'
        if has_false and not has_true:
            return 'FALSE'
        return None

    # Split into sentences on whitespace that follows ., ! or ?
    sentences = re.split(r'(?<=[.!?])\s+', input_string)

    # Walk the last three sentences from the end of the text backwards.
    for sentence in reversed(sentences[-3:]):
        verdict = _verdict(sentence)
        if verdict is not None:
            return verdict

    # Nothing decisive near the end: fall back to the text as a whole.
    return _verdict(input_string)
        
def classify_humor_2b(text):
    """Ask the baseline Gemma-2B model whether `text` is humorous.

    Args:
        text: The text to classify; it is embedded in the prompt inside
            square brackets.

    Returns:
        Whatever `check_string` extracts from the model's completion:
        'TRUE', 'FALSE', or None when no unambiguous verdict is found.

    Side effects:
        Prints the full decoded sequence (prompt plus completion).
    """
    prompt = 'Please detect whether or not the following text within square brackets contains any kind of humor. Return \'TRUE\' in plaintext if it does and \'FALSE\' if it does not : [' + text + ']' 
    # Put the prompt on the same device as the model instead of assuming "cuda".
    input_ids = tokenizer_2b.encode(prompt, return_tensors="pt").to(model_2b.device)
    outputs = model_2b.generate(input_ids, max_new_tokens = 512)
    decoded_output = tokenizer_2b.decode(outputs[0])
    print(decoded_output)
    # The generated sequence starts with the prompt tokens, and the prompt
    # itself contains both 'TRUE' and 'FALSE'. Slice the prompt off by token
    # count rather than by string replacement, which only works if decoding
    # reproduces the prompt exactly.
    completion = tokenizer_2b.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    # Do not lowercase here: the verdict words are looked up as 'TRUE' / 'FALSE'.
    return check_string(completion)

In [7]:
# Take the example at position 4 of the test split and show its text and label.
test_joke1, test_humor1 = (str(test_df[column].iloc[4]) for column in ('text', 'humor'))
print(test_joke1, test_humor1, sep='\n')

What's more effective than an islamic call to prayer? a rape whistle.
True


In [8]:
# Sanity-check the baseline model on the single example held in test_joke1
# (this is one sample, not the whole test set).
result = classify_humor_2b(test_joke1)
print(result)

<bos>Please detect whether or not the following text within square brackets contains any kind of humor. Return 'TRUE' in plaintext if it does and 'FALSE' if it does not : [What's more effective than an islamic call to prayer? a rape whistle.]

Answer:

Step 1/2
First, we need to check if the text contains any kind of humor. To do this, we can use a regular expression to search for any kind of humor, such as a pun, a play on words, or a joke. Here's a Python code snippet that uses the re module to search for any kind of humor in the text: ```python import re text = "[What's more effective than an islamic call to prayer? a rape whistle.]" humor_regex = re.compile(r"(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*(\w+)\s*

### Next, let's run the baseline model on the test set:

In [None]:
def classify_all_humor_2b(df):
    """Label every row of `df` with the baseline model.

    Runs `classify_humor_2b` on each entry of the 'text' column and stores the
    results in a 'predicted_humor' column. The frame is modified in place and
    also returned.
    """
    predictions = df['text'].apply(classify_humor_2b)
    df['predicted_humor'] = predictions
    return df

def process_and_save(df):
    """Evaluate the baseline model on `df` and write the results to disk.

    Args:
        df: Frame with a 'text' column and a 'humor' ground-truth column.
            It is copied first, so the caller's frame is not modified.

    Side effects:
        Writes 'classification_report.txt' and 'confusion_matrix.png' in the
        current working directory.

    Note:
        Predictions for which no verdict could be extracted (None) are counted
        as "not humor".
    """
    # Create a copy of the dataframe to avoid modifying the original
    df_copy = df.copy()

    # Apply the function to the test set
    df_copy = classify_all_humor_2b(df_copy)

    # Convert the 'humor' and 'predicted_humor' columns to boolean.
    # Ground truth is normalized through str().upper() so that booleans
    # (True/False) and strings in any case ('True', 'TRUE', 'true') all map
    # correctly; comparing the raw value to 'TRUE' would turn every boolean or
    # 'True' label into False.
    df_copy['humor'] = df_copy['humor'].apply(lambda x: str(x).strip().upper() == 'TRUE')
    df_copy['predicted_humor'] = df_copy['predicted_humor'].apply(lambda x: x == 'TRUE')

    # Build the text report (precision / recall / F1 per class)
    classification_report_str = classification_report(df_copy['humor'], df_copy['predicted_humor'])

    # Save the classification report
    with open('classification_report.txt', 'w') as f:
        f.write(classification_report_str)

    # Generate and save the confusion matrix.
    # Fixing the label order keeps the matrix 2x2 even when one class never
    # occurs, and fmt='d' shows the cell counts as plain integers.
    confusion_mat = confusion_matrix(df_copy['humor'], df_copy['predicted_humor'], labels=[False, True])
    plt.figure(figsize=(10,7))
    sns.heatmap(confusion_mat, annot=True, fmt='d', xticklabels=[False, True], yticklabels=[False, True])
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.savefig('confusion_matrix.png')

# Evaluate the baseline model on the held-out test split; this writes
# classification_report.txt and confusion_matrix.png.
process_and_save(test_df)

In [None]:
def formatting_func(example):
    """Turn dataset rows into the training strings used for fine-tuning.

    Each row becomes a two-line string: a "Text: <text>" line followed by a
    "Humor: <label>" line.

    Args:
        example: Mapping with 'text' and 'humor' entries. Either a batch
            (both entries are equally long sequences) or a single row (both
            entries are scalars).

    Returns:
        A list with one formatted string per row. Every row of a batch is
        formatted; indexing only element [0] would silently discard all other
        rows handed over in the same batch.
    """
    texts, labels = example['text'], example['humor']
    if isinstance(texts, str):
        # Single, unbatched row: wrap it so the code below handles both shapes.
        texts, labels = [texts], [labels]
    return [f"Text: {text}\nHumor: {label}" for text, label in zip(texts, labels)]

# LoRA adapter configuration: rank 8, attached to the q/k/v/o projections and
# to the gate/up/down projections named in target_modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
)

# Supervised fine-tuning trainer for the 2B model on the training split.
trainer_2b = SFTTrainer(
    model=model_2b,
    train_dataset=train_set,
    formatting_func=formatting_func,
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        # where checkpoints and logs are written
        output_dir="outputs",
        logging_steps=1,
        # batch size 1 per device, gradients accumulated over 4 steps
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        # short run: 10 optimizer steps, the first 2 used for warmup
        max_steps=10,
        warmup_steps=2,
        learning_rate=2e-4,
        optim="paged_adamw_8bit",
        fp16=True,
    ),
)

In [None]:
# Run the fine-tuning loop configured on trainer_2b.
trainer_2b.train()

In [None]:
# Save the trained model to the "trained-model" directory.
# NOTE(review): the trainer was built with a peft_config, so this presumably
# writes only the LoRA adapter weights rather than a full model copy — confirm.
trainer_2b.model.save_pretrained("trained-model")

### Next, testing the finetuned model:

In [9]:
# Load the fine-tuned model from the "trained-model" directory.
# device_map="auto" already places the weights on the available device(s), so
# no additional .to("cuda") call is made afterwards: it is redundant when the
# model fits on the GPU and is expected to fail if any part was offloaded.
tunedmodel_2b = AutoModelForCausalLM.from_pretrained("trained-model", device_map="auto", token = access_token)

def classify_humor_tuned2b(text):
    """Ask the fine-tuned Gemma-2B model whether `text` is humorous.

    Args:
        text: The text to classify; it is embedded in the prompt inside
            square brackets.

    Returns:
        Whatever `check_string` extracts from the model's completion:
        'TRUE', 'FALSE', or None when no unambiguous verdict is found.

    Side effects:
        Prints the full decoded sequence (prompt plus completion).
    """
    prompt = 'Please detect whether or not the following text within square brackets contains any kind of humor. Return \'TRUE\' in plaintext if it does and \'FALSE\' if it does not : [' + text + ']' 
    # Put the prompt on the same device as the model instead of assuming "cuda".
    input_ids = tokenizer_2b.encode(prompt, return_tensors="pt").to(tunedmodel_2b.device)
    outputs = tunedmodel_2b.generate(input_ids, max_new_tokens = 512)
    decoded_output = tokenizer_2b.decode(outputs[0])
    print(decoded_output)
    # The generated sequence starts with the prompt tokens, and the prompt
    # itself contains both 'TRUE' and 'FALSE'. Slice the prompt off by token
    # count rather than by string replacement, which only works if decoding
    # reproduces the prompt exactly.
    completion = tokenizer_2b.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    # Do not lowercase here: the verdict words are looked up as 'TRUE' / 'FALSE'.
    return check_string(completion)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]


In [10]:
# Sanity-check the fine-tuned model on the single example held in test_joke1
# (this is one sample, not the whole test set).
result = classify_humor_tuned2b(test_joke1)
print(result)

<bos>Please detect whether or not the following text within square brackets contains any kind of humor. Return 'TRUE' in plaintext if it does and 'FALSE' if it does not : [What's more effective than an islamic call to prayer? a rape whistle.]

Answer:

Step 1/2
First, we need to check if the text contains any kind of humor. To do this, we can use a regular expression to search for any kind of humor, such as a pun, a play on words, or a sarcastic tone. Here's a Python code snippet that uses the re module to search for humor in the given text: ```python import re text = "[What's more effective than an islamic call to prayer? a rape whistle.]" humor_regex = re.compile(r"(\w+) (\w+)") humor_match = humor_regex.search(text) if humor_match: print("Humor detected!") else: print("No humor detected.") ``` This code uses the re.search() method to search for the humor regex pattern in the given text. If the pattern is found, the humor_match variable will be set to the match object, otherwise it w

In [None]:
def classify_all_humor_tuned2b(df):
    """Label every row of `df` with the fine-tuned model.

    Runs `classify_humor_tuned2b` on each entry of the 'text' column and stores
    the results in a 'predicted_humor' column. The frame is modified in place
    and also returned.
    """
    predictions = df['text'].apply(classify_humor_tuned2b)
    df['predicted_humor'] = predictions
    return df

def process_and_save_tuned(df):
    """Evaluate the fine-tuned model on `df` and write the results to disk.

    Args:
        df: Frame with a 'text' column and a 'humor' ground-truth column.
            It is copied first, so the caller's frame is not modified.

    Side effects:
        Writes 'tuned_classification_report.txt' and
        'tuned_confusion_matrix.png' in the current working directory.

    Note:
        Predictions for which no verdict could be extracted (None) are counted
        as "not humor".
    """
    # Create a copy of the dataframe to avoid modifying the original
    df_copy = df.copy()

    # Apply the function to the test set
    df_copy = classify_all_humor_tuned2b(df_copy)

    # Convert the 'humor' and 'predicted_humor' columns to boolean.
    # Ground truth is normalized through str().upper() so that booleans
    # (True/False) and strings in any case ('True', 'TRUE', 'true') all map
    # correctly; comparing the raw value to 'TRUE' would turn every boolean or
    # 'True' label into False.
    df_copy['humor'] = df_copy['humor'].apply(lambda x: str(x).strip().upper() == 'TRUE')
    df_copy['predicted_humor'] = df_copy['predicted_humor'].apply(lambda x: x == 'TRUE')

    # Build the text report (precision / recall / F1 per class)
    classification_report_str = classification_report(df_copy['humor'], df_copy['predicted_humor'])

    # Save the classification report
    with open('tuned_classification_report.txt', 'w') as f:
        f.write(classification_report_str)

    # Generate and save the confusion matrix.
    # Fixing the label order keeps the matrix 2x2 even when one class never
    # occurs, and fmt='d' shows the cell counts as plain integers.
    confusion_mat = confusion_matrix(df_copy['humor'], df_copy['predicted_humor'], labels=[False, True])
    plt.figure(figsize=(10,7))
    sns.heatmap(confusion_mat, annot=True, fmt='d', xticklabels=[False, True], yticklabels=[False, True])
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.savefig('tuned_confusion_matrix.png')

# Evaluate the fine-tuned model on the held-out test split; this writes
# tuned_classification_report.txt and tuned_confusion_matrix.png.
process_and_save_tuned(test_df)