In [None]:
!pip install transformers datasets torch accelerate -U

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/RM

Mounted at /content/drive
/content/drive/MyDrive/RM


In [None]:
import os
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict,load_metric
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments,DataCollatorWithPadding
from transformers import TextClassificationPipeline


In [None]:

# Load the dataset and inspect the columns
df = pd.read_csv('Processed_RickandMortySeason1-7_utf8.csv')
print(df.columns)

# Drop rows without dialogue BEFORE casting to str: astype(str) turns NaN into
# the literal text "nan", after which dropna() can no longer see the gap.
# reset_index keeps a clean 0..n-1 index once rows have been removed.
df = df.dropna(subset=['dialogue']).reset_index(drop=True)
df['dialogue'] = df['dialogue'].astype(str)

# Print the first few rows of the DataFrame to verify its content
print(df.head())

# Collapse every speaker that is not a target character into a single 'Others' class
target_characters = ['Rick']
df['character_group'] = df['character'].where(df['character'].isin(target_characters), 'Others')

# Map class names to numerical labels with a FIXED order: target characters
# first, 'Others' last. Deriving the ids from df[...].unique() made them depend
# on which speaker happens to appear first in the CSV, so 'Rick' was not
# guaranteed to be label 0 even though the scoring code reads the first class
# as the Rick score.
unique_characters = target_characters + ['Others']
label_map = {char: idx for idx, char in enumerate(unique_characters)}
df['label'] = df['character_group'].map(label_map)

# Print the number of classes the model will be trained on
num_classes = len(unique_characters)
print(f"Number of unique characters (classes) to classify: {num_classes}")

# Select only the required columns
df = df[['dialogue', 'label']]


Index(['season', 'episode', 'character', 'dialogue'], dtype='object')
  season       episode character  \
0      1  Anatomy Park     Jerry   
1      1  Anatomy Park     Jerry   
2      1  Anatomy Park     Jerry   
3      1  Anatomy Park      Beth   
4      1  Anatomy Park     Jerry   

                                            dialogue  
0        *singing* Last King Christmas last arrived!  
1  *singing* In the Christmas Christmas! *stops s...  
2  Um, Merry Christmas? *puts his hands on his hi...  
3                           Alive? For your parents?  
4                                          Good one.  
Number of unique characters (classes) to classify: 2


In [None]:
# Convert to a Hugging Face Dataset and hold out 10% of the rows.
# Without an evaluation split the Trainer never calls compute_metrics, so the
# run produced no accuracy figure at all. The seed keeps the split reproducible.
# preserve_index=False stops a non-default pandas index from becoming a column.
dataset = Dataset.from_pandas(df, preserve_index=False)
dataset = dataset.train_test_split(test_size=0.1, seed=42)  # DatasetDict with 'train' and 'test'

# Load the tokenizer and model. The class names are written into the model
# config so the saved checkpoint records which output index is which class.
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = DebertaV2ForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-large',
    num_labels=num_classes,
    id2label={idx: name for name, idx in label_map.items()},
    label2id=dict(label_map),
)

max_length = 128


def tokenize_function(examples):
    """Tokenize a batch of dialogue lines, truncating each to max_length tokens.

    No padding is applied here; the data collator pads every batch to the
    length of its own longest sequence instead of always padding to max_length.
    """
    return tokenizer(examples['dialogue'], truncation=True, max_length=max_length)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data collator that dynamically pads inputs when batching (passed to the Trainer below)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Keep only model inputs, use the column name the model expects, return tensors
tokenized_datasets = tokenized_datasets.remove_columns(["dialogue"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")


def compute_metrics(eval_pred):
    """Return {"accuracy": fraction of rows whose arg-max class equals the label}.

    Computed directly on the arrays handed over by the Trainer; this replaces
    the deprecated `load_metric("accuracy")`, which downloaded a script and
    stopped the run with an interactive "run the custom code?" prompt.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Score the held-out rows so the quality of the fine-tuned model is visible
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the fine-tuned model and tokenizer
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

print("Model fine-tuning complete and saved to './results'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9163 [00:00<?, ? examples/s]

  accuracy_metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Step,Training Loss
500,0.5885
1000,0.5718
1500,0.5627
2000,0.5653
2500,0.5755
3000,0.5641
3500,0.5693
4000,0.5594
4500,0.5673
5000,0.5576


Model fine-tuning complete and saved to './results'


In [None]:
# Load the fine-tuned model together with the tokenizer that was saved next to
# it in './results', so inference uses the exact tokenizer files written at
# training time and does not need to download them from the Hub again.
tokenizer = DebertaV2Tokenizer.from_pretrained('./results')
model = DebertaV2ForSequenceClassification.from_pretrained('./results')

# Create a pipeline for text classification.
# return_all_scores=True makes it report a score for every class, not just the top one.
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

# Table of answers to be scored
df = pd.read_csv('combined_answers.csv')

def classify_dialogue(dialogue):
    """Return the classifier's 'Rick' score for one piece of text.

    Args:
        dialogue: The text to score. Anything that is not a non-blank string
            (None, NaN, "", whitespace only) is treated as missing.

    Returns:
        The score of the Rick class as a float, or None when the input is
        missing. None (rather than a dict) keeps the resulting score column
        purely numeric, so it is written to CSV as an empty cell.
    """
    if not isinstance(dialogue, str) or dialogue.strip() == "":
        return None

    # One {'label': ..., 'score': ...} entry per class for this single input.
    result = classifier(dialogue)[0]

    # Find the Rick class by id instead of assuming it is the first entry:
    # label ids may follow the order in which speakers first appear in the
    # training CSV, so index 0 is not guaranteed to be Rick.
    # Prefer class names stored in the model config; otherwise fall back to the
    # label_map built when the training data was prepared.
    label2id = model.config.label2id or {}
    rick_id = label2id.get('Rick')
    if rick_id is None:
        rick_id = label_map['Rick']
    rick_label = model.config.id2label[rick_id]

    # Look the score up by label name so the result does not depend on the
    # order in which the pipeline lists the classes.
    scores_by_label = {entry['label']: entry['score'] for entry in result}
    return scores_by_label[rick_label]

# Names of the ten answer columns to score: Answer_1 ... Answer_10
answer_columns = [f'Answer_{i}' for i in range(1, 11)]

# Score every answer column; each result is stored in a sibling
# "<column>_Ricky_Score" column of the same DataFrame.
for col in answer_columns:
    column_scores = df[col].apply(classify_dialogue)
    df[f'{col}_Ricky_Score'] = column_scores

# Write the scored table to a separate CSV file (the input file is not modified)
output_path = 'filled_answers.csv'
df.to_csv(output_path, index=False)

print("Evaluation scores have been saved to 'filled_answers.csv'")



Evaluation scores have been saved to 'filled_answers.csv'
