<a href="https://colab.research.google.com/github/amarparab28/Dy.Tech/blob/main/G42_DY_TECH_REAL_DATASET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**GROUP 42**
## GROUP NAME - DY.TECH
TOPIC - TRANSFORMER BASED RECOMMENDATION SYSTEM

DATA UPLOADING

In [20]:
from google.colab import files
uploaded = files.upload()

Saving Ses01M_script01_1.txt to Ses01M_script01_1 (3).txt
Saving Ses01M_script01_2.txt to Ses01M_script01_2 (2).txt
Saving Ses01M_script01_3.txt to Ses01M_script01_3 (2).txt


READING THE DATA

In [21]:
import pandas as pd
import re

# List of file paths
file_paths = ['Ses01M_script01_1.txt', 'Ses01M_script01_2.txt', 'Ses01M_script01_3.txt']
data = []

for file_path in file_paths:
    with open(file_path, 'r') as file:
        lines = file.readlines()
        current_entry = {}
        for line in lines:
            match = re.match(r'\[(\d+\.\d+) - (\d+\.\d+)\]\s+(\S+)\s+(\w+)\s+\[(\d+\.\d+), (\d+\.\d+), (\d+\.\d+)\]', line)
            if match:
                if current_entry:
                    data.append(current_entry)
                current_entry = {
                    'start_time': float(match.group(1)),
                    'end_time': float(match.group(2)),
                    'turn_name': match.group(3),
                    'emotion': match.group(4),
                    'valence': float(match.group(5)),
                    'arousal': float(match.group(6)),
                    'dominance': float(match.group(7)),
                    'annotations': []
                }
            else:
                annotation_match = re.match(r'(\w+-E\d+):\s+([\w\s]+);\s*\(\)', line)
                if annotation_match and current_entry:
                    current_entry['annotations'].append((annotation_match.group(1), annotation_match.group(2)))

        if current_entry:
            data.append(current_entry)

# Convert to DataFrame
df = pd.DataFrame(data)

print(df)


     start_time  end_time               turn_name emotion  valence  arousal  \
0        8.0600   11.5914  Ses01M_script01_1_F000     fru   2.0000   1.6667   
1       14.4000   16.6181  Ses01M_script01_1_F001     sur   2.0000   2.6667   
2       19.2815   23.9200  Ses01M_script01_1_F002     sur   2.0000   3.3333   
3       29.3349   33.2000  Ses01M_script01_1_F003     sur   2.6667   2.6667   
4       39.7634   42.1267  Ses01M_script01_1_F004     sad   2.0000   2.0000   
..          ...       ...                     ...     ...      ...      ...   
190    431.0408  439.0700  Ses01M_script01_3_M039     sad   1.5000   3.5000   
191    439.4800  449.9100  Ses01M_script01_3_M040     sad   1.5000   2.5000   
192    460.4000  463.8500  Ses01M_script01_3_M041     sad   2.0000   1.5000   
193    499.4600  502.2700  Ses01M_script01_3_M042     xxx   3.5000   2.0000   
194    507.7700  511.4612  Ses01M_script01_3_M043     hap   4.0000   2.5000   

     dominance                                     

DATA PREPROCESSING

In [22]:
# Inspect the first few rows of the dataframe
print(df.head())

# Check for missing values and fill if any
df.fillna('', inplace=True)

# Convert annotations to a single string for simplicity
df['annotations'] = df['annotations'].apply(lambda x: ' '.join([f'{k}:{v}' for k, v in x]))

# Encode categorical variables
emotion_mapping = {emotion: idx for idx, emotion in enumerate(df['emotion'].unique())}
df['emotion'] = df['emotion'].map(emotion_mapping)

# Normalize VAD scores
df['valence'] = df['valence'] / df['valence'].max()
df['arousal'] = df['arousal'] / df['arousal'].max()
df['dominance'] = df['dominance'] / df['dominance'].max()

# Combine all relevant features into a single string for the transformer model input
df['input_text'] = df.apply(lambda row: f"{row['turn_name']} {row['annotations']} Emotion:{row['emotion']} VAD:[{row['valence']}, {row['arousal']}, {row['dominance']}]", axis=1)

# Print the processed DataFrame
print(df[['start_time', 'end_time', 'input_text']].head())


   start_time  end_time               turn_name emotion  valence  arousal  \
0      8.0600   11.5914  Ses01M_script01_1_F000     fru   2.0000   1.6667   
1     14.4000   16.6181  Ses01M_script01_1_F001     sur   2.0000   2.6667   
2     19.2815   23.9200  Ses01M_script01_1_F002     sur   2.0000   3.3333   
3     29.3349   33.2000  Ses01M_script01_1_F003     sur   2.6667   2.6667   
4     39.7634   42.1267  Ses01M_script01_1_F004     sad   2.0000   2.0000   

   dominance                                        annotations  
0     1.6667  [(C-E1, Frustration), (C-E2, Sadness), (C-E4, ...  
1     3.0000  [(C-E1, Anger), (C-E2, Surprise), (C-E4, Surpr...  
2     3.0000  [(C-E1, Frustration), (C-E2, Surprise), (C-E4,...  
3     2.6667  [(C-E1, Frustration), (C-E2, Surprise), (C-E4,...  
4     2.0000   [(C-E1, Fear), (C-E2, Sadness), (C-E4, Sadness)]  
   start_time  end_time                                         input_text
0      8.0600   11.5914  Ses01M_script01_1_F000 C-E1:Frustration C

TOKENIZATION

In [23]:
from transformers import BertTokenizer

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the input text
inputs = tokenizer(df['input_text'].tolist(), return_tensors='pt', padding=True, truncation=True)

# Print the tokenized inputs
print(inputs)


{'input_ids': tensor([[ 101, 7367, 2015,  ...,    0,    0,    0],
        [ 101, 7367, 2015,  ...,    0,    0,    0],
        [ 101, 7367, 2015,  ...,    0,    0,    0],
        ...,
        [ 101, 7367, 2015,  ...,    0,    0,    0],
        [ 101, 7367, 2015,  ...,    0,    0,    0],
        [ 101, 7367, 2015,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


CUSTOM DATA CLASS

In [24]:
import torch

# Define dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

labels = df['emotion'].tolist()
dataset = CustomDataset(inputs, labels)


MODEL TRAINING

In [25]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import mean_absolute_error
import numpy as np

# Define a function to compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Compute loss
    loss = np.mean(pred.predictions)

    # Compute MAE
    mae = mean_absolute_error(labels, preds)

    return {"loss": loss, "mae": mae}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)

# Define Trainer
trainer = Trainer(
    model=BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(emotion_mapping)),
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Mae
1,No log,-0.004431,1.712821
2,No log,-0.021221,1.179487
3,No log,-0.020337,0.615385


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=147, training_loss=1.7209162290404443, metrics={'train_runtime': 741.8452, 'train_samples_per_second': 0.789, 'train_steps_per_second': 0.198, 'total_flos': 24351931646640.0, 'train_loss': 1.7209162290404443, 'epoch': 3.0})

MODEL EVALUATION

In [26]:
# Evaluate the model on the training dataset
results = trainer.evaluate()

print(f"Final evaluation results: {results}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Final evaluation results: {'eval_loss': -0.02033732272684574, 'eval_mae': 0.6153846153846154, 'eval_runtime': 52.3883, 'eval_samples_per_second': 3.722, 'eval_steps_per_second': 0.935, 'epoch': 3.0}
