In [None]:
!pip install datasets --quiet
!pip install transformers --quiet

In [None]:
!pip install accelerate -U --quiet
!pip install transformers[torch] --quiet

In [None]:
# standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from datasets import load_dataset
from sklearn import metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

In [None]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Print the account name associated with the stored Hugging Face credentials.
!huggingface-cli whoami

Aditya685


In [None]:
# Fetch the movie-genre dataset from the Hugging Face Hub, authenticating
# with the token saved by the login cell.
data = load_dataset(
    "datadrivenscience/movie-genre-prediction",
    use_auth_token=True,
)



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Display the loaded DatasetDict: its splits, column names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 54000
    })
    test: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 36000
    })
})

In [None]:
# Look at the first training row to see the raw fields.
data['train'][0]

{'id': 44978,
 'movie_name': 'Super Me',
 'synopsis': 'A young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. Selling them makes him rich.',
 'genre': 'fantasy'}

In [None]:
# Look at the second training row for comparison.
data['train'][1]

{'id': 50185,
 'movie_name': 'Entity Project',
 'synopsis': 'A director and her friends renting a haunted house to capture paranormal events in order to prove it and become popular.',
 'genre': 'horror'}

In [None]:
class ClassificationDataset:
    """Map-style dataset that tokenizes movie rows on demand for sequence classification.

    Each row of ``data`` must provide the keys ``'movie_name'``, ``'synopsis'``
    and ``'genre'``; ``'genre'`` must be convertible with ``int()``.

    Args:
        data: Indexable, sized collection of row mappings.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True)``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Fixed token length every sample is padded/truncated to.
            Defaults to 20.
    """

    def __init__(self, data, tokenizer, max_length = 20):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Coerce a field to ``str``, mapping ``None`` to an empty string."""
        return "" if value is None else str(value)

    def __getitem__(self, item):
        """Tokenize row ``item`` and return its model inputs.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
            each a ``torch.long`` tensor.

        Raises:
            ValueError / TypeError: if the row's ``'genre'`` cannot be
                converted with ``int()``.
        """
        # Fetch the row once instead of re-indexing the data for every field.
        row = self.data[item]

        # Coerce each field separately so a missing (None) title or synopsis
        # does not break the string concatenation.
        text = self._as_text(row['movie_name']) + " " + self._as_text(row['synopsis'])
        target = int(row['genre'])

        inputs = self.tokenizer(
            text,
            max_length = self.max_length,
            padding = 'max_length',
            truncation = True,
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype = torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype = torch.long),
            'labels': torch.tensor(target, dtype = torch.long),
        }

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per sample and is reduced with an argmax
            over axis 1.

    Returns:
        dict with the single key ``'accuracy'``.
    """
    scores, gold = eval_pred
    # Pick the highest-scoring class for every sample.
    predicted = np.argmax(scores, axis = 1)
    return {'accuracy': metrics.accuracy_score(gold, predicted)}


In [None]:
def train(seed = 42, submission_path = 'submission.csv'):
    """Fine-tune ``bert-base-uncased`` on the movie-genre dataset and write a submission CSV.

    Steps:
        1. Load the dataset and encode the ``genre`` column as a class label.
        2. Hold out a stratified 20% of the training split for validation.
        3. Fine-tune for one epoch, evaluating and checkpointing per epoch.
        4. Predict on the test split and write ``id``/``genre`` rows to CSV.

    Args:
        seed: Seed used for the train/validation split and passed to
            ``TrainingArguments`` so repeated runs are reproducible.
        submission_path: Destination of the generated submission CSV.

    Returns:
        None. The submission file is written to ``submission_path`` and
        checkpoints are written under the ``'model'`` output directory.
    """
    # NOTE(review): `use_auth_token` is kept for consistency with the loading
    # cell; newer `datasets` releases may expect `token=True` — confirm
    # against the installed version.
    splits = load_dataset("datadrivenscience/movie-genre-prediction", use_auth_token = True)
    splits = splits.class_encode_column('genre')

    df_test = splits['test']

    # Seeded, stratified hold-out so the validation set is the same on every run.
    holdout = splits['train'].train_test_split(
        test_size = 0.2,
        stratify_by_column = 'genre',
        seed = seed,
    )
    df_train = holdout['train']
    df_val = holdout['test']

    # Label metadata of the training split: defines both the number of output
    # classes and the id -> name mapping used for the submission.
    genre_feature = df_train.features['genre']

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast = True)
    model = AutoModelForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        # Public attribute instead of reaching into the private `_int2str` list.
        num_labels = genre_feature.num_classes,
    )

    train_dataset = ClassificationDataset(df_train, tokenizer)
    val_dataset = ClassificationDataset(df_val, tokenizer)
    test_dataset = ClassificationDataset(df_test, tokenizer)

    args = TrainingArguments(
        'model',
        evaluation_strategy = 'epoch',
        save_strategy = 'epoch',
        learning_rate = 2e-5,
        per_device_train_batch_size = 8,
        per_device_eval_batch_size = 8,
        num_train_epochs = 1,
        weight_decay = 0.01,
        load_best_model_at_end = True,
        metric_for_best_model = 'accuracy',
        report_to  = 'none',
        save_total_limit = 1,
        seed = seed,
    )

    trainer = Trainer(
        model,
        args,
        train_dataset = train_dataset,
        eval_dataset = val_dataset,
        tokenizer = tokenizer,
        compute_metrics = compute_metrics,
    )

    trainer.train()

    logits = trainer.predict(test_dataset).predictions
    pred_ids = np.argmax(logits, axis = 1)

    # generate submission
    # Build the frame with string labels directly; writing strings into an
    # existing integer column via `.loc` is an incompatible-dtype assignment.
    submission = pd.DataFrame({
        'id': df_test['id'],
        'genre': genre_feature.int2str(pred_ids.tolist()),
    })

    submission.to_csv(submission_path, index = False)

In [None]:
# Run the full pipeline: fine-tune, evaluate, and write submission.csv.
train()



  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss,Accuracy
1,1.7665,1.770104,0.373611


In [None]:
# Re-display the DatasetDict loaded earlier (splits, columns, row counts).
data

DatasetDict({
    train: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 54000
    })
    test: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 36000
    })
})

In [None]:
# Preview the "<movie_name> <synopsis>" string for the first training row —
# the same concatenation ClassificationDataset feeds to the tokenizer.
data['train'][0]['movie_name'] + " " + data['train'][0]['synopsis']

'Super Me A young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. Selling them makes him rich.'

In [None]:
# Show the title and synopsis of the first training row as separate values.
data['train'][0]['movie_name'],data['train'][0]['synopsis']

('Super Me',
 'A young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. Selling them makes him rich.')