# Fine-tuning DistilBERT

## Imports

In [None]:
import pandas as pd 
import numpy as np
from datasets import Dataset
import os
from transformers import AutoModelForSequenceClassification
import io


In [None]:
from transformers import pipeline, Trainer, TrainingArguments

## Some Preprocessing and Data Visualization

In [None]:
# Load the IMDB reviews CSV and rename its columns to 'text' / 'label',
# the names the later cells of this notebook read.
df = pd.read_csv(
    '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
).rename(columns={'review': 'text', 'sentiment': 'label'})
# Last expression of the cell: shows the first rows as a sanity check.
df.head()

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    """Return *text* with its HTML markup stripped.

    The input is parsed with BeautifulSoup's built-in "html.parser" and the
    extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

# Replace every review with its HTML-stripped version.
df['text'] = df['text'].apply(remove_html_tags)



In [None]:
#Thanks to https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface
def cat2num(value):
    """Encode a sentiment string as an integer class id.

    Returns 1 when *value* equals 'positive' and 0 for every other value.
    """
    return 1 if value == 'positive' else 0
    
# Convert the string sentiment labels to integers (1 for 'positive', 0 otherwise).
df['label']  =  df['label'].apply(cat2num)

In [None]:
# Wrap the cleaned DataFrame in a `datasets.Dataset` so it can be tokenized
# with `.map` and handed to the Trainer below.
dataset = Dataset.from_pandas(df)


## Tokenization and Padding

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
# Pretrained checkpoint name; the same value is reused later to load the
# model, so tokenizer and model come from the same checkpoint.
checkpoint = 'distilbert-base-uncased'
# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize(example):
    """Tokenize the 'text' field of *example* with the module-level tokenizer.

    Truncation is enabled; no padding is requested here, so padding is left
    to the data collator at batch time.
    """
    texts = example['text']
    return tokenizer(texts, truncation=True)


In [None]:
# Tokenize the whole dataset; batched=True hands `tokenize` batches of
# examples instead of one example at a time.
train_dataset = dataset.map(tokenize, batched=True)
# `tokenize` does not pad, so padding is applied per batch by this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training for Sentiment Analysis

In [None]:
# Training configuration: one epoch, checkpoints written under 'results'.
training_args = TrainingArguments(
    output_dir='results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    # `per_gpu_train_batch_size` is deprecated; the per-device argument is
    # the supported name for the same setting.
    per_device_train_batch_size=32,
    # Save a checkpoint every 5000 steps and keep at most the 2 newest.
    save_steps=5000,
    save_total_limit=2,
    prediction_loss_only=True,
)

In [None]:
# Load the checkpoint with a sequence-classification head; num_labels=2
# matches the binary 0/1 labels produced by cat2num.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


In [None]:
# Assemble the Trainer from the model, training configuration, tokenized
# dataset and padding collator. No eval dataset is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the configuration above (output_dir='results',
# save_steps=5000).
trainer.train()
