## Importing Libraries

In [1]:
import tensorflow
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading Dataset from Kaggle

In [2]:
# Read the reviews CSV from the notebook's working directory into a DataFrame.
df = pd.read_csv("IMDB Dataset.csv")
# Bare last expression: render the loaded frame as the cell output.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Pre-processing

In [3]:
# Count missing values in each column.
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [5]:
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources that this cell and pre_processing rely on. Only
# 'stopwords' is downloaded at the top of the notebook; word_tokenize needs the
# Punkt sentence models and WordNetLemmatizer needs the WordNet corpus, so a
# fresh environment would otherwise fail with LookupError.
# 'punkt_tab' is the resource name used by newer NLTK releases and 'punkt' by
# older ones; requesting a name the installed release does not know is harmless.
for _resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_resource, quiet=True)

# Shared objects used by pre_processing.
wordnet_lem = WordNetLemmatizer()
stopword_list = set(stopwords.words('english'))  # set => O(1) membership tests
nlp = spacy.load('en_core_web_sm')
def pre_processing(text):
    """Normalize one raw review into a space-joined string of lemmatized tokens.

    Steps, in order: trim surrounding whitespace, replace HTML ``<br />`` tags
    with a space, drop punctuation and digits, tokenize, lowercase, remove
    English stopwords, and lemmatize each remaining token.

    Args:
        text: Raw review string.

    Returns:
        The cleaned review as a single lowercase string.
    """
    text = text.strip()
    # Replace line-break tags with a space (not ''), otherwise the words on
    # either side of the tag are fused: "great.<br /><br />The" -> "greatThe".
    text = re.sub(r'(<br />)', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[0-9]', '', text)

    # Lowercase BEFORE lemmatizing: the WordNet lookup is performed on the
    # token as given, so capitalized forms such as "Movies" would otherwise
    # pass through unlemmatized.
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return " ".join(
        wordnet_lem.lemmatize(token)
        for token in lowered_tokens
        if token not in stopword_list
    )

In [4]:
# Load the pretrained 'nlptown/bert-base-multilingual-uncased-sentiment' tokenizer.
# NOTE(review): `tokenizer` and `model` are both reassigned further down in this
# notebook (BertTokenizer / BertForSequenceClassification on 'bert-base-uncased')
# and no cell in between reads them, so this cell looks like dead weight that
# only costs a download and memory — confirm and remove.
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Load the sequence-classification weights from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [6]:
# Run pre_processing on every review and store the result in a new 'Cleaned_Text' column.
df['Cleaned_Text'] = df['review'].apply(pre_processing)
# Bare last expression: render the frame including the new column.
df

Unnamed: 0,review,sentiment,Cleaned_Text
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movie right good job wasnt creative or...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,catholic taught parochial elementary school nu...
49998,I'm going to have to disagree with the previou...,negative,im going disagree previous comment side maltin...


## Initializing BERT Model Parameters

In [17]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch.nn import functional as F


class IMDbDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Sequence of review strings (list, array, or pandas Series).
        labels: Sequence of integer-convertible class labels, aligned
            position-by-position with ``texts``.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sample is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # Materialize as plain lists so __getitem__ is strictly positional.
        # Indexing a pandas Series with an int is a *label* lookup, which
        # raises KeyError (or returns the wrong row) whenever the index is
        # not 0..n-1, e.g. right after train_test_split without reset_index.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return a dict of 1-D tensors.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` (a scalar ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            # The tokenizer output carries a leading batch dimension of 1;
            # drop it so the DataLoader can stack samples into a batch.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

    
# Initialize tokenizer and model
# Both are loaded from the same checkpoint name so vocabulary and weights match.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=2 sizes the classification head for two classes.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
# Keep only the cleaned text and the label columns.
# NOTE(review): df1 is overwritten by a pd.read_csv call in the following cell,
# so this selection appears to be unused — confirm.
df1 = df[["Cleaned_Text","sentiment"]]

In [21]:
# Re-read the original CSV into df1; a later cell copies its 'sentiment' column onto df.
df1 = pd.read_csv("IMDB Dataset.csv")

In [27]:
# Overwrite df's labels with the string labels from the freshly loaded df1
# (presumably so the encoding below can be re-run from strings).
# Assignment aligns on index — assumes df and df1 share the same row index; TODO confirm.
df["sentiment"] = df1["sentiment"]
def to_num(s):
    """Encode a sentiment label as an integer.

    Returns 1 when ``s`` equals the string "positive" and 0 for any other value.
    """
    return 1 if s == "positive" else 0
# Replace each label with its integer code from to_num (1 for "positive", 0 for anything else).
df["sentiment"] = df["sentiment"].apply(to_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment"] = df1["sentiment"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment"] = df["sentiment"].apply(to_num)


In [48]:
# Renumber rows 0..n-1 and discard the previous index.
df = df.reset_index(drop=True)

In [50]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
# Renumber both halves from 0 so downstream consumers see a clean positional index.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
# Create datasets and data loaders
train_dataset = IMDbDataset(train_df['Cleaned_Text'], train_df['sentiment'], tokenizer)
val_dataset = IMDbDataset(val_df['Cleaned_Text'], val_df['sentiment'], tokenizer)

# Single knob for both loaders; lower it if memory is tight.
BATCH_SIZE = 4

# Reshuffle the training set every epoch so the model does not see the same
# batches in the same order on each pass. Validation order does not affect
# the aggregated metrics, so it stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=2e-5)



In [51]:
# Number of passes over the training set. You may need to adjust this based on
# your resources.
NUM_EPOCHS = 3
# Directory the fine-tuned artifacts are written to.
SAVE_DIR = 'path/to/save/fine-tuned-model'

# NOTE(review): the recorded run of this cell ended with a CPU out-of-memory
# error; a smaller batch size or a shorter max_len for IMDbDataset lowers the
# memory needed per step.
for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items()}
        # Clear gradients first so each step starts from a clean slate.
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            val_loss += outputs.loss.item()

            # argmax over logits equals argmax over softmax(logits), so the
            # softmax is skipped.
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            # Labels are already on `device` via `inputs`.
            correct += (predicted_labels == inputs['labels']).sum().item()
            total += inputs['labels'].size(0)

    # Report the mean per-batch loss rather than the running sum, so the number
    # is comparable across validation sets of different sizes. The max(..., 1)
    # guards avoid ZeroDivisionError on an empty validation loader.
    avg_val_loss = val_loss / max(len(val_loader), 1)
    val_accuracy = correct / max(total, 1)
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Val Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

# Save the fine-tuned model
model.save_pretrained(SAVE_DIR)
# Save the tokenizer next to the weights so the directory can be reloaded on
# its own with from_pretrained(SAVE_DIR).
tokenizer.save_pretrained(SAVE_DIR)

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 50331648 bytes.