In [None]:
#Enhancing Sentiment Analysis with Multilingual BERT Models

In [2]:
!pip install transformers



In [3]:
!pip install --upgrade transformers



In [4]:
from transformers import BertForSequenceClassification

In [5]:
#importing libraries and setting the initial values

In [6]:
!pip install torch torchvision torchaudio

Collecting torch
  Using cached torch-2.2.1-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting torchvision
  Using cached torchvision-0.17.1-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting torchaudio
  Using cached torchaudio-2.2.1-cp311-cp311-win_amd64.whl.metadata (6.4 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Using cached typing_extensions-4.10.0-py3-none-any.whl.metadata (3.0 kB)
Downloading torch-2.2.1-cp311-cp311-win_amd64.whl (198.6 MB)
   ---------------------------------------- 0.0/198.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/198.6 MB 991.0 kB/s eta 0:03:21
   ---------------------------------------- 0.1/198.6 MB 1.1 MB/s eta 0:03:03
   ---------------------------------------- 0.1/198.6 MB 1.0 MB/s eta 0:03:13
   ---------------------------------------- 0.2/198.6 MB 984.6 kB/s eta 0:03:22
   ---------------------------------------- 0.2/198.6 MB 984.6 kB/s eta 0:03:22
   ---------------------------------------- 0.2/198.6 MB 984.6

In [7]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide configuration: figure rendering, plot styling, RNG seeding and device selection.
%config InlineBackend.figure_format='retina'

# Plotting imports (these repeat names already imported in the main import cell above).
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

# Seaborn theme plus a custom six-colour palette used by the plots that follow.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8  # default figure size as (width, height)

# Seed the NumPy and PyTorch random number generators with one fixed value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the CUDA device when torch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#Load the data

In [None]:
# Read the reviews CSV from the working directory and show its (rows, columns) shape.
df = pd.read_csv('reviews.csv')
df.shape

In [None]:
# Preview the first rows of the dataset.
# `head` must be called: a bare `df.head` only displays the bound-method repr.
df.head()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Some columns contain missing values, but `content` and `score` do not. We can also look at the class balance.
# We will allot three sentiment classes:
# Positive (Score: 4-5)
# Neutral (Score: 3)
# Negative (Score: 1-2)

In [None]:
# Number of reviews for each distinct score value.
print(df['score'].value_counts())

In [None]:
# Let's have a look at the class balance.
# The column is passed by keyword (as the sentiment plot further down does):
# newer seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='score', data=df)
plt.xlabel('Review Score')
plt.ylabel('Count')
plt.title('Class Balance of Review Scores')
plt.show()

In [None]:
# Map a numeric review score onto one of three sentiment labels.
def to_sentiment(rating):
    """Return 'negative', 'neutral' or 'positive' for a review score.

    The rating is converted with ``int()`` first. Values of 2 or less are
    negative, exactly 3 is neutral, and anything above 3 is positive.
    """
    score = int(rating)

    if score > 3:
        return 'positive'
    if score == 3:
        return 'neutral'
    return 'negative'

# Derive a string `sentiment` column from `score` using to_sentiment.
df['sentiment'] = df['score'].apply(to_sentiment)

In [None]:
# Plot how many reviews fall into each sentiment class.
# `order` fixes the bar order so the capitalised tick labels line up with it.
class_names = ['Negative', 'Neutral', 'Positive']
ax = sns.countplot(x='sentiment', data=df, order=['negative', 'neutral', 'positive'])
ax.set_xticklabels(class_names)
plt.xlabel('Review Sentiment')
plt.ylabel('Count')
plt.title('Distribution of Review Sentiments')
plt.show()

In [None]:
#Data Preprocessing

In [None]:
# Name of the pretrained checkpoint used by the tokenizer here and by the
# first SentimentClassifier below.
# NOTE(review): this is the cased base checkpoint, not the multilingual one used
# in the later cells — confirm that is intended for this part of the notebook.
MODEL_NAME = 'bert-base-cased'

# Build a BERT-based tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Print the tokenizer's special tokens together with their vocabulary ids.
print("SEP token:", tokenizer.sep_token, tokenizer.sep_token_id)  # Separator marking the end of a sentence
print("CLS token:", tokenizer.cls_token, tokenizer.cls_token_id)  # Placed at the start of each sequence; used for classification
print("PAD token:", tokenizer.pad_token, tokenizer.pad_token_id)  # Filler token used for padding
print("UNK token:", tokenizer.unk_token, tokenizer.unk_token_id)  # Stand-in for tokens missing from the vocabulary

In [None]:
#BERT works with fixed-length sequences. We’ll use a simple strategy to choose the max length. Let’s store the token length of each review.

In [None]:
# Token count of every review, capped at 512 tokens.
# `truncation=True` is passed explicitly rather than relying on the tokenizer's
# implicit fallback when only `max_length` is given, and each entry is coerced
# with str() (as GPReviewDataset does) so a non-string cell cannot break the loop.
token_lens = [
    len(tokenizer.encode(str(txt), max_length=512, truncation=True))
    for txt in df['content']
]

In [None]:
# Plot the distribution of review lengths.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale is its documented replacement.
sns.histplot(token_lens, kde=True, stat='density')
plt.xlim([0, 256])
plt.xlabel('Token Count')
plt.ylabel('Density')
plt.title('Distribution of Review Lengths')
plt.show()

In [None]:
# Fixed sequence length passed to the data loaders and inference helpers below.
MAX_LEN = 160

In [None]:
#Preparing Torch Dataset

In [None]:
class GPReviewDataset(Dataset):
    """Torch dataset that tokenizes one review per item.

    Args:
        reviews: indexable collection of review texts.
        targets: indexable collection of integer class ids, aligned with ``reviews``.
        tokenizer: object exposing ``encode_plus`` (here a ``BertTokenizer``).
        max_len: length every encoded review is padded or truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, flattened encoding tensors and target for ``item``."""
        review = str(self.reviews[item])
        target = self.targets[item]

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
        # `truncation=True` makes the cut at `max_len` explicit so every item has
        # the same length and batches can be stacked.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # encode_plus returns (1, max_len) tensors; flatten to (max_len,)
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
# Split the data 80% / 10% / 10% into train, validation and test sets:
# hold out 20% of the rows first, then halve that hold-out.
# Both calls reuse RANDOM_SEED; no `stratify` argument is passed.

df_train, df_temp = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=RANDOM_SEED)

print("Training set shape:", df_train.shape)
print("Validation set shape:", df_val.shape)
print("Test set shape:", df_test.shape)

In [None]:
#Create a dataloader to release data in batches.

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    """Wrap the `content` / `sentiment` columns of ``df`` in a batched DataLoader.

    Args:
        df: DataFrame with a text column `content` and a `sentiment` column.
        tokenizer: tokenizer handed to ``GPReviewDataset``.
        max_len: fixed sequence length for every encoded review.
        batch_size: number of reviews per batch.

    Returns:
        A ``DataLoader`` over a ``GPReviewDataset`` (single-process loading).

    Raises:
        ValueError: if `sentiment` holds a label other than
            'negative', 'neutral' or 'positive'.
    """
    # The dataset turns each target into a torch.long tensor, which cannot be
    # built from the string labels produced by to_sentiment. Encode them as
    # class ids in the same order as `class_names` (negative, neutral, positive).
    sentiment_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
    targets = df['sentiment']
    if not pd.api.types.is_numeric_dtype(targets):
        targets = targets.map(sentiment_to_id)
        if targets.isnull().any():
            raise ValueError(
                "sentiment column contains labels outside "
                f"{sorted(sentiment_to_id)}"
            )
        targets = targets.astype('int64')

    # Create dataset from DataFrame
    ds = GPReviewDataset(
        reviews=df['content'].to_numpy(),
        targets=targets.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    # Create DataLoader from dataset
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0
    )

In [None]:
# Define batch size
batch_size = 16  # You can adjust this value based on your system's memory and training requirements

# Create data loaders for the training, validation, and test sets.
# All three share the same tokenizer, MAX_LEN and batch size; create_data_loader
# passes no `shuffle` argument, so batches follow the DataFrame row order.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, batch_size)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, batch_size)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, batch_size)

In [None]:
# Pull one batch from the training loader and inspect its keys and tensor shapes.
data = next(iter(train_data_loader))
print("Keys in the batch:", data.keys())

print("Shape of input_ids tensor:", data['input_ids'].shape)
print("Shape of attention_mask tensor:", data['attention_mask'].shape)
print("Shape of targets tensor:", data['targets'].shape)

In [None]:
#Sentiment Classification with BERT and Hugging Face

In [None]:
'We’ll use the basic BertModel and build our sentiment classifier on top of it. Let’s load the model'

In [None]:
# Load the basic BERT model.
# NOTE(review): no visible cell reads `bert_model` afterwards, and
# SentimentClassifier loads its own copy — confirm this cell is still needed.
bert_model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
# Sentiment classifier: pretrained BERT encoder + dropout + linear head
class SentimentClassifier(nn.Module):
    """Classify a tokenized review from BERT's pooled output."""

    def __init__(self, n_classes):
        """Load the MODEL_NAME encoder and build a head with ``n_classes`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and attention masks."""
        pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).pooler_output

        # Happy path: dropout, then project onto the classes.
        if isinstance(pooled, torch.Tensor):
            return self.out(self.drop(pooled))

        # Anything other than a tensor cannot be fed to the head.
        print("Type of pooled_output:", type(pooled))
        raise TypeError("The pooled_output from BERT is not a tensor.")

In [None]:
# Instantiate the model with one output per entry in `class_names`.
model = SentimentClassifier(len(class_names))

# Move the model to the appropriate device (GPU or CPU)
model = model.to(device)

In [None]:
#Model Characteristics

In [None]:
# Print the encoder's hidden size, which is also the input width of the classifier head.
print("Number of hidden units:", model.bert.config.hidden_size)

In [None]:
#Training Phase

In [None]:
# Number of iterations (epochs)
# NOTE(review): the training cell further down reassigns EPOCHS = 3, while
# `total_steps` below is derived from this value — confirm which is intended.
EPOCHS = 10

# Optimizer AdamW
# NOTE(review): this AdamW is the one imported from `transformers`; recent
# releases deprecate it in favour of torch.optim.AdamW — check the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Total number of training steps: one scheduler step per batch, for every epoch.
total_steps = len(train_data_loader) * EPOCHS

# Learning rate scheduler: linear schedule with no warmup over `total_steps`.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Loss function
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Each batch dict must provide "input_ids", "attention_mask" and "targets".
    Gradients are clipped to a norm of 1.0, and both the optimizer and the
    scheduler are stepped once per batch.

    Returns:
        (accuracy, average_loss): correct predictions divided by ``n_examples``
        as a double tensor, and the mean of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    correct_predictions = 0

    for batch in data_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["targets"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        loss = loss_fn(logits, labels)

        # Book-keeping: predicted class is the index of the largest logit.
        predicted = logits.max(dim=1).indices
        correct_predictions += (predicted == labels).sum()
        batch_losses.append(loss.item())

        # Backward pass, clip, then advance optimizer and LR schedule.
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return correct_predictions.double() / n_examples, np.mean(batch_losses)

In [None]:
#A function to evaluate model performance

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating it.

    The model is switched to eval mode and gradients are disabled. Each batch
    dict must provide "input_ids", "attention_mask" and "targets".

    Returns:
        (accuracy, average_loss): correct predictions divided by ``n_examples``
        as a double tensor, and the mean of the per-batch losses.
    """
    model = model.eval()

    batch_losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            batch_losses.append(loss_fn(logits, labels).item())

            # Predicted class is the index of the largest logit.
            correct_predictions += (logits.max(dim=1).indices == labels).sum()

    return correct_predictions.double() / n_examples, np.mean(batch_losses)

In [None]:
# Fine-tune the classifier, tracking train/validation metrics per epoch and
# keeping the weights of the epoch with the best validation accuracy.
#
# The loop delegates to train_epoch / eval_model defined above. The previous
# inline loop used GradScaler, autocast and tqdm without importing them, and
# gated optimizer.step() on `(epoch + 1) % accumulation_steps`, which is never
# true for 3 epochs with 4 accumulation steps — so no weight was ever updated
# and the LR scheduler was never advanced.

# Set the number of epochs
EPOCHS = 3

# Set initial values for history and best accuracy
history = defaultdict(list)
best_accuracy = 0

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Transfer the model to the device
model.to(device)

# Rebuild the linear schedule so the learning rate decays over exactly the
# number of optimizer steps this loop performs (batches per epoch * EPOCHS).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_data_loader) * EPOCHS
)

# Iterate over epochs
for epoch in range(EPOCHS):
    # Show epoch details
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    # Training phase: one full pass with an optimizer/scheduler step per batch
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    # The helpers return accuracy as a tensor; store plain floats so the
    # history can be plotted and compared regardless of device.
    train_acc = train_acc.item()

    print(f"Train loss {train_loss} accuracy {train_acc}")

    # Validation phase
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    val_acc = val_acc.item()

    print(f"Val   loss {val_loss} accuracy {val_acc}")
    print()

    # Record history
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Save the best model
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

In [None]:
import matplotlib.pyplot as plt

# Plot training and validation accuracy per epoch, as recorded in `history`
# by the training loop.
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

# Graph settings
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
#Model Evaluation

In [None]:
# Evaluate model on the test set (the loss returned by eval_model is discarded).
# NOTE(review): this scores `model` as the training loop left it; no visible cell
# reloads 'best_model_state.bin' — confirm which weights should be reported.
test_acc, _ = eval_model(model, test_data_loader, loss_fn, device, len(df_test))

# eval_model returns the accuracy as a tensor; extract the Python float.
test_accuracy_value = test_acc.item()

print("Test Accuracy:", test_accuracy_value)

In [None]:
def get_predictions(model, data_loader, device):
    """Collect texts, predicted classes, class probabilities and true labels.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of logits per example.
        data_loader: iterable of batch dicts with "review_text", "input_ids",
            "attention_mask" and "targets".
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        Four lists aligned by example: review texts, predicted class ids,
        per-class probabilities and true class ids (tensor entries live on CPU).
    """
    model = model.eval()

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            # The model emits raw logits; normalise them so `prediction_probs`
            # really holds probabilities (each row sums to 1), matching the
            # single-review helper further down.
            probs = torch.softmax(outputs, dim=1)

            review_texts.extend(texts)
            predictions.extend(preds.cpu())
            prediction_probs.extend(probs.cpu())
            real_values.extend(targets.cpu())

    return review_texts, predictions, prediction_probs, real_values

In [None]:
# Ensure that the device is defined
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run the model over the test loader and keep texts, predictions,
# per-class scores and true labels for the reports below.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader,
    device
)

In [None]:
from sklearn.metrics import classification_report

# Lower-case class labels for the report; this rebinds the `class_names`
# list defined earlier in the sentiment-distribution plotting cell.
class_names = ['negative', 'neutral', 'positive']

# Print per-class precision/recall/F1 for the test predictions.
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

def show_confusion_matrix(confusion_matrix, class_names):
    """Draw ``confusion_matrix`` as an annotated blue heatmap.

    Tick labels are re-applied with the y labels horizontal and the x labels
    rotated by 30 degrees. ``class_names`` is accepted but not read here; the
    labels come from the matrix passed in.
    """
    heatmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

    y_axis = heatmap.yaxis
    x_axis = heatmap.xaxis
    y_axis.set_ticklabels(y_axis.get_ticklabels(), rotation=0, ha='right')
    x_axis.set_ticklabels(x_axis.get_ticklabels(), rotation=30, ha='right')

    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

# Calculate confusion matrix and label its rows (true) and columns (predicted)
# with the class names.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

# Show confusion matrix
show_confusion_matrix(df_cm, class_names)

In [None]:
#Predicting on raw text

In [None]:
def get_predictions(model, review_text, tokenizer, max_len):
    """Predict the sentiment class of a single raw review string.

    NOTE: this definition reuses the name of the loader-based
    ``get_predictions`` defined earlier and replaces it once this cell runs.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``
            and returning a (1, n_classes) tensor of logits.
        review_text: the text to classify.
        tokenizer: tokenizer exposing ``encode_plus``.
        max_len: length the encoded text is padded or truncated to.

    Returns:
        (predicted_class, probabilities): the index of the largest logit and
        the softmax probabilities as a plain list.
    """
    model = model.eval()

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
    # `truncation=True` makes the cut at `max_len` explicit.
    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=max_len,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Run on whatever device the model lives on instead of depending on a
    # notebook-level `device` variable having been set beforehand.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = encoded_review['input_ids'].to(target_device)
    attention_mask = encoded_review['attention_mask'].to(target_device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, prediction = torch.max(outputs, dim=1)
        prediction_prob = torch.softmax(outputs, dim=1)

    return prediction.item(), prediction_prob.squeeze().tolist()

# Example review text
review_text = "I love completing my todos! Best app ever!!!"

# Get the predicted class index and class probabilities for the example review.
predicted_class, predicted_probabilities = get_predictions(model, review_text, tokenizer, MAX_LEN)

print(f"Predicted class: {predicted_class}")
print(f"Predicted probabilities: {predicted_probabilities}")

In [None]:
# Encode the example review, run it through the model and print the predicted
# sentiment name. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True`; `truncation=True` makes the cut at MAX_LEN explicit.
encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

with torch.no_grad():
    output = model(input_ids, attention_mask)
    # Predicted class is the index of the largest logit.
    _, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction.item()]}')

In [None]:
#ENHANCING THE MODEL TRAINING WITH MULTILINGUAL DATASET

In [None]:
import os
import pandas as pd

def combine_datasets(folder_path):
    """Concatenate the train/test/valid CSVs of every language sub-folder.

    Each direct sub-directory of ``folder_path`` is treated as one language and
    must contain ``train.csv``, ``test.csv`` and ``valid.csv``.

    Args:
        folder_path: directory holding one sub-folder per language.

    Returns:
        One DataFrame with all rows, plus a `language` column (the sub-folder
        name) and a `split` column (the CSV file the row came from).

    Raises:
        ValueError: if ``folder_path`` contains no language sub-folder.
    """
    split_files = ('train.csv', 'test.csv', 'valid.csv')
    frames = []

    # Sorted so the row order does not depend on the file system's listing order.
    for language_folder in sorted(os.listdir(folder_path)):
        language_folder_path = os.path.join(folder_path, language_folder)
        if not os.path.isdir(language_folder_path):
            continue

        for split_file in split_files:
            split_df = pd.read_csv(os.path.join(language_folder_path, split_file))
            # Tag rows with the language itself; the file name the rows came
            # from is kept separately so the original split stays recoverable.
            split_df['language'] = language_folder
            split_df['split'] = split_file
            frames.append(split_df)

    if not frames:
        raise ValueError(f"No language folders found in {folder_path!r}")

    # Concatenate dataframes for all languages
    return pd.concat(frames, ignore_index=True)

# Path to the folder containing language-wise datasets (relative to the working directory)
folder_path = "multilingual_dataset"

# Combine the train/test/valid files of every language into one DataFrame.
multilingual_dataset = combine_datasets(folder_path)

# Display the first rows of the combined dataset
print(multilingual_dataset.head())

In [None]:
import os
import pandas as pd

# Path to the folder containing language-wise datasets
folder_path = "multilingual_dataset"

# One dictionary per split (train, test, valid), keyed by language folder name.
train_dfs = {}
test_dfs = {}
valid_dfs = {}

# Loop through each language folder
for language_folder in os.listdir(folder_path):
    language_folder_path = os.path.join(folder_path, language_folder)
    if os.path.isdir(language_folder_path):
        # Read train, test, and valid CSV files for the current language
        train_dfs[language_folder] = pd.read_csv(os.path.join(language_folder_path, 'train.csv'))
        test_dfs[language_folder] = pd.read_csv(os.path.join(language_folder_path, 'test.csv'))
        valid_dfs[language_folder] = pd.read_csv(os.path.join(language_folder_path, 'valid.csv'))

# Access the dataframes for a specific language, here Hindi.
# The training frame must come from `train_dfs`; it was previously read from
# `test_dfs`, which made the "train" data a copy of the test data.
hindi_train_df = train_dfs["hindi"]
hindi_test_df = test_dfs["hindi"]
hindi_valid_df = valid_dfs["hindi"]

In [None]:
# Preview the first rows of each Hindi split.
print("\nHindi Train Data:")
print(hindi_train_df.head())

print("\nHindi Test Data:")
print(hindi_test_df.head())

print("\nHindi Validation Data:")
print(hindi_valid_df.head())


In [None]:
import torch.nn as nn
from transformers import BertModel

class SentimentClassifier(nn.Module):
    """Multilingual BERT encoder followed by dropout and a linear head.

    NOTE: this reuses the name of the ``SentimentClassifier`` defined earlier in
    the notebook and replaces it once this cell runs.

    Args:
        num_classes: number of output classes of the linear head.
    """

    def __init__(self, num_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and attention masks."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Read the pooled representation by attribute, as the first classifier
        # in this notebook does. Tuple-unpacking the result (`_, pooled = ...`)
        # iterates over the output's keys on current transformers releases and
        # would hand a string to the dropout layer.
        pooled_output = outputs.pooler_output
        output = self.drop(pooled_output)
        return self.out(output)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

def train(model, train_loader, val_loader, optimizer, loss_fn, device, epochs=5):
    """Train ``model`` for ``epochs`` passes, printing metrics after each one.

    Every batch dict must provide 'input_ids', 'attention_mask' and 'targets'.
    After each epoch the model is scored on ``val_loader`` with ``evaluate``
    and the mean training loss, validation loss and validation accuracy
    (in percent) are printed. Nothing is returned.
    """
    model.to(device)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for batch in train_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['targets'].to(device)

            optimizer.zero_grad()
            loss = loss_fn(model(ids, mask), labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean loss per batch for this epoch.
        train_loss = running_loss / len(train_loader)

        val_loss, val_acc = evaluate(model, val_loader, loss_fn, device)

        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)
        print(f'Training Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.2f}%')
        print()

def evaluate(model, val_loader, loss_fn, device):
    """Score ``model`` on ``val_loader`` with gradients disabled.

    Every batch dict must provide 'input_ids', 'attention_mask' and 'targets'.

    Returns:
        (val_loss, val_acc): the mean loss per batch and the accuracy in percent.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in val_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['targets'].to(device)

            logits = model(ids, mask)
            loss_sum += loss_fn(logits, labels).item()

            # Predicted class is the index of the largest logit.
            n_correct += (logits.max(1).indices == labels).sum().item()
            n_seen += labels.size(0)

    return loss_sum / len(val_loader), n_correct / n_seen * 100

In [None]:
import torch
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Step 1: Tokenization — rebinds `tokenizer` to the multilingual cased checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Step 2: Encoding text data
class CustomDataset(Dataset):
    """Torch dataset over a DataFrame with `text` and `label` columns.

    Args:
        dataframe: DataFrame holding the `text` and `label` columns.
        tokenizer: tokenizer exposing ``encode_plus``.
        max_len: length every encoded text is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the flattened encoding tensors and the target for row ``index``."""
        # Look rows up by position. A DataLoader asks for 0..len-1, but a frame
        # produced by train_test_split keeps its original (shuffled, gappy) index
        # labels, so label-based access (`self.data.text[index]`) raises KeyError
        # or returns the wrong row.
        row = self.data.iloc[index]
        text = str(row['text'])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # encode_plus returns (1, max_len) tensors; flatten to (max_len,)
        input_ids = inputs['input_ids'].flatten()
        attention_mask = inputs['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'targets': torch.tensor(row['label'], dtype=torch.long)
        }

# Step 3: Splitting the data into train, validation, and test sets:
# 90% train, then the remaining 10% is halved into validation and test.
# NOTE(review): `multilingual_dataset` already concatenates each language's
# train/test/valid files, so this re-splits across the original splits — confirm intended.
train_data, val_data = train_test_split(multilingual_dataset, test_size=0.1, random_state=42)
val_data, test_data = train_test_split(val_data, test_size=0.5, random_state=42)

# Step 4: Create data loaders
MAX_LEN = 128  # rebinds the MAX_LEN = 160 set earlier in the notebook
BATCH_SIZE = 16

train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

class CustomDataset(Dataset):
    """Torch dataset over parallel collections of texts and labels.

    NOTE: this reuses the name ``CustomDataset`` from the previous cell and
    replaces it once this cell runs. Items expose the target under the key
    'label'.

    Args:
        texts: indexable collection of texts.
        labels: indexable collection of labels convertible with ``int()``.
        tokenizer: tokenizer exposing ``encode_plus``.
        max_len: length every encoded text is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, flattened encoding tensors and label for ``idx``."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])  # Convert label to integer

        # `truncation=True` is required alongside `padding='max_length'`:
        # padding only lengthens short texts, so without truncation a long text
        # yields a longer tensor and batches can no longer be stacked.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # encode_plus returns (1, max_len) tensors; flatten to (max_len,)
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Define paths to train, test, and validation CSV files for Hindi
# (relative to the working directory).
train_file = "multilingual_dataset/hindi/train.csv"
test_file = "multilingual_dataset/hindi/test.csv"
valid_file = "multilingual_dataset/hindi/valid.csv"

# Read train, test, and validation data (rebinds `train_data` / `test_data`
# from the earlier multilingual split cell).
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
valid_data = pd.read_csv(valid_file)

# Convert labels to integers
# NOTE(review): assumes the CSV `label` column holds exactly these three strings;
# any other value becomes NaN under .map — TODO confirm against the data.
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
train_data['label'] = train_data['label'].map(label_mapping)
test_data['label'] = test_data['label'].map(label_mapping)
valid_data['label'] = valid_data['label'].map(label_mapping)

# Tokenize and encode each split in one call, with truncation and padding enabled.
train_encodings = tokenizer(train_data['text'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_data['text'].tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_data['text'].tolist(), truncation=True, padding=True)

# Dataset over pre-computed tokenizer encodings
class CustomDataset(torch.utils.data.Dataset):
    """Serve already-encoded examples together with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence aligned with it. Each item is a dict holding one
    tensor per encoding field plus the label under the key 'labels'.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Create one dataset per split from the encodings and the integer label lists
# (rebinds `train_dataset` / `test_dataset` from the earlier multilingual cell).
train_dataset = CustomDataset(train_encodings, train_data['label'].tolist())
test_dataset = CustomDataset(test_encodings, test_data['label'].tolist())
valid_dataset = CustomDataset(valid_encodings, valid_data['label'].tolist())

# Define model: multilingual BERT with a 3-class sequence classification head
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

# Define data loaders (built before the scheduler, which needs the batch count)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

# Number of passes over the training data; adjust as needed
NUM_EPOCHS = 3

# Define optimizer and learning rate scheduler.
# scheduler.step() runs once per batch below, so the schedule must span
# batches-per-epoch * epochs. It previously used len(train_dataset), i.e. the
# number of samples, which does not match the number of optimizer steps.
optimizer = AdamW(model.parameters(), lr=5e-5)
total_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    # Keep the warmup from exceeding the whole run on small datasets.
    num_warmup_steps=min(100, total_training_steps),
    num_training_steps=total_training_steps
)

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(train_loader)}")

# Evaluate the model on the test loader with gradients disabled.
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # No labels are passed here, so only the logits are used.
        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class is the index of the largest logit.
        preds = torch.argmax(outputs.logits, dim=1).cpu().detach().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().detach().numpy())

# Calculate accuracy over all collected test predictions
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy}")

In [None]:
#improvements

In [None]:
import pandas as pd
from transformers import BertTokenizer
import torch

# Load the Hindi CSV file without header, so columns are numbered 0, 1, ...
# This rebinds `df`, which held the reviews dataset earlier in the notebook.
df = pd.read_csv("hindi.csv", header=None)

# Tokenize the text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Function to tokenize the text
def tokenize_text(text):
    """Encode ``text`` to exactly 128 tokens with the notebook-level tokenizer.

    Args:
        text: the text to encode; coerced with ``str()`` so a non-string cell
            (for example a missing value) does not raise.

    Returns:
        The tokenizer's encoding, with 'input_ids' and 'attention_mask' as
        PyTorch tensors of shape (1, 128).
    """
    return tokenizer.encode_plus(
        str(text),
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        # Padding only lengthens short texts; truncation is needed so long
        # texts also come out at 128 tokens and the results can be concatenated.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Encode the text column (column 0) one row at a time
encoded_rows = [tokenize_text(text) for text in df[0]]

# Stack the per-row tensors into one tensor each
input_ids = torch.cat([encoded['input_ids'] for encoded in encoded_rows], dim=0)
attention_masks = torch.cat([encoded['attention_mask'] for encoded in encoded_rows], dim=0)

# Persist the preprocessed tensors to disk
for tensor, path in ((input_ids, "input_ids.pt"), (attention_masks, "attention_masks.pt")):
    torch.save(tensor, path)

In [None]:
import matplotlib.pyplot as plt

# How many encoded samples to inspect
num_samples = 5

# Show the decoded text, ids and mask of the first few samples
for sample_idx in range(num_samples):
    sample_ids = input_ids[sample_idx]
    sample_mask = attention_masks[sample_idx]

    # Turn the token ids back into readable text (special tokens dropped)
    decoded_text = tokenizer.decode(sample_ids, skip_special_tokens=True)

    print("Sample", sample_idx + 1, ":")
    print("Text:", decoded_text)
    print("Input IDs:", sample_ids)
    print("Attention Mask:", sample_mask)
    print()

    # Draw the mask as a single-row image: one cell per token position
    mask_row = sample_mask.cpu().detach().numpy().reshape(1, -1)
    fig, ax = plt.subplots(figsize=(10, 1))
    ax.imshow(mask_row, cmap='gray', aspect='auto')
    ax.set_title("Attention Mask")
    ax.set_xlabel("Token Position")
    ax.set_ylabel("Sample")
    plt.show()

In [None]:
import pandas as pd
import numpy as np

# Load data (no header row in the file; name the two columns explicitly)
data = pd.read_csv("hindi.csv", header=None, names=["text", "label"])

# Show the dtype pandas inferred for the label column
print(data['label'].dtype)

# Coerce labels to numbers (unparseable values become NaN) and drop the NaN rows
# NOTE(review): a later cell compares labels to the string "positive"; if the labels
# are strings, every row is coerced to NaN and dropped here -- confirm the label format.
numeric_label = pd.to_numeric(data['label'], errors='coerce')
data = data.assign(label=numeric_label).dropna(subset=['label'])

# One-dimensional long tensor of labels
labels = torch.tensor(data['label'].to_numpy(), dtype=torch.long)

# Now, you can proceed with the rest of the code for preprocessing and training

In [None]:
import pandas as pd

# Load data
data = pd.read_csv("hindi.csv", names=["text", "label"], header=None)

# Sanity-check the load: row count and a preview
print(f"Number of rows in the dataset: {len(data)}")
print("First few rows of the dataset:")
print(data.head())

In [None]:
import pandas as pd

# Read the CSV file with pandas and preview the first rows.
# NOTE(review): unlike the other loads of this file, no header=None is passed here, so
# the first data row is consumed as the column names (a later cell uses a Hindi sentence
# as a column name because of this). Confirm whether that is intended.
data = pd.read_csv('hindi.csv')
data.head()

In [None]:
# Column names, non-null counts and dtypes of the loaded frame
data.info()

In [None]:
data.describe()  # descriptive statistics of the loaded frame

In [None]:
null_values = data.isnull().sum()  # number of missing values per column
null_values.index[0]  # display the name of the first column

In [None]:
# null_values is indexed by column name, so the counts are read by position with .iloc
# (plain null_values[0] relies on the deprecated positional fallback of Series.__getitem__).
print('There are {} missing values for {} and {} missing values for {}.'.format(null_values.iloc[0],null_values.index[0],null_values.iloc[1],null_values.index[1]))

In [None]:
# Count rows that are exact repeats of an earlier row
num_duplicates = data.duplicated().sum()
print(f'There are {num_duplicates} duplicate reviews present in the dataset')

In [None]:
# View the column labels of the DataFrame
print(data.columns)

In [None]:
# The reviews are in the first column. Its name is looked up rather than hard-coded:
# when the CSV is read without header=None the name is just the first review sentence,
# and a literal copy of that sentence breaks as soon as the file or the load changes.
review_column_name = data.columns[0]

# View duplicate reviews (rows whose review text already appeared earlier), sorted by text
duplicated_review = data[data.duplicated(subset=[review_column_name])].sort_values(review_column_name)
duplicated_review.head()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence
data = data.drop_duplicates()

In [None]:
# Report the frame size after de-duplication
print(f'The dataset contains {data.shape[0]} rows and {data.shape[1]} columns after removing duplicates')

In [None]:
# Keep an independent copy of the frame as it is at this point
data_copy = data.copy()

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Define the clean_text function
def clean_text(text, lemmatize=True):
    """Placeholder text cleaner: currently returns ``text`` unchanged.

    Args:
        text: The raw text to clean.
        lemmatize: Accepted for future use; not read by the current body.

    Returns:
        The input ``text``, unmodified.
    """
    # Add your cleaning steps here
    return text

# Re-read the raw CSV (no header row) and label its two columns
data = pd.read_csv("hindi.csv", header=None)
data.columns = ['text', 'label']  # Replace 'text' and 'label' with appropriate column names

# Run every text through clean_text
data['text'] = data['text'].map(lambda raw_text: clean_text(raw_text, lemmatize=True))

In [None]:
# Encode the 'label' column as 1 for "positive" and 0 for every other value.
# NOTE(review): not idempotent -- running this cell a second time compares 0/1 against
# "positive" and turns every label into 0.
data['label'] = data['label'].eq("positive").astype("int64")

In [None]:
# Preview the frame after converting the labels to 0/1
data.head()

In [None]:
# Exploratory data analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count plot of the two classes (positive first, then negative)
sns.set(style="whitegrid", font_scale=1.2)
count_ax = sns.countplot(data=data, x='label', palette=['green', 'red'], order=[1, 0])
count_ax.set_xticks(np.arange(2))
count_ax.set_xticklabels(['positive', 'negative'])
count_ax.set_title('Target count for reviews')
plt.show()

In [None]:
# Absolute count and percentage share of each class
label_counts = data['label'].value_counts()
for class_name, class_value in (('Positive', 1), ('Negative', 0)):
    class_count = label_counts[class_value]
    print(f'{class_name} reviews are', round(class_count), 'i.e.', round(class_count / len(data) * 100, 2), '% of the dataset')

In [None]:
!pip install wordcloud
from wordcloud import WordCloud  # Add this import statement

# Word cloud built from all positive reviews joined into one string
positive_data = data.loc[data['label'] == 1, 'text']  # Assuming 'text' is the column containing the text data
positive_data_string = ' '.join(positive_data)
wc = WordCloud(max_words=2000, width=1200, height=600, background_color="white").generate(positive_data_string)

fig, cloud_ax = plt.subplots(figsize=(20, 20))
cloud_ax.imshow(wc, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Word cloud for positive reviews', fontsize=20)
plt.show()

In [None]:
# Word cloud built from all negative reviews joined into one string
negative_data = data.loc[data['label'] == 0, 'text']
negative_data_string = ' '.join(negative_data)
wc = WordCloud(max_words=2000, width=1200, height=600, background_color="white").generate(negative_data_string)

fig, cloud_ax = plt.subplots(figsize=(20, 20))
cloud_ax.imshow(wc, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Word cloud for negative reviews', fontsize=20)
plt.show()

In [None]:
'Hindi text is difficult to visualize in word-cloud format, so treat the plots above as a rough graphical impression only'

In [None]:
# Side-by-side histograms of review length in characters, one panel per class
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
text_len_positive = positive_data.str.len()
text_len_negative = negative_data.str.len()

char_panels = (
    (ax1, text_len_positive, 'green', 'Positive Reviews'),
    (ax2, text_len_negative, 'red', 'Negative Reviews'),
)
for axis, lengths, colour, heading in char_panels:
    axis.hist(lengths, color=colour)
    axis.set_title(heading)
    axis.set_xlabel('Number of Characters')
    axis.set_ylabel('Count')

fig.suptitle('Number of characters in texts')
plt.show()

In [None]:
# Side-by-side histograms of review length in whitespace-separated words
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
text_len_positive = positive_data.str.split().map(len)
text_len_negative = negative_data.str.split().map(len)

word_panels = (
    (ax1, text_len_positive, 'green', 'Positive Reviews'),
    (ax2, text_len_negative, 'red', 'Negative Reviews'),
)
for axis, lengths, colour, heading in word_panels:
    axis.hist(lengths, color=colour)
    axis.set_title(heading)
    axis.set_xlabel('Number of Words')
    axis.set_ylabel('Count')

fig.suptitle('Number of words in texts')
plt.show()

In [None]:
# Distribution (histogram + KDE) of the number of words per review, per class
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
word_positive = positive_data.str.split().apply(len)
word_negative = negative_data.str.split().apply(len)

for axis, word_counts, colour, heading in (
    (ax1, word_positive, 'green', 'Positive Reviews'),
    (ax2, word_negative, 'red', 'Negative Reviews'),
):
    sns.histplot(word_counts, ax=axis, color=colour, kde=True)
    axis.set_title(heading)
    axis.set_xlabel('Number of words per review')

fig.suptitle('Distribution of number of words per reviews')
plt.show()

In [None]:
# Distribution (histogram + KDE) of the mean word length within each review, per class
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

# Each entry is the list of word lengths of one review
word_positive = positive_data.str.split().apply(lambda words: [len(word) for word in words])
word_negative = negative_data.str.split().apply(lambda words: [len(word) for word in words])

for axis, word_lengths, colour, heading in (
    (ax1, word_positive, 'green', 'Positive Reviews'),
    (ax2, word_negative, 'red', 'Negative Reviews'),
):
    sns.histplot(word_lengths.map(np.mean), ax=axis, color=colour, kde=True)
    axis.set_title(heading)
    axis.set_xlabel('Average word length per review')

fig.suptitle('Distribution of average word length in each review')
plt.show()

In [None]:
def get_corpus(text):
    """Flatten an iterable of strings into one list of whitespace-separated words.

    Args:
        text: Iterable of strings (e.g. a text column).

    Returns:
        A list with every word of every string, in order of appearance.
    """
    return [word.strip() for document in text for word in document.split()]

# Build the flat word list from every text in the dataset, then preview the first five words
corpus = get_corpus(data['text'])  # Assuming 'text' is the column containing the text data
corpus[:5]

In [None]:
from collections import Counter

# Frequency table of the ten most frequent words in the corpus
counter = Counter(corpus)
most_common = pd.DataFrame(counter.most_common(10), columns=['corpus', 'count'])
most_common

In [None]:
# Horizontal bar chart of the most common words, smallest count at the bottom
most_common = most_common.sort_values('count')
bar_positions = range(len(most_common))

fig, bar_ax = plt.subplots(figsize=(10, 10))
bar_ax.barh(bar_positions, list(most_common['count']), align='center', color='blue')
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(list(most_common.corpus))
bar_ax.set_title('Most common words in the dataset')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_ngrams(review, n, g, token_pattern=r"(?u)\S+"):
    """Return the ``n`` most frequent ``g``-grams in a collection of texts.

    Args:
        review: Iterable of strings to analyse.
        n: Number of top n-grams to return.
        g: Size of the n-gram (1 = unigram, 2 = bigram, ...).
        token_pattern: Regex that defines a token. The default splits on
            whitespace. CountVectorizer's own default is built on ``\\w``,
            which does not match Devanagari combining vowel signs, so Hindi
            words would be cut into fragments.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(g, g), token_pattern=token_pattern)
    bag_of_words = vec.fit_transform(review)
    # Column sums of the document-term matrix = total occurrences per n-gram
    sum_words = np.asarray(bag_of_words.sum(axis=0)).ravel().tolist()
    words_freq = [(word, sum_words[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

def _plot_ngram_comparison(g, label, top_n=20):
    """Plot the top ``top_n`` ``g``-grams of positive vs. negative reviews side by side.

    Args:
        g: Size of the n-gram (1 = unigram, 2 = bigram, 3 = trigram).
        label: Name used in the figure title, e.g. ``'Unigram'``.
        top_n: How many n-grams to show per panel.

    Returns:
        A ``(positive, negative)`` pair of dicts mapping n-gram -> count.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(30, 15))
    top_by_class = []
    for axis, reviews, heading in (
        (ax1, positive_data, 'Positive reviews'),
        (ax2, negative_data, 'Negative reviews'),
    ):
        top_ngrams = dict(get_ngrams(reviews, top_n, g))
        frame = pd.DataFrame(list(top_ngrams.items()), columns=["Common_words", "Count"])
        sns.barplot(data=frame, x="Count", y="Common_words", orient='h', ax=axis)
        axis.set_title(heading)
        top_by_class.append(top_ngrams)
    fig.suptitle(f'{label} analysis for positive and negative reviews')
    plt.show()
    return tuple(top_by_class)


# Unigram
uni_positive, uni_negative = _plot_ngram_comparison(1, 'Unigram')

# Bigram
bi_positive, bi_negative = _plot_ngram_comparison(2, 'Bigram')

# Trigram
tri_positive, tri_negative = _plot_ngram_comparison(3, 'Trigram')

In [None]:
#PREDICTIVE MODELLING

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed, then separate texts from labels
train, test = train_test_split(data, test_size=0.2, random_state=42)
X_train = train['text']
y_train = train['label']
X_test = test['text']
y_test = test['label']


from transformers import BertTokenizer

# Load the BERT tokenizer
# NOTE(review): this is the lower-cased 'bert-base-uncased' checkpoint, while an earlier
# cell loads 'bert-base-multilingual-cased' for the same Hindi file -- confirm which
# tokenizer matches the model being fine-tuned.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the training and testing data (truncate over-long texts, pad to the longest)
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(X_train.tolist(), **encode_options)
test_encodings = tokenizer(X_test.tolist(), **encode_options)

import torch

class HindiDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping of field name (e.g. ``input_ids``) to a sequence
            with one entry per sample.
        labels: Sequence of labels, one per sample, in the same order as the
            encodings. A pandas Series is accepted; only its order is used.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Store labels positionally. A Series coming out of train_test_split keeps
        # its shuffled original index, so labels[idx] would be a lookup by index
        # label (KeyError or the wrong row) rather than by position.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.labels)

# Create PyTorch datasets: wrap the train and test encodings together with their labels
train_dataset = HindiDataset(train_encodings, y_train)
test_dataset = HindiDataset(test_encodings, y_test)

In [None]:
# 80/20 train/test split with a fixed seed
train, test = train_test_split(data, test_size=0.2, random_state=42)
Xtrain = train['text']
ytrain = train['label']
Xtest = test['text']
ytest = test['label']

# Tokenize both splits straight into padded, truncated PyTorch tensors
tensor_options = dict(padding=True, truncation=True, return_tensors='pt')
Xtrain_tokens = tokenizer(Xtrain.tolist(), **tensor_options)
Xtest_tokens = tokenizer(Xtest.tolist(), **tensor_options)

In [None]:
# Preview the saved copy, then renumber its rows 0..n-1 (old index discarded)
print(data_copy.head())
data_copy = data_copy.reset_index(drop=True)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load data; the file has no header row, so the column names are supplied here
data = pd.read_csv("hindi.csv", names=['text', 'label'])  # Replace "hindi.csv" with the path to your dataset

# Extract features and target
X = data['text']
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer.
# Tokens are split on whitespace: the default token pattern is built on \w, which does
# not match Devanagari combining vowel signs and would cut Hindi words into fragments.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")

# Fit on the training texts only, then reuse the fitted vocabulary for the test texts
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Multinomial Naive Bayes classifier
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)

# Make predictions
predictions = mnb.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Plot confusion matrix (rows = true label, columns = predicted label)
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Make predictions (recomputed from the fitted classifier on the TF-IDF test matrix)
predictions = mnb.predict(X_test_tfidf)

# Print the classification report for the test split
print(classification_report(y_test, predictions))

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the TF-IDF test matrix and report accuracy as a percentage
predictions = mnb.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {100 * accuracy:.2f}%")