In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the competition's labelled training CSV and preview the first rows.
train_csv_path = "/kaggle/input/aiquest-bangla-sentiment-analysis-competition/train.csv"
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Dimensions of the training frame as (rows, columns).
train.shape

In [None]:
# Number of rows whose `text` value is missing.
train["text"].isna().sum()

In [None]:
# Number of rows per distinct `sentiment` value, to gauge class balance.
train['sentiment'].value_counts()

In [None]:
import re
import string

# The helpers below are built once instead of on every call: text_preprocessing
# is applied row-by-row, so rebuilding them per row is pure overhead.

# URLs (plus a bare "http:"/"https:" scheme left over from malformed links).
_URL_PATTERN = re.compile(r'https?:\/\/\S+|www\.\S+|ftp:\/\/\S+|mailto:\S+|https?:')

# @mentions, #hashtags and anything glued to a slash.
_MENTION_PATTERN = re.compile(r'[@#\/]\S+')

# In Python 3, \d already matches every Unicode decimal digit; the Bangla range
# is spelled out only to make the intent explicit.
_DIGIT_PATTERN = re.compile(r'[\d০-৯]+')

# ASCII punctuation plus Bangla / typographic punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + '।,?!:;"\'()-—॥‘’“”')

_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "\U00002700-\U000027BF"  # Dingbats
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B50-\U00002B55"  # Stars and other symbols
    "]+",
    flags=re.UNICODE
)

# Bangla stopwords. Some entries appear more than once; they are kept verbatim
# in case the copies differ in Unicode encoding -- TODO confirm and deduplicate.
_STOP_WORDS = frozenset({
    'এ', 'হয়', 'কি', 'কী', 'এর', 'কে', 'যে', 'এই', 'বা', 'সব', 'টি', 'তা',
    'সে', 'তাই', 'সেই', 'তার', 'আগে', 'যদি', 'আছে', 'আমি', 'এবং', 'করে', 'কার', 'এটি', 'হতে', 'যায়',
    'আরও', 'যাক', 'খুব', 'উপর', 'পরে', 'হবে', 'কেন', 'কখন', 'সকল', 'হয়', 'ঠিক', 'একই', 'কোন',
    'ছিল', 'খুবই', 'কোনো', 'অধীন', 'যারা', 'তারা', 'গুলি', 'তাকে', 'সেটা', 'সময়', 'আমার', 'আমরা', 'সবার',
    'উভয়', 'একটা', 'আপনি', 'নিয়ে', 'একটি', 'বন্ধ', 'জন্য','জন্য', 'শুধু', 'যেটা', 'উচিত', 'মাঝে', 'থেকে', 'করবে',
    'আবার', 'উপরে', 'সেটি', 'কিছু', 'কারণ', 'যেমন', 'তিনি', 'মধ্যে', 'আমাকে', 'করছেন', 'তুলনা', 'তারপর',
    'নিজেই', 'থাকার', 'নিজের', 'পারেন', 'একবার', 'সঙ্গে', 'ইচ্ছা', 'নীচের', 'এগুলো', 'আপনার', 'অধীনে', 'কিংবা',
    'এখানে', 'তাহলে', 'কয়েক', 'জন্যে', 'হচ্ছে', 'তাদের', 'কোথায়', 'কিন্তু', 'নিজেকে', 'যতক্ষণ', 'আমাদের',
    'দ্বারা', 'হয়েছে', 'সঙ্গে', 'সেখানে', 'কিভাবে', 'মাধ্যমে', 'নিজেদের', 'তুলনায়', 'প্রতিটি',
    'তাদেরকে', 'ইত্যাদি', 'সম্পর্কে', 'সর্বাধিক', 'বিরুদ্ধে', 'অন্যান্য','প্রায়ই',
})


def text_preprocessing(text):
    """Clean one raw text entry for bag-of-words style modelling.

    Steps, in order:
      1. strip URLs,
      2. strip @mentions, #hashtags and slash-prefixed fragments,
      3. strip English and Bangla digits,
      4. strip ASCII and Bangla punctuation,
      5. strip emoji / pictographs,
      6. drop Bangla stopwords and collapse all whitespace to single spaces.

    Args:
        text: The raw text. Any non-string value (for example the float NaN
            pandas uses for a missing cell, or None) is treated as empty.

    Returns:
        str: The cleaned text with tokens separated by single spaces; an
        empty string when nothing is left or the input was not a string.
    """
    # Missing cells arrive as NaN/None; the regex calls below would raise
    # TypeError on them, so map them to an empty document instead.
    if not isinstance(text, str):
        return ''

    # URLs must go first. If the slash rule or the digit rule runs before
    # this, a link is already mutilated ("ftp://x" -> "ftp:", "www.123" ->
    # "www.") and its remains survive as junk tokens.
    text = _URL_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    text = _DIGIT_PATTERN.sub('', text)

    text = text.translate(_PUNCT_TABLE)
    text = _EMOJI_PATTERN.sub('', text)

    # str.split() with no argument splits on any whitespace run (including
    # "\n" and a lone "\r") and discards empty strings, so the join below also
    # normalises whitespace without fusing words across a carriage return.
    kept_words = [word for word in text.split() if word not in _STOP_WORDS]
    return " ".join(kept_words)


In [None]:
# Clean every text entry with text_preprocessing, then preview the result.
cleaned_text = train["text"].map(text_preprocessing)
train["text"] = cleaned_text
train.head()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Compare several classical classifiers on TF-IDF features, pick the one with
# the best held-out macro F1, refit it on all rows and write its predictions.

# Convert sentiment labels to numeric codes
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

train['sentiment_encoded'] = train['sentiment'].map(label_mapping)

# Series.map yields NaN for any label missing from label_mapping; fail here
# with a clear message instead of deep inside a model's fit().
unmapped_rows = train['sentiment_encoded'].isna()
if unmapped_rows.any():
    unknown_labels = sorted(train.loc[unmapped_rows, 'sentiment'].astype(str).unique())
    raise ValueError(f"Sentiment labels missing from label_mapping: {unknown_labels}")
train['sentiment_encoded'] = train['sentiment_encoded'].astype(int)

# Define Features (X) and Target (y)
X = train['text'].fillna('')  # TfidfVectorizer rejects NaN documents
y = train['sentiment_encoded']

# Split the RAW text first. Fitting TF-IDF before splitting would let the
# held-out rows influence the vocabulary and IDF weights (data leakage) and
# inflate the scores used for model selection. stratify=y keeps the class
# proportions equal in both parts.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization: learn vocabulary/IDF from the training part only
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Define models (seeded wherever the estimator is stochastic, so that the
# comparison is reproducible across runs)
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Naïve Bayes": MultinomialNB(),
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
}

# Dictionary to store F1 scores
model_f1_scores = {}
best_model = None
best_model_name = None
# Start below the minimum possible F1 (0.0) so that a model is always selected,
# even if every candidate scores exactly 0.
best_f1 = -1.0

# Explicit class ids keep the report rows aligned with target_names.
class_ids = sorted(reverse_label_mapping)

# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    model_f1_scores[name] = f1

    print(f"{name} Macro F1 Score: {f1:.4f}")
    print(f"{name} Classification Report:\n")
    print(classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=['Negative', 'Neutral', 'Positive']
    ))
    print("=" * 50)

    # Track best model
    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_model_name = name

print(f"\nBest Model: {best_model_name} with Macro F1 Score: {best_f1:.4f}")

# Retrain best model on full training data. The vectorizer is refitted on the
# whole corpus so the final model and `vectorizer` share one vocabulary
# (X_train / X_test above belong to the earlier, train-only fit).
X_tfidf = vectorizer.fit_transform(X)
best_model.fit(X_tfidf, y)

# Predict on full training data
# NOTE(review): these are predictions for the rows the model was just trained
# on, not for a held-out/test file -- confirm this is what the submission needs.
y_pred_train = best_model.predict(X_tfidf)
predicted_labels = [reverse_label_mapping[p] for p in y_pred_train]

# Create submission DataFrame
submission = pd.DataFrame({
    "id": range(len(train)),
    "sentiment": predicted_labels
})

# Save to CSV
submission.to_csv("submission.csv", index=False)
print("\nPredictions saved in `submission.csv` using the best model based on Macro F1 Score!")


In [None]:
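# NOTE: disabled experiment, kept for reference -- fine-tunes a Bangla BERT
# classifier on an 80/20 stratified split with early stopping on validation loss.
# import torch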
# import numpy as np
# from sklearn.metrics import classification_report, f1_score, confusion_matrix
# from transformers import (
#     BertForSequenceClassification,
#     AdamW,
#     AutoTokenizer,
#     get_linear_schedule_with_warmup
# )
# from torch.utils.data import Dataset, DataLoader
# from tqdm import tqdm
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split

# # Set device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Hyperparameters
# LEARNING_RATE = 2e-5
# BATCH_SIZE = 32  # Updated to 32
# NUM_EPOCHS = 10
# PATIENCE = 3  # Early stopping patience
# LABELS = [0, 1, 2]  # Changed to numeric labels
# MODEL_NAME = "sagorsarker/bangla-bert-base"

# label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
# train['sentiment_encoded'] = train['sentiment'].map(label_mapping)

# train_df, val_df = train_test_split(train, test_size=0.2, random_state=42, stratify=train["sentiment_encoded"])


# # Initialize tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(LABELS)).to(device)

# # Optimizer and Scheduler
# optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
# num_train_steps = len(train_df) // BATCH_SIZE * NUM_EPOCHS
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

# # Dataset Class
# class BanglaDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_length=128):
#         self.dataframe = dataframe
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         text = self.dataframe.iloc[idx]["text"]
#         label = self.dataframe.iloc[idx]["sentiment_encoded"]  # Updated label usage

#         encoding = self.tokenizer(
#             text,
#             max_length=self.max_length,
#             padding="max_length",
#             truncation=True,
#             return_tensors="pt"
#         )

#         return {
#             "input_ids": encoding["input_ids"].squeeze(0),
#             "attention_mask": encoding["attention_mask"].squeeze(0),
#             "label": torch.tensor(label, dtype=torch.long)
#         }

# # Prepare DataLoaders
# train_dataset = BanglaDataset(train_df, tokenizer)
# val_dataset = BanglaDataset(val_df, tokenizer)

# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# # Function to evaluate model
# def evaluate(model, val_loader):
#     model.eval()
#     total_loss = 0
#     correct = 0
#     total = 0
#     with torch.no_grad():
#         for batch in val_loader:
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             labels = batch["label"].to(device)

#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss
#             logits = outputs.logits

#             total_loss += loss.item()
#             preds = torch.argmax(logits, dim=1)
#             correct += (preds == labels).sum().item()
#             total += labels.size(0)

#     avg_loss = total_loss / len(val_loader)
#     accuracy = correct / total
#     return avg_loss, accuracy

# # Training Loop with Early Stopping
# best_val_loss = float("inf")
# early_stop_counter = 0
# best_model_path = "best_bangla_bert.pth"

# for epoch in range(NUM_EPOCHS):
#     model.train()
#     total_train_loss = 0

#     for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} - Training"):
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["label"].to(device)

#         model.zero_grad()
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         total_train_loss += loss.item()

#         loss.backward()
#         optimizer.step()
#         scheduler.step()

#     avg_train_loss = total_train_loss / len(train_loader)

#     # Evaluate on validation set
#     val_loss, val_accuracy = evaluate(model, val_loader)

#     print(f"\nEpoch {epoch+1}/{NUM_EPOCHS} - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

#     # Save model if validation loss improves
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         early_stop_counter = 0
#         torch.save(model.state_dict(), best_model_path)
#         print(f"New best model saved at epoch {epoch+1}")
#     else:
#         early_stop_counter += 1
#         print(f"Validation loss increased ({early_stop_counter}/{PATIENCE})")

#     # Stop training if validation loss keeps degrading
#     if early_stop_counter >= PATIENCE:
#         print("Early stopping triggered. Training stopped.")
#         break

# # Load the best model before inference
# model.load_state_dict(torch.load(best_model_path))
# print("\nBest model loaded for inference.")

# # Function to get predictions
# def get_predictions(model, data_loader):
#     model.eval()
#     predictions, true_labels = [], []

#     with torch.no_grad():
#         for batch in data_loader:
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             labels = batch["label"].to(device)

#             outputs = model(input_ids, attention_mask=attention_mask)
#             logits = outputs.logits
#             preds = torch.argmax(logits, dim=1).cpu().numpy()

#             predictions.extend(preds)
#             true_labels.extend(labels.cpu().numpy())

#     return np.array(true_labels), np.array(predictions)

# # Get True Labels & Predictions
# y_true, y_pred = get_predictions(model, val_loader)

# # Classification Report
# print("Classification Report:")
# print(classification_report(y_true, y_pred, target_names=["negative", "neutral", "positive"]))

# # Macro F1 Score
# macro_f1 = f1_score(y_true, y_pred, average="macro")
# print(f"\nMacro F1 Score: {macro_f1:.4f}")

# # Confusion Matrix
# conf_matrix = confusion_matrix(y_true, y_pred)
# print("\nConfusion Matrix:")
# print(conf_matrix)


In [None]:
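# NOTE: disabled experiment, kept for reference -- scores the fine-tuned model on
# the training loader and writes those predictions to submission.csv.
# NOTE(review): train_loader is built with shuffle=True, so the predictions are
# not in row order and the sequential ids used for the submission will not line
# up with the original rows -- fix before re-enabling.
# from sklearn.metrics import classification_report, f1_score, confusion_matrix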
# import torch
# import numpy as np
# import pandas as pd

# # Function to get predictions on a given dataset
# def get_predictions(model, data_loader):
#     model.eval()
#     predictions, true_labels = [], []

#     with torch.no_grad():
#         for batch in data_loader:
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             labels = batch["label"].to(device)  # True labels

#             outputs = model(input_ids, attention_mask=attention_mask)
#             logits = outputs.logits
#             preds = torch.argmax(logits, dim=1).cpu().numpy()

#             predictions.extend(preds)
#             true_labels.extend(labels.cpu().numpy())

#     return np.array(true_labels), np.array(predictions)

# y_true_train, y_pred_train = get_predictions(model, train_loader)

# print("Classification Report (Train Data):")
# print(classification_report(y_true_train, y_pred_train, target_names=["negative", "neutral", "positive"]))

# macro_f1_train = f1_score(y_true_train, y_pred_train, average="macro")
# print(f"\nMacro F1 Score (Train Data): {macro_f1_train:.4f}")

# conf_matrix_train = confusion_matrix(y_true_train, y_pred_train)
# print("\nConfusion Matrix (Train Data):")
# print(conf_matrix_train)

# label_mapping = {0: "negative", 1: "neutral", 2: "positive"}
# predicted_labels_train = [label_mapping[p] for p in y_pred_train]


# submission = pd.DataFrame({
#     "id": range(len(predicted_labels_train)),  # Sequential IDs
#     "sentiment": predicted_labels_train
# })

# # Save to CSV
# submission.to_csv("submission.csv", index=False)
# print("\nPredictions saved in `submission.csv`!")

In [None]:
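# NOTE: disabled experiment, kept for reference -- trains on the full training
# set with no validation split, then predicts on the same rows and writes
# submission.csv.
# NOTE(review): MODEL_NAME is assigned twice in this cell; the second value
# ("csebuetnlp/banglabert_large") is the one actually used.
# import torch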
# import numpy as np
# import pandas as pd
# from transformers import (
#     BertForSequenceClassification,
#     AdamW,
#     AutoTokenizer,
#     get_linear_schedule_with_warmup
# )
# from torch.utils.data import Dataset, DataLoader
# from tqdm import tqdm

# # Set device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
# train['sentiment_encoded'] = train['sentiment'].map(label_mapping)


# # Hyperparameters
# LEARNING_RATE = 1e-5
# BATCH_SIZE = 8
# NUM_EPOCHS = 15
# MODEL_NAME = "sagorsarker/bangla-bert-base"

# from transformers import AutoModelForSequenceClassification, AutoTokenizer

# MODEL_NAME = "csebuetnlp/banglabert_large"

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# # Load pre-trained model for classification
# model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)  # Assuming 3 sentiment classes
# model.to(device)  # Move to GPU if available


# # Dataset Class
# class BanglaDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_length=128):
#         self.dataframe = dataframe
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         text = self.dataframe.iloc[idx]["text"]
#         label = self.dataframe.iloc[idx]["sentiment_encoded"]

#         encoding = self.tokenizer(
#             text,
#             max_length=self.max_length,
#             padding="max_length",
#             truncation=True,
#             return_tensors="pt"
#         )

#         return {
#             "input_ids": encoding["input_ids"].squeeze(0),
#             "attention_mask": encoding["attention_mask"].squeeze(0),
#             "label": torch.tensor(label, dtype=torch.long)
#         }

# # Load full training dataset
# train_dataset = BanglaDataset(train, tokenizer)
# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# # Optimizer and Scheduler
# optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=0.01)
# num_train_steps = len(train_loader) * NUM_EPOCHS
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

# # Training Loop
# for epoch in range(NUM_EPOCHS):
#     model.train()
#     total_train_loss = 0

#     for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} - Training"):
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["label"].to(device)

#         model.zero_grad()
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         total_train_loss += loss.item()

#         loss.backward()
#         optimizer.step()
#         scheduler.step()

#     print(f"\nEpoch {epoch+1}/{NUM_EPOCHS} - Train Loss: {total_train_loss / len(train_loader):.4f}")

# # Save trained model
# torch.save(model.state_dict(), "final_bangla_bert.pth")
# print("\nModel training complete. Model saved.")

# # Prediction Function
# def get_predictions(model, dataset):
#     model.eval()
#     predictions = []

#     with torch.no_grad():
#         for i in range(len(dataset)):
#             sample = dataset[i]
#             input_ids = sample["input_ids"].unsqueeze(0).to(device)
#             attention_mask = sample["attention_mask"].unsqueeze(0).to(device)

#             logits = model(input_ids, attention_mask=attention_mask).logits
#             pred = torch.argmax(logits, dim=1).item()
#             predictions.append(pred)

#     return np.array(predictions)

# # Predict on training data
# y_pred_train = get_predictions(model, train_dataset)

# # Convert numeric predictions to labels
# label_mapping = {0: "negative", 1: "neutral", 2: "positive"}
# predicted_labels = [label_mapping[p] for p in y_pred_train]

# # Create submission DataFrame
# submission = pd.DataFrame({
#     "id": range(len(train)),  # Sequential IDs
#     "sentiment": predicted_labels
# })

# # Save to CSV
# submission.to_csv("submission.csv", index=False)
# print("\nPredictions saved in `submission.csv`!")


In [None]:
# Sanity check: read the submission back and preview it. The file is written
# with the relative path "submission.csv", so read it via the same relative
# path instead of assuming the working directory is /kaggle/working.
sub = pd.read_csv("submission.csv")
sub.head()