In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Install required packages
!pip install langdetect
!pip install nltk
!pip install sbnltk
!pip install textblob
!pip install imblearn

# Import text processing libraries
import pandas as pd
import re
import string
from langdetect import detect
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sbnltk.Stemmer import stemmerOP
from sbnltk.Preprocessor import preprocessor
from sbnltk.Tokenizer import wordTokenizer, sentenceTokenizer
from textblob import TextBlob
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

# Load data
df = pd.read_excel('/content/drive/MyDrive/TechTalentAssignment/Data & Topics.xlsx')
df.drop_duplicates(inplace=True)
print(f"Dataset shape after removing duplicates: {df.shape}")

# Define text processing functions
# Emoji removal pattern
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # Emoticons
    u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    u"\U0001F680-\U0001F6FF"  # Transport & Map
    u"\U0001F1E0-\U0001F1FF"  # Flags
    u"\U00002700-\U000027BF"  # Dingbats
    u"\U0001F900-\U0001F9FF"  # Supplemental symbols
    u"\U0001FA70-\U0001FAFF"  # Extended-A (🫵)
    u"\U00002600-\U000026FF"  # Misc symbols
    u"\U0001F000-\U0001F02F"  # Mahjong
    u"\U0000203C"             # Double exclamation
    u"\uFE0F"                 # Variation Selector-16
    "]+",
    flags=re.UNICODE
)

# Custom punctuation (Bangla, Urdu, Arabic)
custom_punct = '।॥''""…،؛؟'
all_punct = string.punctuation + custom_punct

# Load stopwords
en_stop = set(stopwords.words('english'))

# Initialize processors
en_lem = WordNetLemmatizer()
bn_preprocessor = preprocessor()
bn_word_tokenizer = wordTokenizer()
bn_stemmer = stemmerOP()

# Detect Bengali characters
def is_bengali(word):
    return bool(re.search(r'[\u0980-\u09FF]', word))

# Clean and preprocess text
def preprocess_text(text):
    text = str(text)

    # Remove URLs, hashtags, mentions
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"@\w+", "", text)

    # Remove emojis
    text = emoji_pattern.sub('', text)

    # Remove punctuations
    text = re.sub(f"[{re.escape(all_punct)}]", "", text)

    # Normalize space
    text = re.sub(r'\s+', ' ', text).strip()

    try:
        lang = detect(text)
    except:
        lang = 'en'

    # For Bengali text, use SBNLTK tokenizer
    if (lang == 'bn'):
        try:
            text = bn_preprocessor.word_normalize(text)
            # Tokenize into words
            tokens = bn_word_tokenizer.basic_tokenizer(text)

            # Apply stemming to each token
            tokens = [bn_stemmer.stemWord(t) for t in tokens]
            # Apply dust remove to each token
            tokens = [bn_preprocessor.dust_removal(t) for t in tokens]
        except:
            # Fallback to NLTK if SBNLTK fails
            tokens = word_tokenize(text)
            tokens = [t.lower() for t in tokens if t.isalpha()]
    else:
        # For English, use NLTK with TextBlob correction
        try:
            textBlb = TextBlob(text)
            textBlb = textBlb.correct()
            text = textBlb.string
        except:
            # If TextBlob correction fails
            pass

        # Tokenize into words
        tokens = word_tokenize(text)
        tokens = [t.lower() for t in tokens if t.isalpha()]

    clean_tokens = []
    for token in tokens:
        if is_bengali(token):
            # Bengali tokens are already preprocessed and stemmed
            clean_tokens.append(token)
        else:
            if token not in en_stop:
                lemma = en_lem.lemmatize(token)
                clean_tokens.append(lemma)

    return ' '.join(clean_tokens)  # Return as space-separated string for TF-IDF

# Apply preprocessing to data
print("Applying text preprocessing...")
df['clean_context'] = df['Context'].apply(preprocess_text)

# Remove empty documents after cleaning
df = df[df['clean_context'].str.strip() != '']
print(f"Dataset shape after removing empty documents: {df.shape}")

# Check class distribution
X = df['clean_context']
y = df['Training Topic']

print("\nChecking original class distribution...")
class_counts = y.value_counts()
print(class_counts)

# Filter out classes with only one sample
print("\nRemoving classes with only one sample...")
valid_classes = class_counts[class_counts >= 2].index
filtered_df = df[df['Training Topic'].isin(valid_classes)]
print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(filtered_df)}")

# Get the new X and y from filtered dataset
X = filtered_df['clean_context']
y = filtered_df['Training Topic']



Mounted at /content/drive
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=8be502c944522b8ea42854d883dcaaa59f430db4d263f1ef74664af4d4316360
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Collecting sbnltk
  Downloading sbnltk-2.0.2.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting google_trans_new>=1.1.9 (from sbnltk)
  Downloading google_trans_new-1.1.9-py3-none-any.whl.metadata (5.2 kB

Downloading...
From: https://drive.google.com/uc?id=142XvJg9xdpgzuYD31Y4pm-ZVdMaWmtuq
To: /usr/local/lib/python3.11/dist-packages/sbnltk/dataset/download_link.txt
100%|██████████| 1.66k/1.66k [00:00<00:00, 4.39MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UIR2arP_6Fm0MqJVbcG2Q9iLCnjQriDB
To: /usr/local/lib/python3.11/dist-packages/sbnltk/dataset/bangla_word_list.txt
100%|██████████| 15.4M/15.4M [00:00<00:00, 148MB/s]
Downloading...
From: https://drive.google.com/uc?id=1e4x5FQrdGyEfBBzQ_DOJ9wBxZZ1F7bx7
To: /usr/local/lib/python3.11/dist-packages/sbnltk/dataset/stopword_list.txt
100%|██████████| 4.77k/4.77k [00:00<00:00, 8.82MB/s]
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_dat

Dataset shape after removing duplicates: (551, 3)


Downloading...
From: https://drive.google.com/uc?id=1ohRYMGAeq4RZTa3W-7DXEsaUCaVNIxgO
To: /usr/local/lib/python3.11/dist-packages/sbnltk/dataset/rootword_list.txt
100%|██████████| 2.00M/2.00M [00:00<00:00, 138MB/s]
Downloading...
From: https://drive.google.com/uc?id=1oBEK2DUs6Jt155AmMyc9565dHJJFZd_N
To: /usr/local/lib/python3.11/dist-packages/sbnltk/dataset/ner_static.txt
100%|██████████| 20.2M/20.2M [00:00<00:00, 44.0MB/s]


Applying text preprocessing...
Dataset shape after removing empty documents: (544, 4)

Checking original class distribution...
Training Topic
Mob Justice                   124
Law and Order                  66
Politics                       59
Islamic Fundamentalism         45
International affairs          34
Religion                       33
Corruption                     30
National Defence               27
Diplomacy                      24
Governance & Policy Reform     20
Women Rights                   19
Sports                         15
Terrorism                      12
Election                       11
Culture & Lifestyle            10
Trade & Commodity Price         6
Education                       4
Environment                     2
Natural Disaster                2
Religious Conflict              1
Name: count, dtype: int64

Removing classes with only one sample...
Original dataset size: 544
Filtered dataset size: 543

Performing train-test split...

Applying TF-IDF vectori

ValueError: With under-sampling methods, the number of samples in a class should be less or equal to the original number of samples. Originally, there is 27 samples and 30 samples are asked.

In [3]:
# Define a target for balancing classes
# For larger classes (>30 samples): undersample to 30
# For medium classes (10-30 samples): keep as is
# For smaller classes (<10 samples): oversample to 10
def get_sampling_strategy(class_counts): # Pass class_counts as an argument
    strategy = {}
    for class_name, count in class_counts.items():
        if count > 30:  # Undersample large classes
            strategy[class_name] = 30
        elif count < 10 and count >= 2:  # Oversample small classes
            strategy[class_name] = 10
        # Medium classes keep their original count
    return strategy

In [4]:


# Now perform the train-test split
print("\nPerforming train-test split...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF Vectorization
print("\nApplying TF-IDF vectorization...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit features to prevent overfitting
    min_df=2,           # Ignore terms that appear in less than 2 documents
    max_df=0.8,         # Ignore terms that appear in more than 80% of documents
    ngram_range=(1, 2)  # Use both unigrams and bigrams
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Apply combined undersampling and oversampling
print("\nApplying balanced sampling strategy...")
# Get updated class counts from y_train
train_class_counts = y_train.value_counts()
undersampler = RandomUnderSampler(
    sampling_strategy=get_sampling_strategy(train_class_counts), # Use updated counts
    random_state=42
)
oversampler = SMOTE(
    sampling_strategy='auto',  # Will oversample all minority classes to match the majority
    random_state=42,
    k_neighbors=5  # Lower k_neighbors for small classes
)

# Apply undersampling then oversampling in a pipeline
sampling_pipeline = Pipeline([
    ('undersample', undersampler),
    ('oversample', oversampler)
])

X_train_resampled, y_train_resampled = sampling_pipeline.fit_resample(X_train_tfidf, y_train)

# Show the class distribution after resampling
print("\nClass distribution after resampling:")
resampled_class_counts = pd.Series(y_train_resampled).value_counts()
print(resampled_class_counts)

# Define and train multiple models
models = {
    'Logistic Regression': LogisticRegression(
        C=1.0,
        max_iter=1000,
        multi_class='multinomial',
        solver='lbfgs',
        random_state=42
    ),
    'SVM': SVC(
        C=1.0,
        kernel='linear',
        probability=True,
        random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        random_state=42
    ),
    'Naive Bayes': MultinomialNB(alpha=0.1)
}

# Dictionary to store model results
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name} model...")
    model.fit(X_train_resampled, y_train_resampled)

    # Make predictions
    y_train_pred = model.predict(X_train_tfidf)
    y_test_pred = model.predict(X_test_tfidf)

    # Calculate metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"Training accuracy: {train_accuracy:.4f}")
    print(f"Test accuracy: {test_accuracy:.4f}")

    # Store results
    model_results[name] = {
        'model': model,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_predictions': y_train_pred,
        'test_predictions': y_test_pred,
        'test_report': classification_report(y_test, y_test_pred, output_dict=True)
    }

    # Print classification report for test set
    print("\nClassification Report (Test Set):")
    print(classification_report(y_test, y_test_pred))

    # Generate confusion matrix
    plt.figure(figsize=(12, 10))
    cm_test = confusion_matrix(y_test, y_test_pred)
    cm_test_normalized = cm_test.astype('float') / cm_test.sum(axis=1)[:, np.newaxis]
    sns.heatmap(cm_test_normalized, annot=True, fmt='.2f', cmap='Greens',
                xticklabels=model.classes_, yticklabels=model.classes_)
    plt.title(f'Normalized Confusion Matrix - {name} (Test Set)')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_{name.replace(" ", "_").lower()}.png')
    plt.close()

# Compare model performances
print("\nModel Performance Comparison:")
model_comparison = pd.DataFrame({
    name: {
        'Train Accuracy': results['train_accuracy'],
        'Test Accuracy': results['test_accuracy'],
        'Weighted F1-Score': results['test_report']['weighted avg']['f1-score']
    } for name, results in model_results.items()
}).T

print(model_comparison)

# Visualize model comparison
plt.figure(figsize=(12, 6))
model_comparison[['Train Accuracy', 'Test Accuracy', 'Weighted F1-Score']].plot(kind='bar')
plt.title('Model Performance Comparison')
plt.ylim(0, 1.0)
plt.xticks(rotation=45)
plt.tight_layout()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.savefig('model_comparison.png')
plt.show()

# Find the best model based on test accuracy
best_model_name = model_comparison['Test Accuracy'].idxmax()
best_model = model_results[best_model_name]['model']
print(f"\nBest performing model: {best_model_name}")

# Save the best model and vectorizer
print(f"\nSaving the best model ({best_model_name}) and vectorizer...")
joblib.dump(best_model, f'best_model_{best_model_name.lower().replace(" ", "_")}.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
print("Model and vectorizer saved successfully!")

# Feature importance analysis for the best model
print("\nTop features for each class (using the best model):")
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a DataFrame to store feature importance for visualization
feature_importance_data = []

# Different feature importance extraction based on model type
if best_model_name == 'Logistic Regression' or best_model_name == 'SVM':
    if hasattr(best_model, 'coef_'):
        for i, class_label in enumerate(best_model.classes_):
            # Get top features with highest coefficients for this class
            top_features_idx = np.argsort(best_model.coef_[i])[-15:]
            top_features = [feature_names[idx] for idx in top_features_idx]
            top_coefficients = [best_model.coef_[i][idx] for idx in top_features_idx]

            # Print top features for this class
            print(f"Class '{class_label}': {', '.join(top_features)}")

            # Add to the feature importance data
            for feature, coef in zip(top_features, top_coefficients):
                feature_importance_data.append({
                    'Class': class_label,
                    'Feature': feature,
                    'Importance': coef
                })
elif best_model_name == 'Random Forest':
    # For Random Forest, use feature importances
    importances = best_model.feature_importances_
    indices = np.argsort(importances)[-30:]  # Get top 30 features

    print("Global feature importance:")
    for idx in reversed(indices):
        print(f"{feature_names[idx]}: {importances[idx]:.4f}")

    # Random Forest doesn't have class-specific feature importance by default
    # But we can visualize top global features
    plt.figure(figsize=(10, 8))
    plt.barh(range(len(indices)), importances[indices], align='center')
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.title('Random Forest Feature Importance')
    plt.tight_layout()
    plt.savefig('rf_feature_importance.png')
    plt.show()

# If we have feature importance data, visualize it
if feature_importance_data and (best_model_name == 'Logistic Regression' or best_model_name == 'SVM'):
    # Visualize top features for the most prevalent classes
    top_classes = class_counts.index[:5]  # Top 5 classes
    plt.figure(figsize=(15, 10))
    for i, class_name in enumerate(top_classes):
        class_data = [item for item in feature_importance_data if item['Class'] == class_name]
        class_data = sorted(class_data, key=lambda x: x['Importance'], reverse=True)[:10]

        plt.subplot(2, 3, i+1)
        features = [item['Feature'] for item in class_data]
        importances = [item['Importance'] for item in class_data]

        y_pos = np.arange(len(features))
        plt.barh(y_pos, importances, align='center')
        plt.yticks(y_pos, features)
        plt.xlabel('Coefficient Value')
        plt.title(f"Top Features: {class_name}")

    plt.tight_layout()
    plt.savefig('top_features_best_model.png')
    plt.show()

# Function for prediction with new text using the best model
def predict_topic(text, model=best_model, vectorizer=tfidf_vectorizer):
    # Preprocess the text
    clean_text = preprocess_text(text)
    # Vectorize the text
    text_tfidf = vectorizer.transform([clean_text])
    # Predict the topic
    prediction = model.predict(text_tfidf)[0]
    # Get prediction probabilities
    proba = model.predict_proba(text_tfidf)[0] if hasattr(model, 'predict_proba') else None

    if proba is not None:
        # Find top 3 classes
        top3_idx = proba.argsort()[-3:][::-1]
        top3_classes = [model.classes_[i] for i in top3_idx]
        top3_probs = [proba[i] for i in top3_idx]

        return {
            'predicted_topic': prediction,
            'top3_topics': list(zip(top3_classes, top3_probs))
        }
    else:
        return {
            'predicted_topic': prediction,
            'top3_topics': []
        }

# Demo prediction with sample texts
sample_texts = [
    "Police arrested three individuals involved in corruption at the ministry",
    "The president announced new policies to improve governance",
    "Mob beats thief to death in local market"
]

print("\nSample predictions with the best model:")
for text in sample_texts:
    prediction_result = predict_topic(text)
    print(f"\nText: '{text}'")
    print(f"Predicted Topic: {prediction_result['predicted_topic']}")
    if prediction_result['top3_topics']:
        print("Top 3 predictions:")
        for topic, prob in prediction_result['top3_topics']:
            print(f"  {topic}: {prob:.4f}")


Performing train-test split...

Applying TF-IDF vectorization...

Applying balanced sampling strategy...


ValueError: With under-sampling methods, the number of samples in a class should be less or equal to the original number of samples. Originally, there is 9 samples and 10 samples are asked.