In [1]:
import os
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# ---------- 1. Download the Devanagari font ----------

font_url = "https://github.com/googlefonts/noto-fonts/raw/main/hinted/ttf/NotoSansDevanagari/NotoSansDevanagari-Regular.ttf"
font_path = "NotoSansDevanagari-Regular.ttf"

if not os.path.exists(font_path):
    print("Downloading Devanagari font...")
    r = requests.get(font_url)
    with open(font_path, "wb") as f:
        f.write(r.content)
    print("Font downloaded!")

# ---------- 2. Load the dataset ----------

base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
data_dir = os.path.join(base_dir, 'output', 'combined_dataset')
train_path = os.path.join(data_dir, 'train_strict.csv')

train_df = pd.read_csv(train_path)

# ---------- 3. Preprocessing ----------

train_df['text'] = train_df['text'].str.replace(r'[^\w\s]', '', regex=True)

# ---------- 4. Feature extraction ----------

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(train_df['text'])

# ---------- 5. Encode labels ----------

le = LabelEncoder()
y = le.fit_transform(train_df['label'])

# ---------- 6. Stratified K-Fold ----------

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
all_cm = np.zeros((len(le.classes_), len(le.classes_)), dtype=int)
all_true_positives = np.zeros(len(le.classes_), dtype=int)
all_actual_counts = np.zeros(len(le.classes_), dtype=int)

fold_no = 1
for train_index, val_index in skf.split(X, y):
    print(f"\n--- Fold {fold_no} ---")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    print(f"Accuracy: {acc}")
    print("Classification Report:")
    print(classification_report(y_val, y_pred, target_names=le.classes_))

    accuracies.append(acc)

    # Confusion Matrix
    cm = confusion_matrix(y_val, y_pred)
    all_cm += cm

    # True Positives
    for i in range(len(y_val)):
        all_actual_counts[y_val[i]] += 1
        if y_val[i] == y_pred[i]:
            all_true_positives[y_val[i]] += 1

    fold_no += 1

# ---------- 7. Overall Results ----------

print("\n=== Overall Accuracy ===")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Std Dev: {np.std(accuracies):.4f}")

print("\n=== Aggregated Classification Report ===")
print(classification_report(all_actual_counts, all_true_positives, target_names=le.classes_))

# ---------- 8. Confusion Matrix Plot ----------

plt.figure(figsize=(8, 6))
sns.heatmap(all_cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Aggregated Confusion Matrix')
plt.show()

# ---------- 9. Actual vs True Positive Plot ----------

x = np.arange(len(le.classes_))
width = 0.35

plt.figure(figsize=(8, 6))
plt.bar(x - width/2, all_actual_counts, width, label='Actual Positives')
plt.bar(x + width/2, all_true_positives, width, label='True Positives')
plt.xticks(x, le.classes_)
plt.ylabel('Count')
plt.title('Actual vs True Positives (Aggregated)')
plt.legend()
plt.show()

# ---------- 10. Word Cloud using Devanagari font ----------

all_text = ' '.join(train_df['text'])
wordcloud = WordCloud(width=800, height=400, background_color='white',
                      font_path=font_path).generate(all_text)

plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Training Data')
plt.show()



--- Fold 1 ---
Accuracy: 0.6297619047619047
Classification Report:
              precision    recall  f1-score   support

    negative       0.63      0.63      0.63      1400
     neutral       0.55      0.62      0.58      1400
    positive       0.72      0.64      0.68      1400

    accuracy                           0.63      4200
   macro avg       0.64      0.63      0.63      4200
weighted avg       0.64      0.63      0.63      4200


--- Fold 2 ---
Accuracy: 0.6173809523809524
Classification Report:
              precision    recall  f1-score   support

    negative       0.63      0.60      0.62      1400
     neutral       0.54      0.60      0.57      1400
    positive       0.69      0.65      0.67      1400

    accuracy                           0.62      4200
   macro avg       0.62      0.62      0.62      4200
weighted avg       0.62      0.62      0.62      4200


--- Fold 3 ---
Accuracy: 0.6323809523809524
Classification Report:
              precision    recall 

ValueError: Number of classes, 4, does not match size of target_names, 3. Try specifying the labels parameter