In [4]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# --- Configuration ---
# Path of the Excel workbook that the ingestion step passes to pd.read_excel.
DATA_PATH = 'AJGT.xlsx'
# Path of the CSV file the export step writes the frame (with predictions) to.
OUTPUT_FILE = 'Saudi_Tweets_Analyzed.csv'

# Punctuation outside ASCII that string.punctuation does not cover.
_EXTRA_PUNCTUATION = (
    '\u060C'  # Arabic comma
    '\u061B'  # Arabic semicolon
    '\u061F'  # Arabic question mark
    '\u066A'  # Arabic percent sign
    '\u066B'  # Arabic decimal separator
    '\u066C'  # Arabic thousands separator
    '\u066D'  # Arabic five-pointed star
    '\u06D4'  # Arabic full stop
    '\u00AB\u00BB'  # guillemets
    '\u2018\u2019\u201C\u201D'  # curly single/double quotes
    '\u2026'  # horizontal ellipsis
)
# Built once at import time; maps every punctuation code point to "delete".
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + _EXTRA_PUNCTUATION)


def preprocess_text(text):
    """
    Clean and normalize Arabic text for NLP tasks.

    Steps, in order: remove diacritics (U+0617-U+061A, U+064B-U+0652),
    remove tatweel (U+0640), fold the hamza/madda forms of alef to bare
    alef, remove ASCII letters and digits, remove ASCII and Arabic
    punctuation, then strip surrounding whitespace.

    Removed characters are deleted, not replaced by a space, so interior
    whitespace is left exactly as it was in the input.

    Args:
        text: Value to clean. Non-string values are converted with str().

    Returns:
        The cleaned string; may be empty.
    """
    text = str(text)
    # Remove Diacritics
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)
    # Remove Tatweel
    text = re.sub(r'\u0640', '', text)
    # Normalize Alef
    text = re.sub(r'[أإآ]', 'ا', text)
    # Remove ASCII letters and digits (other non-Arabic scripts are kept)
    text = re.sub(r'[a-zA-Z0-9]', '', text)
    # Remove punctuation, including the Arabic marks that the ASCII-only
    # string.punctuation would leave behind.
    text = text.translate(_PUNCTUATION_TABLE)
    return text.strip()

# --- 1. Data Ingestion & Schema validation ---
# Load the workbook, then pick the text column and the label column by
# inspecting column contents instead of relying on fixed header names.
print(f"[INFO] Loading dataset from source: {DATA_PATH}...")
try:
    # Only the read can raise FileNotFoundError, so only it is guarded.
    df = pd.read_excel(DATA_PATH)
except FileNotFoundError:
    print(f"[ERROR] Source file '{DATA_PATH}' not found.")
    # exit() is an interactive-shell helper; SystemExit works everywhere and
    # the non-zero status marks the run as failed.
    raise SystemExit(1) from None

if df.shape[1] < 2:
    # Both a text and a label column are required; fail with a clear message
    # rather than a length-mismatch error when renaming columns below.
    print(f"[ERROR] Expected at least 2 columns in '{DATA_PATH}', found {df.shape[1]}.")
    raise SystemExit(1)

# Auto-detect Text vs Sentiment columns based on content
text_col, sentiment_col = None, None

for col in df.columns:
    # Check for Arabic content dominance: more than 100 characters from the
    # Arabic Unicode block across the whole column marks it as the text column.
    sample = df[col].astype(str).str.cat(sep=' ')
    if len(re.findall(r'[\u0600-\u06FF]', sample)) > 100:
        text_col = col
    # Check for categorical labels (low cardinality)
    elif df[col].nunique() < 10:
        sentiment_col = col
    # NOTE: when several columns match, the last match wins, and a label
    # column written in Arabic can be taken for the text column.

# Compare against None explicitly: a valid column label such as 0 is falsy.
if text_col is not None and sentiment_col is not None:
    print(f"[INFO] Auto-detected schema: Text='{text_col}', Target='{sentiment_col}'")
    # .copy() detaches the subset so later column assignments do not raise
    # SettingWithCopyWarning.
    df = df[[sentiment_col, text_col]].copy()
else:
    # Fallback to last two columns, assumed to be ordered (label, text).
    print("[WARN] Schema detection failed. Defaulting to last 2 columns.")
    df = df.iloc[:, -2:].copy()
df.columns = ['Sentiment', 'Text']

print(f"[INFO] Data loaded successfully. Dimensions: {df.shape}")

# --- 2. Preprocessing Pipeline ---
# Derive a cleaned copy of every text and drop rows that end up empty.
print("[INFO] Executing text cleaning pipeline...")
df['Clean_Text'] = df['Text'].apply(preprocess_text)
# Remove empty rows after cleaning. .copy() makes the filtered frame an
# independent object, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[df['Clean_Text'].str.strip() != ''].copy()

# --- 3. Model Training ---
# TF-IDF features (vocabulary capped at 5000 terms) feed a multinomial
# Naive Bayes classifier; both live in one pipeline so the vectorizer is
# fitted on the training split only.
print("[INFO] Initializing Naive Bayes Classifier...")

tfidf_step = ('tfidf', TfidfVectorizer(max_features=5000))
classifier_step = ('classifier', MultinomialNB())
pipeline = Pipeline([tfidf_step, classifier_step])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(
    df['Clean_Text'], df['Sentiment'], **split_options
)

pipeline.fit(X_train, y_train)

# --- 4. Evaluation ---
# Score the fitted pipeline on the held-out split and print a summary.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# One print call emits the accuracy line framed by two 30-dash rules.
print("-" * 30, f"✅ Model Accuracy: {accuracy:.4f}", "-" * 30, sep="\n")
print("[INFO] Performance Metrics:")
report = classification_report(y_test, y_pred)
print(report)

# --- 5. Deployment Export ---
# Predict a label for every remaining row (training and test rows alike)
# and write the whole frame to disk.
print(f"[INFO] Exporting inference results to {OUTPUT_FILE}...")
all_predictions = pipeline.predict(df['Clean_Text'])
df['Predicted_Sentiment'] = all_predictions
# utf-8-sig prepends a byte-order mark to the file (presumably so that
# spreadsheet tools recognise the Arabic text as UTF-8).
df.to_csv(OUTPUT_FILE, encoding='utf-8-sig', index=False)
print("[INFO] Process completed successfully.")

[INFO] Loading dataset from source: AJGT.xlsx...
[INFO] Auto-detected schema: Text='Feed', Target='Sentiment'
[INFO] Data loaded successfully. Dimensions: (1800, 2)
[INFO] Executing text cleaning pipeline...
[INFO] Initializing Naive Bayes Classifier...
------------------------------
✅ Model Accuracy: 0.8556
------------------------------
[INFO] Performance Metrics:
              precision    recall  f1-score   support

    Negative       0.86      0.83      0.84       167
    Positive       0.85      0.88      0.87       193

    accuracy                           0.86       360
   macro avg       0.86      0.85      0.85       360
weighted avg       0.86      0.86      0.86       360

[INFO] Exporting inference results to Saudi_Tweets_Analyzed.csv...
[INFO] Process completed successfully.
