# SvaraAI Reply Classification Pipeline

This notebook demonstrates a complete ML pipeline for classifying email replies from prospects into three categories:
- **Positive**: Interested in meeting/demo
- **Negative**: Not interested / rejection
- **Neutral**: Non-committal or irrelevant

## Overview
1. Data exploration and preprocessing
2. Baseline model training (Logistic Regression, Random Forest, LightGBM)
3. Transformer model fine-tuning (DistilBERT) *(not implemented in the sections below; only the baseline models are trained here)*
4. Model comparison and selection
5. Production recommendations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
# NOTE(review): 'seaborn-v0_8' appears to be a version-specific matplotlib
# style name — confirm it is available in the installed matplotlib release.
plt.style.use('seaborn-v0_8')
# Make seaborn's 'husl' palette the default colour cycle for later plots.
sns.set_palette('husl')

## 1. Data Exploration and Preprocessing

In [None]:
# Load dataset
df = pd.read_csv('reply_classification_dataset.csv')

# Fail fast with a readable message if the schema is not what the rest of the
# notebook expects: later cells index df['text'] and df['label'] directly.
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise KeyError(f'Dataset is missing required column(s): {sorted(missing_columns)}')

print(f'Dataset shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')
df.head()

In [None]:
# Check for missing values (per-column count of null cells)
print('Missing values:')
print(df.isnull().sum())

# Check label distribution on the raw, un-normalised labels.
# The leading "\n" is written as an escape so the literal stays on one line;
# a physical line break inside a single-quoted string is a syntax error.
print('\nLabel distribution (raw):')
print(df['label'].value_counts())

In [None]:
# Visualize label distribution: raw labels on the left, normalised on the right
fig, (ax_raw, ax_clean) = plt.subplots(1, 2, figsize=(12, 5))

raw_counts = df['label'].value_counts()
raw_counts.plot(kind='bar', ax=ax_raw, rot=45)
ax_raw.set_title('Raw Label Distribution')

# Clean labels (lowercase + trim surrounding whitespace) and visualize again
df['label_clean'] = df['label'].str.lower().str.strip()
clean_counts = df['label_clean'].value_counts()

clean_counts.plot(kind='bar', ax=ax_clean, rot=45)
ax_clean.set_title('Cleaned Label Distribution')

fig.tight_layout()
plt.show()

print('Cleaned label distribution:')
print(clean_counts)

In [None]:
# Text preprocessing function
def preprocess_text(text):
    """Normalise a raw reply into lowercase text with single spaces.

    Steps, in order:
      1. Stringify and lowercase the input.
      2. Drop every character that is not a letter, digit, whitespace or
         one of ``. , ! ?``.
      3. Collapse runs of whitespace into a single space.
      4. Trim leading/trailing whitespace.

    Special characters are removed *before* whitespace is collapsed; doing it
    the other way round leaves double spaces behind (e.g. ``"a - b"`` would
    become ``"a  b"``).

    Args:
        text: The raw text. Non-string values are passed through ``str()``,
            so e.g. ``None`` becomes ``'none'`` — callers that need to treat
            missing values specially must filter them out beforehand.

    Returns:
        str: The cleaned text (possibly empty).
    """
    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    # Collapse whitespace last so gaps opened by the removal above are merged
    text = re.sub(r'\s+', ' ', text)
    # Strip whitespace
    return text.strip()

# Drop rows with a missing text or label BEFORE cleaning. preprocess_text()
# passes its input through str(), so a NaN text would otherwise turn into the
# literal string 'nan' and survive the dropna on 'text_clean' below.
df = df.dropna(subset=['text', 'label']).copy()

# Apply preprocessing
df['text_clean'] = df['text'].apply(preprocess_text)
df['label_clean'] = df['label'].str.lower().str.strip()

# Remove any remaining missing values
df = df.dropna(subset=['text_clean', 'label_clean'])

print(f'Final dataset shape: {df.shape}')
print('\nExample of cleaned text:')
# head(3) instead of range(3): does not raise when fewer than 3 rows remain
for _, row in df.head(3).iterrows():
    print(f'Original: {row["text"]}')
    print(f'Cleaned:  {row["text_clean"]}')
    print(f'Label:    {row["label_clean"]}')
    print('-' * 50)

In [None]:
# Analyze text characteristics: character and word counts of the cleaned text
cleaned = df['text_clean']
df['text_length'] = cleaned.str.len()
df['word_count'] = cleaned.str.split().str.len()

fig, (ax_chars, ax_words, ax_by_label) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: distribution of character counts
df['text_length'].hist(bins=20, ax=ax_chars)
ax_chars.set_title('Text Length Distribution')
ax_chars.set_xlabel('Character Count')
ax_chars.set_ylabel('Frequency')

# Panel 2: distribution of word counts
df['word_count'].hist(bins=20, ax=ax_words)
ax_words.set_title('Word Count Distribution')
ax_words.set_xlabel('Word Count')
ax_words.set_ylabel('Frequency')

# Panel 3: mean character count per cleaned label
mean_length_by_label = df.groupby('label_clean')['text_length'].mean()
mean_length_by_label.plot(kind='bar', ax=ax_by_label, rot=45)
ax_by_label.set_title('Average Text Length by Label')
ax_by_label.set_xlabel('Label')
ax_by_label.set_ylabel('Average Character Count')

fig.tight_layout()
plt.show()

print('Text statistics by label:')
print(df.groupby('label_clean')[['text_length', 'word_count']].describe())

## 2. Feature Engineering and Model Preparation

In [None]:
# Encode labels: map each cleaned label string to an integer id
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label_clean'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print('Label mapping:')
for label, encoded in label_mapping.items():
    print(f'{label}: {encoded}')

# Train-test split (80/20), stratified on the encoded label so both splits
# keep the same class proportions. The string labels are split alongside so
# they stay aligned with the encoded targets.
X = df['text_clean']
y = df['label_encoded']
y_labels = df['label_clean']

X_train, X_test, y_train, y_test, y_labels_train, y_labels_test = train_test_split(
    X, y, y_labels, test_size=0.2, random_state=42, stratify=y
)

# "\n" is written as an escape: a physical line break inside a single-quoted
# (f-)string literal is a syntax error.
print(f'\nTraining set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')
print(f'\nTraining label distribution:')
print(pd.Series(y_labels_train).value_counts())

In [None]:
# Create TF-IDF features
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),  # Include bigrams
    min_df=2,  # Ignore terms that appear in less than 2 documents
    max_df=0.8  # Ignore terms that appear in more than 80% of documents
)

# Fit the vocabulary on the training split only, then reuse it for the test
# split so no information from the test set leaks into the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f'TF-IDF feature matrix shape: {X_train_tfidf.shape}')
print(f'Vocabulary size: {len(vectorizer.vocabulary_)}')

# Show top features, ranked by mean TF-IDF weight over the training set.
# (Slicing feature_names[:20] directly would only show the first 20 terms in
# vocabulary order, not the most heavily weighted ones.)
feature_names = vectorizer.get_feature_names_out()
mean_tfidf = np.asarray(X_train_tfidf.mean(axis=0)).ravel()
top_feature_idx = np.argsort(mean_tfidf)[::-1][:20]
print(f'\nTop 20 features: {feature_names[top_feature_idx]}')

## 3. Baseline Model Training

In [None]:
# Candidate baseline classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, verbose=-1)
}

# Fit each model on the TF-IDF training matrix and score it on the test split.
# `results` keeps the metrics and fitted estimator; `predictions` keeps the
# raw test-set predictions for later analysis.
results, predictions = {}, {}

for name, model in models.items():
    print(f'Training {name}...')
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    predictions[name] = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[name] = dict(accuracy=accuracy, f1_score=f1, model=model)

    print(f'  Accuracy: {accuracy:.4f}')
    print(f'  F1 Score: {f1:.4f}')
    print()

In [None]:
# Visualize model comparison
# `results` also stores the fitted estimator under 'model', which makes every
# column of the transposed frame object-dtype. Cast the metric columns back to
# float so numeric operations apply to them (DataFrame.round leaves
# non-numeric columns untouched, so the summary below would not be rounded).
results_df = pd.DataFrame(results).T.astype({'accuracy': float, 'f1_score': float})

fig, (ax_accuracy, ax_f1) = plt.subplots(1, 2, figsize=(12, 5))

results_df['accuracy'].plot(kind='bar', ax=ax_accuracy, rot=45)
ax_accuracy.set_title('Model Accuracy Comparison')
ax_accuracy.set_ylabel('Accuracy')

results_df['f1_score'].plot(kind='bar', ax=ax_f1, rot=45)
ax_f1.set_title('Model F1 Score Comparison')
ax_f1.set_ylabel('F1 Score')

fig.tight_layout()
plt.show()

print('Model Performance Summary:')
print(results_df[['accuracy', 'f1_score']].round(4))

In [None]:
# Detailed analysis of best traditional model (highest weighted F1)
best_traditional = max(results.items(), key=lambda x: x[1]['f1_score'])
best_name, best_metrics = best_traditional
best_predictions = predictions[best_name]

print(f'Best Traditional Model: {best_name}')
print(f'F1 Score: {best_metrics["f1_score"]:.4f}')
print(f'Accuracy: {best_metrics["accuracy"]:.4f}')

# Pin the label order explicitly so that the report rows and the confusion
# matrix rows/columns always line up with label_encoder.classes_, even if a
# class happens to be absent from the test split or from the predictions.
encoded_labels = np.arange(len(label_encoder.classes_))

print('\nDetailed Classification Report:')
print(classification_report(
    y_test, best_predictions,
    labels=encoded_labels,
    target_names=label_encoder.classes_,
))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, best_predictions, labels=encoded_labels)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_,
            ax=ax)
ax.set_title(f'Confusion Matrix - {best_name}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

## 4. Feature Importance Analysis

In [None]:
# Feature importance for Logistic Regression: for each class, plot the terms
# with the most negative and most positive coefficients.
if 'Logistic Regression' in results:
    lr_model = results['Logistic Regression']['model']
    feature_names = vectorizer.get_feature_names_out()

    # coef_ holds one row per class, except for a binary problem where
    # scikit-learn stores a single row describing the second class. Derive the
    # class names from the model so rows and titles always match.
    coef_matrix = lr_model.coef_
    coef_class_ids = lr_model.classes_ if coef_matrix.shape[0] > 1 else lr_model.classes_[1:]
    coef_class_names = label_encoder.inverse_transform(coef_class_ids)

    # Never take more terms per side than half the vocabulary, so the
    # "most negative" and "most positive" sets do not overlap.
    n_top = max(1, min(10, len(feature_names) // 2))

    # Size the grid from the number of classes instead of a fixed 2x2 layout
    # (which fails as soon as there are more than four classes).
    n_cols = 2
    n_rows = (len(coef_class_names) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)

    for ax, class_name, coeffs in zip(axes.flat, coef_class_names, coef_matrix):
        order = np.argsort(coeffs)  # ascending; sorted once and sliced twice
        top_features = np.concatenate([order[:n_top], order[-n_top:]])
        top_coeffs = coeffs[top_features]
        top_feature_names = [feature_names[idx] for idx in top_features]

        colors = ['red' if c < 0 else 'blue' for c in top_coeffs]
        ax.barh(range(len(top_coeffs)), top_coeffs, color=colors)
        ax.set_yticks(range(len(top_coeffs)))
        ax.set_yticklabels(top_feature_names)
        ax.set_title(f'Top Features for {class_name.title()} Class')
        ax.set_xlabel('Coefficient Value')

    # Hide grid cells that have no class to show
    for unused_ax in axes.flat[len(coef_class_names):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Error Analysis

In [None]:
# Error analysis - find misclassified examples
misclassified_mask = y_test != best_predictions
misclassified_texts = X_test[misclassified_mask]
misclassified_true = y_test[misclassified_mask]
# best_predictions is positional (no index), so index it with the raw
# boolean array rather than the index-carrying mask.
misclassified_pred = best_predictions[misclassified_mask.to_numpy()]

# Count once instead of re-summing the mask for every use
n_misclassified = int(misclassified_mask.sum())
print(f'Number of misclassified examples: {n_misclassified}')
print(f'Error rate: {n_misclassified / len(y_test):.2%}')

# Show some examples (at most 10)
print('\nSample Misclassified Examples:')
print('=' * 80)

sample = zip(
    misclassified_texts.iloc[:10],
    misclassified_true.iloc[:10],
    misclassified_pred[:10],
)
for text, true_id, pred_id in sample:
    # Map both encoded ids back to their label strings in one call
    true_label, pred_label = label_encoder.inverse_transform([true_id, pred_id])

    print(f'Text: {text}')
    print(f'True: {true_label} | Predicted: {pred_label}')
    print('-' * 80)

## 6. Model Recommendations for Production

Based on the analysis above, here are the key findings and recommendations:

In [None]:
# Summary and recommendations: recap each model's metrics, name the winner,
# then print the fixed list of production recommendations.
banner = '=' * 60
print(banner)
print('MODEL SELECTION SUMMARY')
print(banner)

for name, metrics in results.items():
    print(f'{name}:')
    print(f'  Accuracy: {metrics["accuracy"]:.4f}')
    print(f'  F1 Score: {metrics["f1_score"]:.4f}')
    print()

print(f'Best Model: {best_name}')
print(f'Best F1 Score: {best_metrics["f1_score"]:.4f}')

print('PRODUCTION RECOMMENDATIONS:')
recommendations = (
    '1. Model Choice: Start with the best traditional model for faster inference',
    '2. Feature Engineering: TF-IDF with bigrams works well for this task',
    '3. Monitoring: Track prediction confidence and retrain when performance drops',
    '4. Data Quality: Focus on consistent labeling and text preprocessing',
    '5. Evaluation: Use F1 score as primary metric due to potential class imbalance',
)
for recommendation in recommendations:
    print(recommendation)