In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

 # Data Loading and Initial Exploration 

In [None]:

# Read the labelled training complaints; the path is relative to the working directory.
df = pd.read_csv("train_complaints.csv")

# Structural overview: dimensions, column names, a preview of the first rows,
# and the number of missing values in each column.
print("Data shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nFirst few rows:", df.head(), sep="\n")
print("\nMissing values per column:", df.isna().sum(), sep="\n")

# Text Preprocessing and Cleaning

In [None]:
def clean_text(text):
    """Normalise one raw complaint into lowercase, lightly punctuated plain text.

    Steps, in order: strip HTML markup, lowercase, blank out URLs and e-mail
    addresses, replace every character that is not a letter, whitespace or one
    of ``. , ! ?`` with a space, then collapse whitespace runs.

    Args:
        text: The raw complaint. Anything that is not a ``str`` (e.g. NaN or
            None from a missing cell) is treated as empty.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # Missing or non-text cells contribute no tokens.
    if not isinstance(text, str):
        return ""

    # Drop tags first; the separator keeps words apart where tags used to be.
    cleaned = BeautifulSoup(str(text), "html.parser").get_text(separator=" ").lower()

    # Applied in sequence; each match is replaced by a single space.
    substitutions = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'\S+@\S+', 0),                             # e-mail addresses
        (r'[^a-zA-Z\s\.\,\!\?]', 0),                 # digits, symbols, other punctuation
        (r'\s+', 0),                                 # whitespace runs
    )
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned, flags=flags)

    return cleaned.strip()

# Normalise every raw complaint once; the modelling cells take their text
# input from the `cleaned_text` column.
df['cleaned_text'] = df['complaint_text'].apply(clean_text)

print("Text cleaning completed!")
# `.iloc[0]` is positional, so the preview works whatever index labels `df`
# carries; a plain `[0]` is a label lookup and raises KeyError when label 0
# is absent (e.g. after rows have been filtered).
sample_text = df['cleaned_text'].iloc[0]
print(f"Sample cleaned text:\n{sample_text[:200]}...")

# Severity Distribution Analysis

In [None]:

# Work from one handle on the target column and one row count.
severity_col = df['severity']
n_rows = len(df)

print("Severity Distribution:")
severity_counts = severity_col.value_counts().sort_index()
print(severity_counts)
print(f"\nTotal samples: {n_rows}")
print(f"Number of severity levels: {severity_col.nunique()}")

# Share of rows per severity level, in ascending level order.
print("\nClass Balance (percentage):")
for level in sorted(severity_col.unique()):
    level_share = (severity_col == level).sum() / n_rows * 100
    print(f"Severity {level}: {level_share:.2f}%")

# Train-Test Split with Stratification

In [None]:

# Input is the cleaned complaint text; the target is the severity label.
X, y = df['cleaned_text'], df['severity']

# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# alike in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print("\nTraining set class distribution:", y_train.value_counts().sort_index(), sep="\n")

# TF-IDF Feature Extraction

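The vectorizer below uses the following settings:

- `max_features=5000`: keep at most the 5,000 most frequent terms.
- `ngram_range=(1, 2)`: use both unigrams and bigrams.
- `min_df=5`: drop terms that appear in fewer than 5 documents.
- `max_df=0.85`: drop overly common terms (those that appear in more than 85% of documents).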

In [None]:

# Unigram + bigram TF-IDF, capped at 5000 terms; terms seen in fewer than
# 5 documents or in more than 85% of documents are discarded.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=5, max_df=0.85)

print("Creating TF-IDF features...")
# The vocabulary and IDF weights are learned on the training split only and
# then reused, unchanged, to encode the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF shape - Train: {X_train_tfidf.shape}, Test: {X_test_tfidf.shape}")
n_terms = tfidf.get_feature_names_out().shape[0]
print(f"Number of features: {n_terms}")

## Baseline Model Training (Logistic Regression)

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, r2_score

# Baseline severity classifier trained on the TF-IDF features alone.
model = LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1, random_state=42)

print("Training Logistic Regression model...")
model.fit(X_train_tfidf, y_train)
print("Training completed!")

# Hard labels drive the reports below; class probabilities are computed as
# well but are not used in this cell.
y_pred, y_pred_proba = model.predict(X_test_tfidf), model.predict_proba(X_test_tfidf)

print("\n Model Evaluation")
print(classification_report(y_test, y_pred, zero_division=0))

# R² compares predicted and true labels as numbers, so it is only meaningful
# when the severity labels are numeric.
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

## Advanced Feature Engineering 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import numpy as np

def create_additional_features(texts):
    """Build two length-based features for each text.

    Args:
        texts: Sized iterable of strings (list, pandas Series, ...).

    Returns:
        Float array of shape ``(len(texts), 2)``: column 0 is the character
        count, column 1 the number of whitespace-separated tokens.
    """
    length_rows = [(len(doc), len(doc.split())) for doc in texts]
    # The explicit reshape keeps the (0, 2) shape when `texts` is empty.
    return np.array(length_rows, dtype=float).reshape(len(length_rows), 2)

# Raw length features (character count, token count) for both splits.
X_train_extra = create_additional_features(X_train)
X_test_extra = create_additional_features(X_test)

# Counts are typically far larger than TF-IDF weights (which lie in [0, 1]).
# Standardising them keeps the logistic-regression fit well conditioned; its
# convergence warnings would otherwise go unnoticed because warnings are
# globally silenced in this notebook. The scaler is fit on the training split
# only so no test-set statistics leak into training. Tree-based models are
# unaffected by this monotone rescaling.
extra_scaler = StandardScaler()
X_train_extra_scaled = extra_scaler.fit_transform(X_train_extra)
X_test_extra_scaled = extra_scaler.transform(X_test_extra)

from scipy.sparse import hstack
# Append the two scaled columns to the sparse TF-IDF matrix. CSR is requested
# explicitly so each estimator does not have to convert from COO itself.
X_train_combined = hstack([X_train_tfidf, X_train_extra_scaled], format="csr")
X_test_combined = hstack([X_test_tfidf, X_test_extra_scaled], format="csr")

print(f"Combined features shape: {X_train_combined.shape}")


# Three severity classifiers, all trained on the same combined feature matrix
# (TF-IDF plus length features).
rf_model = RandomForestClassifier(
    n_estimators=150, max_depth=20, class_weight='balanced_subsample', n_jobs=-1, random_state=42
)
gb_model = GradientBoostingClassifier(
    n_estimators=150, max_depth=10, learning_rate=0.1, random_state=42
)
log_model = LogisticRegression(
    max_iter=1000, class_weight='balanced', n_jobs=-1, random_state=42
)

# Fit in a fixed order, announcing each model before its (possibly slow) fit.
training_plan = (
    ("\nTraining Random Forest model...", rf_model),
    ("Training Gradient Boosting model...", gb_model),
    ("Training Logistic Regression model...", log_model),
)
for announcement, estimator in training_plan:
    print(announcement)
    estimator.fit(X_train_combined, y_train)

# Ensemble Model Development

In [None]:
from sklearn.metrics import classification_report, r2_score
import numpy as np


def _report_severity_model(label, predictions, lead=""):
    """Print the classification report and R² for one set of test predictions.

    Args:
        label: Display name used in the printed headings.
        predictions: Predicted severity labels aligned with ``y_test``.
        lead: Optional prefix (e.g. a newline) printed before the heading.

    Returns:
        The R² score of ``predictions`` against ``y_test``.
    """
    print(f"{lead}{label} Report:")
    print(classification_report(y_test, predictions, zero_division=0))
    score = r2_score(y_test, predictions)
    print(f"{label} R²: {score:.4f}")
    return score


print("Making Predictions")

rf_pred = rf_model.predict(X_test_combined)
rf_r2 = _report_severity_model("Random Forest", rf_pred)

gb_pred = gb_model.predict(X_test_combined)
gb_r2 = _report_severity_model("Gradient Boosting", gb_pred, lead="\n")

log_pred = log_model.predict(X_test_combined)
log_r2 = _report_severity_model("Logistic Regression", log_pred, lead="\n")


print("\n=== Creating Weighted Ensemble ===")
# Majority vote across the three models. When all three disagree there is no
# majority; the previous `np.bincount(...).argmax()` then silently returned the
# smallest label (and only worked for non-negative integer labels). Ties are
# now resolved by the class with the highest probability averaged over the
# three models.
ensemble_members = (rf_model, gb_model, log_model)
for member in ensemble_members[1:]:
    # Averaging probability columns is only valid if every model orders its
    # classes identically (expected, since all were fit on the same y_train).
    assert np.array_equal(member.classes_, rf_model.classes_), \
        "ensemble members disagree on class ordering"
mean_proba = np.mean(
    [member.predict_proba(X_test_combined) for member in ensemble_members], axis=0
)
tie_break_pred = rf_model.classes_[mean_proba.argmax(axis=1)]

rf_has_majority = (rf_pred == gb_pred) | (rf_pred == log_pred)
ensemble_pred = np.where(
    rf_has_majority,
    rf_pred,
    # Here rf is the odd one out; gb and log either agree or it is a 3-way tie.
    np.where(gb_pred == log_pred, gb_pred, tie_break_pred),
)

print("Ensemble Report:")
print(classification_report(y_test, ensemble_pred, zero_division=0))
ensemble_r2 = r2_score(y_test, ensemble_pred)
print(f"Ensemble R²: {ensemble_r2:.4f}")

# Rank the candidates by R² on the held-out split.
models = {
    'Random Forest': rf_r2,
    'Gradient Boosting': gb_r2,
    'Logistic Regression': log_r2,
    'Ensemble': ensemble_r2
}
best_model_name = max(models, key=models.get)
print(f"\nBest model for Severity: {best_model_name} (R²: {models[best_model_name]:.4f})")

# Primary and Secondary Category Models: Training and Evaluation

In [None]:
print("Training Models for Primary Category")

# Same cleaned text as the severity task (X), paired with the primary-category label.
y_primary = df['primary_category']
X_train_primary, X_test_primary, y_train_primary, y_test_primary = train_test_split(
    X, y_primary, stratify=y_primary, test_size=0.2, random_state=42
)

# Dedicated vectorizer for this task, fit on its own training split.
tfidf_primary = TfidfVectorizer(ngram_range=(1, 2), max_features=4000, min_df=5, max_df=0.9)
X_train_primary_tfidf = tfidf_primary.fit_transform(X_train_primary)
X_test_primary_tfidf = tfidf_primary.transform(X_test_primary)

primary_model = RandomForestClassifier(
    n_estimators=150, max_depth=20, class_weight='balanced_subsample', n_jobs=-1, random_state=42
)
primary_model.fit(X_train_primary_tfidf, y_train_primary)
primary_pred = primary_model.predict(X_test_primary_tfidf)

primary_accuracy = accuracy_score(y_test_primary, primary_pred)
print("Primary Category Accuracy:", primary_accuracy)
print("\nPrimary Category Classification Report:")
print(classification_report(y_test_primary, primary_pred, zero_division=0))

print("\n=== Training Models for Secondary Category ===")
y_secondary = df['secondary_category']

# NOTE(review): a stratified split raises ValueError if any secondary category
# has fewer than two rows — confirm the data satisfies this.
X_train_secondary, X_test_secondary, y_train_secondary, y_test_secondary = train_test_split(
    X, y_secondary, test_size=0.2, random_state=42, stratify=y_secondary
)

# Dedicated vectorizer for this task, fit on its own training split.
tfidf_secondary = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.85
)

X_train_secondary_tfidf = tfidf_secondary.fit_transform(X_train_secondary)
X_test_secondary_tfidf = tfidf_secondary.transform(X_test_secondary)

secondary_model = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=10,
    learning_rate=0.1,
    random_state=42
)

secondary_model.fit(X_train_secondary_tfidf, y_train_secondary)
secondary_pred = secondary_model.predict(X_test_secondary_tfidf)

print("Secondary Category Accuracy:", accuracy_score(y_test_secondary, secondary_pred))
print("\nSecondary Category Classification Report (top 10 classes):")

unique_classes = np.unique(y_test_secondary)
if len(unique_classes) > 10:
    # The ten most frequent true classes in the held-out split.
    top_classes = y_test_secondary.value_counts().head(10).index
    mask = y_test_secondary.isin(top_classes)
    # `secondary_pred` is a NumPy array, so index it with a positional boolean
    # array rather than a pandas Series carrying the shuffled split index.
    positional_mask = mask.to_numpy()
    # `labels=` restricts the report rows to the top classes; without it, any
    # other class that appears among the predictions is listed too, so the
    # report would not match its "top 10" heading. With a label subset the
    # summary shows a micro average instead of a plain accuracy row.
    print(classification_report(
        y_test_secondary[positional_mask],
        secondary_pred[positional_mask],
        labels=list(top_classes),
        zero_division=0,
    ))
else:
    print(classification_report(y_test_secondary, secondary_pred, zero_division=0))

In [None]:
# Unlabelled complaints to score for the submission file.
test_df = pd.read_csv("test_complaints.csv")
print("Test data loaded, shape:", test_df.shape)

# Clean the test text with the same function used for the training text.
test_df['cleaned_text'] = [clean_text(raw_text) for raw_text in test_df['complaint_text']]
print("\n Predicting Severity")

# For the final model, use every labelled row (no hold-out split).
X_full_severity, y_full_severity = df['cleaned_text'], df['severity']

# Same vectorizer settings as the evaluated severity model, refit on all rows.
tfidf_full_severity = TfidfVectorizer(
    ngram_range=(1, 2), max_features=5000, min_df=5, max_df=0.85
)
X_full_tfidf = tfidf_full_severity.fit_transform(X_full_severity)


def create_features(text_series):
    """Return per-text length features as a float array of shape ``(n, 2)``.

    Column 0 holds the character count and column 1 the number of
    whitespace-separated tokens of each text in ``text_series``.

    NOTE(review): this computes the same values as
    ``create_additional_features`` defined in an earlier cell; it is repeated
    here so this cell only needs ``np`` to run.
    """
    char_counts = [len(doc) for doc in text_series]
    token_counts = [len(doc.split()) for doc in text_series]
    # Stack the two count lists side by side as float columns.
    return np.column_stack([char_counts, token_counts]).astype(float)

# Length features for every labelled row, appended to the full-data TF-IDF matrix.
X_full_extra = create_features(X_full_severity)

from scipy.sparse import hstack
X_full_combined = hstack([X_full_tfidf, X_full_extra])

# Final severity model: same hyper-parameters as the evaluated gradient
# boosting model, trained on all labelled data.
final_severity_model = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=10,
    learning_rate=0.1,
    random_state=42
)
final_severity_model.fit(X_full_combined, y_full_severity)

# Submission-set features get their own `X_submit_*` names. Reusing
# `X_test_tfidf` / `X_test_extra` / `X_test_combined` here would overwrite the
# hold-out matrices from the evaluation cells, so re-running those cells
# afterwards would score `y_test` against rows from a different dataset.
X_submit_tfidf = tfidf_full_severity.transform(test_df['cleaned_text'])
X_submit_extra = create_features(test_df['cleaned_text'])
X_submit_combined = hstack([X_submit_tfidf, X_submit_extra])

test_severity = final_severity_model.predict(X_submit_combined)

print(" Predicting Primary Category")

y_full_primary = df['primary_category']

# Same vectorizer settings as the evaluated primary-category model, refit on
# all labelled text.
tfidf_full_primary = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9
)
X_full_primary_tfidf = tfidf_full_primary.fit_transform(X_full_severity)

# n_estimators is 150 to match the primary-category model that was evaluated
# on the hold-out split; the earlier value of 100 meant the submitted model
# was not the configuration whose metrics were reported.
final_primary_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=20,
    class_weight='balanced_subsample',
    random_state=42,
    n_jobs=-1
)
final_primary_model.fit(X_full_primary_tfidf, y_full_primary)

# Use a submission-specific name so the validation split's `X_test_primary`
# (raw text from the evaluation cell) is not overwritten with a sparse matrix.
X_submit_primary = tfidf_full_primary.transform(test_df['cleaned_text'])
test_primary = final_primary_model.predict(X_submit_primary)

print(" Predicting Secondary Category")

y_full_secondary = df['secondary_category']

# Same vectorizer settings as the evaluated secondary-category model, refit on
# all labelled text.
tfidf_full_secondary = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.85
)
X_full_secondary_tfidf = tfidf_full_secondary.fit_transform(X_full_severity)

# Same hyper-parameters as the evaluated secondary-category model.
final_secondary_model = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=10,
    learning_rate=0.1,
    random_state=42
)
final_secondary_model.fit(X_full_secondary_tfidf, y_full_secondary)

# Use a submission-specific name so the validation split's `X_test_secondary`
# (raw text from the evaluation cell) is not overwritten with a sparse matrix.
X_submit_secondary = tfidf_full_secondary.transform(test_df['cleaned_text'])
test_secondary = final_secondary_model.predict(X_submit_secondary)

print("All predictions completed!")

In [None]:
# One row per test complaint: its id followed by the three predicted labels.
submission = test_df[['complaint_id']].assign(
    primary_category=test_primary,
    secondary_category=test_secondary,
    severity=test_severity,
)

# Write without the index so the file holds exactly the four columns above.
submission.to_csv('final_submission_ensemble.csv', index=False)

print("Submission file created: final_submission_ensemble.csv")
print("File preview:", submission.head(), sep="\n")
print(f"\nFile shape: {submission.shape}")
print("Severity distribution in submission:")
print(submission['severity'].value_counts().sort_index())