In [None]:
# Provenance stamp: print who ran the notebook and the wall-clock time of the run.
from datetime import datetime

run_started_at = datetime.now()
print("Run by Adithya Reddy on", run_started_at)

Run by Adithya Reddy on 2025-10-14 16:17:24.476585


In [None]:
import re
from collections import Counter

import joblib
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [None]:
# Directory holding the complaint CSV exports. Kept in one constant so the
# notebook can be pointed at another copy of the data by editing a single line.
# NOTE(review): assumes Google Drive is already mounted at /content/drive --
# no mount call is visible in this notebook; confirm before a fresh run.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5"

# One CSV per export file; all four are stacked into a single frame below.
credit_df = pd.read_csv(f"{DATA_DIR}/complaints-reportsnother.csv")
debt_df = pd.read_csv(f"{DATA_DIR}/complaints-DebtCollection.csv")
mortgage_df = pd.read_csv(f"{DATA_DIR}/complaints-mortgage.csv")
student_vehicle_df = pd.read_csv(f"{DATA_DIR}/complaints-Loan.csv")

# ignore_index=True discards the per-file row numbers and gives the combined
# frame a fresh 0..n-1 index.
df = pd.concat([credit_df, debt_df, mortgage_df, student_vehicle_df], ignore_index=True)

print("Combined dataset shape:", df.shape)

Combined dataset shape: (479673, 18)


In [None]:
def clean_text(text):
    """Normalize a complaint narrative to lowercase, letters-only text.

    The text is lowercased, every character that is neither ``a``-``z`` nor
    whitespace is replaced by a space, runs of whitespace are collapsed to a
    single space, and leading/trailing whitespace is removed.

    Parameters
    ----------
    text : str or any
        Raw narrative. Any non-string value (for example ``None`` or the NaN
        that pandas uses for a missing cell) is treated as "no narrative".

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Missing values would otherwise fail on .lower(); map them to empty text
    # so callers do not have to pre-fill the column before applying this.
    if not isinstance(text, str):
        return ""
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

# Rows whose cleaned narrative has this many characters or fewer are dropped.
MIN_CLEAN_TEXT_LEN = 10

# Missing narratives become empty strings before cleaning.
df["Consumer complaint narrative"] = df["Consumer complaint narrative"].fillna("")
df["clean_text"] = df["Consumer complaint narrative"].apply(clean_text)

# .copy() makes the filtered rows an independent frame, so the column
# assignments performed on df further down do not trigger pandas'
# SettingWithCopyWarning.
df = df[df["clean_text"].str.len() > MIN_CLEAN_TEXT_LEN].copy()

In [None]:
# Product name -> integer class id. "Student loan" and "Vehicle loan or lease"
# deliberately share class 2.
label_map = {
    "Credit reporting or other personal consumer reports": 0,
    "Debt collection": 1,
    "Student loan": 2,
    "Vehicle loan or lease": 2,
    "Mortgage": 3
}

# Build the label column as one chain that returns a fresh frame at each step.
# This avoids assigning into the result of an earlier row filter, which is what
# raises SettingWithCopyWarning.
#   1. map Product -> class id (products missing from label_map become NaN)
#   2. drop the rows whose product is not in label_map
#   3. cast the now NaN-free label column from float to int
df = (
    df.assign(label=df["Product"].map(label_map))
    .dropna(subset=["label"])
    .astype({"label": int})
)

print("Label distribution before balancing:\n", df["label"].value_counts())

Label distribution before balancing:
 label
3    63428
2    61490
1    39330
0    25725
Name: count, dtype: int64


In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
features = df["clean_text"]
targets = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

In [None]:
# TF-IDF over unigrams and bigrams, capped at 8000 features, with English
# stop words removed.
tfidf_settings = {
    "stop_words": 'english',
    "max_features": 8000,
    "ngram_range": (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary is fitted on the training split only; the test split is
# transformed with that already-fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Balance the training set by randomly undersampling every class except the
# smallest one ('not minority'). Only the training matrix is resampled; the
# test split keeps its original class distribution.
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=42)
X_res, y_res = rus.fit_resample(X_train_tfidf, y_train)

# Per-class row counts after resampling.
print("After undersampling:", Counter(y_res))

After undersampling: Counter({0: 20580, 1: 20580, 2: 20580, 3: 20580})


In [None]:
# Gradient-boosted tree classifier trained on the class-balanced TF-IDF matrix.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and reports 'Parameters: { "use_label_encoder" } are not used.'
xgb_model = XGBClassifier(
    n_estimators=300,         # number of trees
    learning_rate=0.1,        # step size
    max_depth=6,              # tree depth
    subsample=0.8,            # random row sampling
    colsample_bytree=0.8,     # random column sampling
    random_state=42,          # fixed seed for the row/column sampling
    eval_metric='mlogloss'    # multiclass log loss
)

xgb_model.fit(X_res, y_res)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Score the held-out split.
y_pred = xgb_model.predict(X_test_tfidf)

# Compute every metric first, then print them together.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\n✅ Accuracy:", accuracy_pct, "%")
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)


✅ Accuracy: 92.78 %

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89      5145
           1       0.89      0.90      0.89      7866
           2       0.93      0.93      0.93     12298
           3       0.98      0.95      0.96     12686

    accuracy                           0.93     37995
   macro avg       0.91      0.93      0.92     37995
weighted avg       0.93      0.93      0.93     37995


Confusion Matrix:
 [[ 4781   276    73    15]
 [  474  7059   278    55]
 [  281   444 11387   186]
 [   79   137   446 12024]]


In [None]:
# Hand-written complaints used as a quick sanity check of the whole pipeline.
test_texts = [
    "The bank added wrong information to my credit report.",
    "I keep getting calls from debt collectors about a loan I never took.",
    "My student loan interest rate is incorrect.",
    "My mortgage payment was processed late by the bank.",
    "The dealer overcharged me for my car loan."
]

# Same steps as for the training data: clean, vectorize with the fitted
# TF-IDF vocabulary, then classify.
test_clean = list(map(clean_text, test_texts))
test_vec = vectorizer.transform(test_clean)
preds = xgb_model.predict(test_vec)

# Class id -> display name shown in the printed predictions.
reverse_map = {
    0: "Credit reporting, repair, or other",
    1: "Debt collection",
    2: "Consumer Loan",
    3: "Mortgage",
}

print("\n🧠 Predictions:")
for text, label in zip(test_texts, preds):
    predicted_name = reverse_map[label]
    print(f"\nComplaint: {text}")
    print(f"Predicted Category: {predicted_name}")
    print("-"*80)


🧠 Predictions:

Complaint: The bank added wrong information to my credit report.
Predicted Category: Credit reporting, repair, or other
--------------------------------------------------------------------------------

Complaint: I keep getting calls from debt collectors about a loan I never took.
Predicted Category: Debt collection
--------------------------------------------------------------------------------

Complaint: My student loan interest rate is incorrect.
Predicted Category: Consumer Loan
--------------------------------------------------------------------------------

Complaint: My mortgage payment was processed late by the bank.
Predicted Category: Mortgage
--------------------------------------------------------------------------------

Complaint: The dealer overcharged me for my car loan.
Predicted Category: Consumer Loan
--------------------------------------------------------------------------------


In [None]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer side by side;
# both are needed to score new text later.
artifacts = (
    (xgb_model, "xgboost_complaint_model.pkl"),
    (vectorizer, "xgboost_tfidf_vectorizer.pkl"),
)
for fitted_object, target_path in artifacts:
    joblib.dump(fitted_object, target_path)

print("\n✅ Model and vectorizer saved successfully!")



✅ Model and vectorizer saved successfully!


In [None]:
# Colab-only step: hand the two saved artifacts to the browser for download.
# The import stays in this cell because google.colab exists only inside Colab.
from google.colab import files

for artifact_name in ("xgboost_tfidf_vectorizer.pkl", "xgboost_complaint_model.pkl"):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>