In [None]:
from datetime import datetime
print("Run by Adithya Reddy on", datetime.now())

Run by Adithya Reddy on 2025-10-14 16:17:24.476585


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [None]:
from google.colab import files
uploaded = files.upload()

Saving complaints.csv to complaints.csv


In [None]:
df = pd.read_csv("complaints.csv",
                 usecols=["Product", "Consumer complaint narrative"])

# Drop missing complaints
df = df.dropna(subset=["Consumer complaint narrative"])
df.head()

Unnamed: 0,Product,Consumer complaint narrative
0,Credit reporting or other personal consumer re...,This CFPB complaint has been filed to request ...
1,Student loan,I currently have a loan serviced by Aidvantage...
2,Credit reporting or other personal consumer re...,""" I was utterly shocked and devastated upon di..."
3,Debt collection,"I, XXXX XXXX, am filing a formal complaint aga..."
4,Credit reporting or other personal consumer re...,It has been brought to my attention that my pe...


In [None]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove special characters & numbers
    return text

df["clean_text"] = df["Consumer complaint narrative"].apply(clean_text)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["Product"], test_size=0.2, random_state=42
)

In [None]:
y_train.value_counts()

Unnamed: 0_level_0,count
Product,Unnamed: 1_level_1
Credit reporting or other personal consumer reports,34669
Debt collection,3219
Mortgage,458
Vehicle loan or lease,392
Student loan,278


In [None]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [None]:
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9065

Classification Report:
                                                      precision    recall  f1-score   support

Credit reporting or other personal consumer reports       0.93      0.99      0.96      1775
                                    Debt collection       0.52      0.32      0.39       160
                                           Mortgage       1.00      0.12      0.22        24
                                       Student loan       0.00      0.00      0.00        17
                              Vehicle loan or lease       0.00      0.00      0.00        24

                                           accuracy                           0.91      2000
                                          macro avg       0.49      0.29      0.31      2000
                                       weighted avg       0.88      0.91      0.88      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import joblib
joblib.dump(model, "complaint_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

##testing the traditional model

In [None]:
model = joblib.load("complaint_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

In [None]:
test_data = [
    "The bank added wrong information to my credit report.",
    "I keep receiving calls from debt collectors about a loan I never took.",
    "My student loan interest rate is incorrect.",
    "I applied for a mortgage but they delayed it for months.",
    "The dealer overcharged me for my car loan.",
]

In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

cleaned = [clean_text(t) for t in test_data]

In [None]:
test_tfidf = vectorizer.transform(cleaned)
predictions = model.predict(test_tfidf)

In [None]:
for text, label in zip(test_data, predictions):
    print(f"Complaint: {text}")
    print(f"Predicted Category: {label}")
    print("-" * 80)

Complaint: The bank added wrong information to my credit report.
Predicted Category: Credit reporting or other personal consumer reports
--------------------------------------------------------------------------------
Complaint: I keep receiving calls from debt collectors about a loan I never took.
Predicted Category: Credit reporting or other personal consumer reports
--------------------------------------------------------------------------------
Complaint: My student loan interest rate is incorrect.
Predicted Category: Credit reporting or other personal consumer reports
--------------------------------------------------------------------------------
Complaint: I applied for a mortgage but they delayed it for months.
Predicted Category: Credit reporting or other personal consumer reports
--------------------------------------------------------------------------------
Complaint: The dealer overcharged me for my car loan.
Predicted Category: Credit reporting or other personal consumer 

# Random UnderSampling

In [None]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy="not minority")
X_res, y_res = rus.fit_resample(X_train_tfidf, y_train)

y_res.value_counts()

Unnamed: 0_level_0,count
Product,Unnamed: 1_level_1
Credit reporting or other personal consumer reports,278
Debt collection,278
Mortgage,278
Student loan,278
Vehicle loan or lease,278


In [None]:
model = MultinomialNB()
model.fit(X_res, y_res)

In [None]:
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8743208610968733

Classification Report:
                                                      precision    recall  f1-score   support

Credit reporting or other personal consumer reports       0.99      0.88      0.93      8685
                                    Debt collection       0.49      0.75      0.60       799
                                           Mortgage       0.53      0.94      0.68       116
                                       Student loan       0.25      0.89      0.39        70
                              Vehicle loan or lease       0.25      0.88      0.39        85

                                           accuracy                           0.87      9755
                                          macro avg       0.50      0.87      0.60      9755
                                       weighted avg       0.93      0.87      0.89      9755



In [None]:
test_tfidf = vectorizer.transform(cleaned)
predictions = model.predict(test_tfidf)

In [None]:
for text, label in zip(test_data, predictions):
    print(f"Complaint: {text}")
    print(f"Predicted Category: {label}")
    print("-" * 80)

Complaint: The bank added wrong information to my credit report.
Predicted Category: Vehicle loan or lease
--------------------------------------------------------------------------------
Complaint: I keep receiving calls from debt collectors about a loan I never took.
Predicted Category: Student loan
--------------------------------------------------------------------------------
Complaint: My student loan interest rate is incorrect.
Predicted Category: Student loan
--------------------------------------------------------------------------------
Complaint: I applied for a mortgage but they delayed it for months.
Predicted Category: Mortgage
--------------------------------------------------------------------------------
Complaint: The dealer overcharged me for my car loan.
Predicted Category: Vehicle loan or lease
--------------------------------------------------------------------------------


## Manual Dataset Modifications

In [15]:
credit_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-reportsnother.csv")
debt_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-DebtCollection.csv")
mortgage_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-mortgage.csv")
student_vehicle_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-Loan.csv")

df = pd.concat([credit_df, debt_df, mortgage_df, student_vehicle_df], ignore_index=True)

print("Combined dataset shape:", df.shape)

Combined dataset shape: (98627, 18)


In [16]:
# Keeping only required columns
df = df[["Product", "Consumer complaint narrative"]].dropna()

In [17]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

df["clean_text"] = df["Consumer complaint narrative"].apply(clean_text)

# Drop empty complaints (if any)
df = df[df["clean_text"].str.len() > 10]
print("After cleaning:", df.shape)

After cleaning: (38185, 3)


In [18]:
label_map = {
    "Credit reporting or other personal consumer reports": 0,  # Credit reporting
    "Debt collection": 1,                                      # Debt collection
    "Student loan": 2,                                         # Consumer loan
    "Vehicle loan or lease": 2,                                # Consumer loan
    "Mortgage": 3                                              # Mortgage
}

df["label"] = df["Product"].map(label_map)
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

print("\nLabel Distribution (before balancing):\n", df["label"].value_counts())


Label Distribution (before balancing):
 label
3    11899
2    10529
1     8140
0     7617
Name: count, dtype: int64


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

In [20]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [21]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy="not minority", random_state=42)
X_res, y_res = rus.fit_resample(X_train_tfidf, y_train)

# Check new class distribution
print("\nLabel Distribution after undersampling:\n", pd.Series(y_res).value_counts())


Label Distribution after undersampling:
 label
0    6094
1    6094
2    6094
3    6094
Name: count, dtype: int64


In [22]:
model = MultinomialNB()
model.fit(X_res, y_res)

In [23]:
y_pred = model.predict(X_test_tfidf)
print("\n✅ Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


✅ Accuracy: 87.76 %

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.87      1523
           1       0.89      0.75      0.81      1628
           2       0.90      0.86      0.88      2106
           3       0.89      0.97      0.92      2380

    accuracy                           0.88      7637
   macro avg       0.88      0.87      0.87      7637
weighted avg       0.88      0.88      0.88      7637



IMPROVING ACCURACY BY INCREASING DATASET

In [4]:
credit_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-reportsnother.csv")
debt_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-DebtCollection.csv")
mortgage_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-mortgage.csv")
student_vehicle_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-Loan.csv")

df = pd.concat([credit_df, debt_df, mortgage_df, student_vehicle_df], ignore_index=True)

print("Combined dataset shape:", df.shape)

Combined dataset shape: (479673, 18)


In [5]:
# Keeping only required columns
df = df[["Product", "Consumer complaint narrative"]].dropna()

In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

df["clean_text"] = df["Consumer complaint narrative"].apply(clean_text)

# Drop empty complaints (if any)
df = df[df["clean_text"].str.len() > 10]
print("After cleaning:", df.shape)

After cleaning: (189973, 3)


In [7]:
label_map = {
    "Credit reporting or other personal consumer reports": 0,  # Credit reporting
    "Debt collection": 1,                                      # Debt collection
    "Student loan": 2,                                         # Consumer loan
    "Vehicle loan or lease": 2,                                # Consumer loan
    "Mortgage": 3                                              # Mortgage
}

df["label"] = df["Product"].map(label_map)
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

print("\nLabel Distribution (before balancing):\n", df["label"].value_counts())


Label Distribution (before balancing):
 label
3    63428
2    61490
1    39330
0    25725
Name: count, dtype: int64


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

In [9]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy="not minority", random_state=42)
X_res, y_res = rus.fit_resample(X_train_tfidf, y_train)

# Check new class distribution
print("\nLabel Distribution after undersampling:\n", pd.Series(y_res).value_counts())


Label Distribution after undersampling:
 label
0    20580
1    20580
2    20580
3    20580
Name: count, dtype: int64


In [11]:
model = MultinomialNB()
model.fit(X_res, y_res)

In [12]:
y_pred = model.predict(X_test_tfidf)
print("\n✅ Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


✅ Accuracy: 88.36 %

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82      5145
           1       0.89      0.76      0.82      7866
           2       0.91      0.87      0.89     12298
           3       0.92      0.97      0.94     12686

    accuracy                           0.88     37995
   macro avg       0.87      0.87      0.87     37995
weighted avg       0.89      0.88      0.88     37995



## Changing the model to increase accuraccy

In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
import joblib

In [15]:
credit_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-reportsnother.csv")
debt_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-DebtCollection.csv")
mortgage_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-mortgage.csv")
student_vehicle_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Kaiburr/task5/complaints-Loan.csv")

df = pd.concat([credit_df, debt_df, mortgage_df, student_vehicle_df], ignore_index=True)

print("Combined dataset shape:", df.shape)

Combined dataset shape: (479673, 18)


In [17]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["Consumer complaint narrative"] = df["Consumer complaint narrative"].fillna("")
df["clean_text"] = df["Consumer complaint narrative"].apply(clean_text)
df = df[df["clean_text"].str.len() > 10]

In [18]:
label_map = {
    "Credit reporting or other personal consumer reports": 0,
    "Debt collection": 1,
    "Student loan": 2,
    "Vehicle loan or lease": 2,
    "Mortgage": 3
}

df["label"] = df["Product"].map(label_map)
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

print("Label distribution before balancing:\n", df["label"].value_counts())

Label distribution before balancing:
 label
3    63428
2    61490
1    39330
0    25725
Name: count, dtype: int64


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"],
    test_size=0.2, random_state=42, stratify=df["label"]
)

In [20]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=8000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [21]:
from collections import Counter
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=42)
X_res, y_res = rus.fit_resample(X_train_tfidf, y_train)
print("After undersampling:", Counter(y_res))

After undersampling: Counter({0: 20580, 1: 20580, 2: 20580, 3: 20580})


In [22]:
xgb_model = XGBClassifier(
    n_estimators=300,         # number of trees
    learning_rate=0.1,        # step size
    max_depth=6,              # tree depth
    subsample=0.8,            # random row sampling
    colsample_bytree=0.8,     # random column sampling
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

xgb_model.fit(X_res, y_res)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [23]:
y_pred = xgb_model.predict(X_test_tfidf)

print("\n✅ Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Accuracy: 92.78 %

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89      5145
           1       0.89      0.90      0.89      7866
           2       0.93      0.93      0.93     12298
           3       0.98      0.95      0.96     12686

    accuracy                           0.93     37995
   macro avg       0.91      0.93      0.92     37995
weighted avg       0.93      0.93      0.93     37995


Confusion Matrix:
 [[ 4781   276    73    15]
 [  474  7059   278    55]
 [  281   444 11387   186]
 [   79   137   446 12024]]


In [24]:
test_texts = [
    "The bank added wrong information to my credit report.",
    "I keep getting calls from debt collectors about a loan I never took.",
    "My student loan interest rate is incorrect.",
    "My mortgage payment was processed late by the bank.",
    "The dealer overcharged me for my car loan."
]

test_clean = [clean_text(t) for t in test_texts]
test_vec = vectorizer.transform(test_clean)
preds = xgb_model.predict(test_vec)

reverse_map = {0:"Credit reporting, repair, or other",1:"Debt collection",2:"Consumer Loan",3:"Mortgage"}

print("\n🧠 Predictions:")
for text, label in zip(test_texts, preds):
    print(f"\nComplaint: {text}")
    print(f"Predicted Category: {reverse_map[label]}")
    print("-"*80)


🧠 Predictions:

Complaint: The bank added wrong information to my credit report.
Predicted Category: Credit reporting, repair, or other
--------------------------------------------------------------------------------

Complaint: I keep getting calls from debt collectors about a loan I never took.
Predicted Category: Debt collection
--------------------------------------------------------------------------------

Complaint: My student loan interest rate is incorrect.
Predicted Category: Consumer Loan
--------------------------------------------------------------------------------

Complaint: My mortgage payment was processed late by the bank.
Predicted Category: Mortgage
--------------------------------------------------------------------------------

Complaint: The dealer overcharged me for my car loan.
Predicted Category: Consumer Loan
--------------------------------------------------------------------------------


In [25]:
joblib.dump(xgb_model, "xgboost_complaint_model.pkl")
joblib.dump(vectorizer, "xgboost_tfidf_vectorizer.pkl")
print("\n✅ Model and vectorizer saved successfully!")



✅ Model and vectorizer saved successfully!
