In [3]:
!pip install scikit-learn seaborn



Load and Preprocess Data (Step 1: Load Dataset and Select Columns)


In [2]:
import pandas as pd

# Load dataset
url = "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"
df = pd.read_csv(url, compression='zip', low_memory=False)

# Selecting required columns
columns_needed = ["Product", "Consumer complaint narrative"]
df = df[columns_needed].dropna()
df.columns = ["Category", "Complaint"]

# Check the first few rows to ensure correct loading
print(df.head())


                                             Category  \
9   Credit reporting or other personal consumer re...   
13  Credit reporting or other personal consumer re...   
14  Credit reporting or other personal consumer re...   
19  Credit reporting or other personal consumer re...   
20  Credit reporting or other personal consumer re...   

                                            Complaint  
9   Subject : Complaint Regarding Inaccurate Late ...  
13  Subject : Formal Complaint Against XXXX and Tr...  
14  Subject : Follow-Up on Dispute Submitted XX/XX...  
19  XX/XX/year> Subject : XXXX XXXX XXXX TransUnio...  
20  XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX X...  


Clean Data and Map Categories

In [3]:
import re
import string

# Mapping categories to numerical values
category_map = {
    "Credit reporting, repair, or other": 0,
    "Debt collection": 1,
    "Consumer Loan": 2,
    "Mortgage": 3
}
df = df[df["Category"].isin(category_map.keys())]
df["Category"] = df["Category"].map(category_map)

# Text Preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub("\d+", "", text)  # Remove numbers
    return text

# Clean the complaint text
df["Complaint"] = df["Complaint"].apply(clean_text)

# Check the first few rows to ensure preprocessing is done correctly
print(df.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Category"] = df["Category"].map(category_map)


     Category                                          Complaint
299         1  i have dealt with xxxx xxxx all my life and ev...
321         1  i am writing to address a concerning matter re...
377         1  this is so annoying  frustrating ive sent expe...
378         1  delete those late dates and update the statuse...
380         1  this is so annoying  frustrating ive sent equi...


Split Dataset into Train and Test

In [4]:
from sklearn.model_selection import train_test_split

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(
    df["Complaint"], df["Category"], test_size=0.2, random_state=42, stratify=df["Category"]
)

# Check the shape of train and test splits
print(f"Training Data Size: {X_train.shape}")
print(f"Test Data Size: {X_test.shape}")


Training Data Size: (351671,)
Test Data Size: (87918,)


Convert Text to TF-IDF Features

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Check the shape of the resulting TF-IDF matrices
print(f"TF-IDF Training Matrix Shape: {X_train_tfidf.shape}")
print(f"TF-IDF Test Matrix Shape: {X_test_tfidf.shape}")


TF-IDF Training Matrix Shape: (351671, 5000)
TF-IDF Test Matrix Shape: (87918, 5000)


 Train Naïve Bayes Model

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train Naïve Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predictions and evaluation
y_pred_nb = nb_model.predict(X_test_tfidf)
print("Naïve Bayes Model Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naïve Bayes Model Accuracy: 0.951238654200505
              precision    recall  f1-score   support

           1       0.97      0.97      0.97     60894
           2       0.76      0.30      0.43      1892
           3       0.91      0.96      0.93     25132

    accuracy                           0.95     87918
   macro avg       0.88      0.74      0.78     87918
weighted avg       0.95      0.95      0.95     87918



Train Logistic Regression Model

In [7]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Predictions and evaluation
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Logistic Regression Model Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Model Accuracy: 0.9678336631861507
              precision    recall  f1-score   support

           1       0.97      0.98      0.98     60894
           2       0.80      0.53      0.64      1892
           3       0.96      0.96      0.96     25132

    accuracy                           0.97     87918
   macro avg       0.91      0.83      0.86     87918
weighted avg       0.97      0.97      0.97     87918



Naive Bayes with Bernoulli Distribution

In [11]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

# Train Bernoulli Naive Bayes model
bnb_model = BernoulliNB()
bnb_model.fit(X_train_tfidf, y_train)

# Predictions and evaluation
y_pred_bnb = bnb_model.predict(X_test_tfidf)
print("Bernoulli Naive Bayes Model Accuracy:", accuracy_score(y_test, y_pred_bnb))
print(classification_report(y_test, y_pred_bnb))


Bernoulli Naive Bayes Model Accuracy: 0.8632475716008099
              precision    recall  f1-score   support

           1       0.94      0.89      0.92     60894
           2       0.20      0.69      0.32      1892
           3       0.86      0.80      0.83     25132

    accuracy                           0.86     87918
   macro avg       0.67      0.79      0.69     87918
weighted avg       0.90      0.86      0.88     87918



LightGBM

Prediction Function

In [10]:
# Prediction function
def predict_complaint(text, model):
    text_tfidf = vectorizer.transform([clean_text(text)])
    category_idx = model.predict(text_tfidf)[0]
    category = [k for k, v in category_map.items() if v == category_idx][0]
    return category

# Example prediction using Logistic Regression
example_text = "My credit report has incorrect information and they won't fix it."
print("Predicted Category:", predict_complaint(example_text, lr_model))


Predicted Category: Debt collection
