In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings

warnings.filterwarnings('ignore')

# Download the 'stopwords' list
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
file_path = '/content/drive/MyDrive/Kaiburr/complaints.csv'

df = pd.read_csv(file_path)

categories_to_keep = [
    "Credit reporting, repair, or other",
    "Debt collection",
    "Consumer Loan",
    "Mortgage"
]


df = df[df['Product'].isin(categories_to_keep)]

df = df.dropna(subset=['Consumer complaint narrative'])

category_map = {
    "Credit reporting, repair, or other": 0,
    "Debt collection": 1,
    "Consumer Loan": 2,
    "Mortgage": 3
}
df['category_id'] = df['Product'].map(category_map)

df_final = df[['Consumer complaint narrative', 'category_id', 'Product']].copy()

print(f"Total complaints after filtering: {len(df_final)}")
print(df_final['Product'].value_counts())

Total complaints after filtering: 516220
Product
Debt collection    371899
Mortgage           134860
Consumer Loan        9461
Name: count, dtype: int64


In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df_final['processed_text'] = df_final['Consumer complaint narrative'].apply(preprocess_text)

print("Text processing complete.")

Text processing complete.


In [6]:
X = df_final['processed_text']
y = df_final['category_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Data split and vectorized.")

Data split and vectorized.


In [7]:
# Model 1: Logistic Regression
print("Training Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
lr_preds = lr_model.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_preds)
print(f"Logistic Regression Accuracy: {lr_accuracy * 100:.2f}%")

# Model 2: Multinomial Naive Bayes
print("\nTraining Multinomial Naive Bayes...")
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
nb_preds = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, nb_preds)
print(f"Naive Bayes Accuracy: {nb_accuracy * 100:.2f}%")

Training Logistic Regression...
Logistic Regression Accuracy: 96.98%

Training Multinomial Naive Bayes...
Naive Bayes Accuracy: 95.21%


In [9]:
print("\n--- Logistic Regression Evaluation ---")

unique_labels = sorted(y_test.unique())

target_names = [name for name, label in category_map.items() if label in unique_labels]
print(classification_report(y_test, lr_preds, target_names=target_names, labels=unique_labels))


--- Logistic Regression Evaluation ---
                 precision    recall  f1-score   support

Debt collection       0.98      0.98      0.98     74103
  Consumer Loan       0.78      0.52      0.63      1951
       Mortgage       0.96      0.96      0.96     27190

       accuracy                           0.97    103244
      macro avg       0.91      0.82      0.86    103244
   weighted avg       0.97      0.97      0.97    103244



In [10]:
category_names = ["Debt collection", "Consumer Loan", "Mortgage"]

new_complaint_text = "I was overcharged on my mortgage payment, this is unacceptable!"

processed_new_text = preprocess_text(new_complaint_text)

new_text_tfidf = tfidf_vectorizer.transform([processed_new_text])

prediction = lr_model.predict(new_text_tfidf)
predicted_category_id = prediction[0]

try:
    predicted_category_name = category_names[unique_labels.index(predicted_category_id)]
except ValueError:

    predicted_category_name = "Unknown Category"


print(f"\n--- New Prediction ---")
print(f"Complaint: '{new_complaint_text}'")
print(f"Predicted Category: {predicted_category_name}")


--- New Prediction ---
Complaint: 'I was overcharged on my mortgage payment, this is unacceptable!'
Predicted Category: Mortgage
