In [1]:
import nltk
# Fetch the NLTK data packages used later in this notebook: the stopword
# lists (read via stopwords.words('english')) and the WordNet corpus
# (needed by WordNetLemmatizer). Re-running is harmless: NLTK reports the
# packages as already up-to-date when they are present locally.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Step 1: Data Exploration

In [5]:
import pandas as pd

# Read the raw complaints export and take a quick look at its structure:
# column names, (rows, columns), and the first few records.
raw_df = pd.read_csv('complaints.csv')
for overview in (raw_df.columns, raw_df.shape, raw_df.head()):
    print(overview)

# Keep only the label and text columns, discard rows where either one is
# missing, and give the columns task-oriented names. Each step returns a
# new frame, so the raw data stays untouched in `raw_df`.
df = (
    raw_df[['product', 'narrative']]
    .dropna()
    .rename(columns={'product': 'category', 'narrative': 'text'})
)

# How many complaints fall into each category?
print("\nCategory Distribution:")
category_counts = df['category'].value_counts()
print(category_counts)

Index(['Unnamed: 0', 'product', 'narrative'], dtype='object')
(162421, 3)
   Unnamed: 0           product  \
0           0       credit_card   
1           1       credit_card   
2           2    retail_banking   
3           3  credit_reporting   
4           4  credit_reporting   

                                           narrative  
0  purchase order day shipping amount receive pro...  
1  forwarded message date tue subject please inve...  
2  forwarded message cc sent friday pdt subject f...  
3  payment history missing credit report speciali...  
4  payment history missing credit report made mis...  

Category Distribution:
category
credit_reporting       91172
debt_collection        23148
mortgages_and_loans    18990
credit_card            15566
retail_banking         13535
Name: count, dtype: int64


# Step 2: Text Preprocessing

In [16]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared NLP resources, built once and reused for every document:
# a set of English stopwords (set membership tests are fast) and a
# WordNet-based lemmatizer.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one piece of text into a space-separated string of lemmas.

    Pipeline: lowercase -> replace every character that is not a-z or
    whitespace with a space -> split on whitespace -> drop English
    stopwords -> lemmatize each remaining token.

    Relies on the module-level ``stop_words`` set and ``lemmatizer``
    instance defined earlier in this cell.

    Parameters
    ----------
    text : str
        Raw text to clean. Non-string values (e.g. None or a NaN float
        from a missing cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing is left.
    """
    # Missing / non-text cells have no .lower(); treat them as empty input
    # instead of raising AttributeError in the middle of a long .apply().
    if not isinstance(text, str):
        return ''

    # 1. Convert to lowercase
    text = text.lower()

    # 2. Replace punctuation and digits with a space. Deleting them outright
    #    would glue neighbouring words together ("credit/debit" ->
    #    "creditdebit") and turn contractions into non-stopwords
    #    ("can't" -> "cant").
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 3. Tokenize on whitespace, drop stopwords, and 4. lemmatize what remains
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)

# Clean every narrative and store the result alongside the original text
print("Preprocessing text... this may take a moment.")
df['cleaned_text'] = df['text'].apply(preprocess_text)
print("Preprocessing complete.")

# Side-by-side preview of the raw and cleaned text for the first rows
preview = df.loc[:, ['text', 'cleaned_text']].head()
print(preview)

Preprocessing text... this may take a moment.
Preprocessing complete.
                                                text  \
0  purchase order day shipping amount receive pro...   
1  forwarded message date tue subject please inve...   
2  forwarded message cc sent friday pdt subject f...   
3  payment history missing credit report speciali...   
4  payment history missing credit report made mis...   

                                        cleaned_text  
0  purchase order day shipping amount receive pro...  
1  forwarded message date tue subject please inve...  
2  forwarded message cc sent friday pdt subject f...  
3  payment history missing credit report speciali...  
4  payment history missing credit report made mis...  


# Step 3: Feature Extraction (TF-IDF)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Model inputs are the cleaned narratives; labels are the product categories
X, y = df['cleaned_text'], df['category']

# Hold out 20% of the rows for testing so the models are scored on unseen
# data. Stratifying on y keeps the category proportions the same in both
# splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF features, with the vocabulary capped at 5000 terms to keep the
# feature matrix manageable
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse that fitted vectorizer on the test split (no refitting on test data)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

for split_name, matrix in (("training", X_train_tfidf), ("testing", X_test_tfidf)):
    print(f"Shape of TF-IDF {split_name} data: {matrix.shape}")

Shape of TF-IDF training data: (129928, 5000)
Shape of TF-IDF testing data: (32483, 5000)


# Step 4: Model Training & Evaluation

# a. Train the Model

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# The two candidate classifiers: Multinomial Naive Bayes and a linear SVM
# (seeded so repeated runs give the same SVM fit)
nb_model = MultinomialNB()
svm_model = LinearSVC(random_state=42)

# Fit each one on the same TF-IDF training matrix, announcing progress first
training_plan = (
    ("Training Multinomial Naive Bayes model...", nb_model),
    ("Training Linear SVM model...", svm_model),
)
for announcement, estimator in training_plan:
    print(announcement)
    estimator.fit(X_train_tfidf, y_train)

Training Multinomial Naive Bayes model...
Training Linear SVM model...


# b. Evaluate the Models

In [20]:
def _print_evaluation(title, y_true, y_pred):
    """Print the accuracy and per-class classification report under a header."""
    print(f"\n--- {title} Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))


# Predict the held-out test split with each trained model
y_pred_nb = nb_model.predict(X_test_tfidf)
y_pred_svm = svm_model.predict(X_test_tfidf)

# Report both models in the same format so they can be compared directly
_print_evaluation("Multinomial Naive Bayes", y_test, y_pred_nb)
_print_evaluation("Linear SVM", y_test, y_pred_svm)


--- Multinomial Naive Bayes Evaluation ---
Accuracy: 0.8378
Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.73      0.73      0.73      3113
   credit_reporting       0.88      0.91      0.89     18235
    debt_collection       0.82      0.60      0.69      4630
mortgages_and_loans       0.76      0.85      0.80      3798
     retail_banking       0.84      0.84      0.84      2707

           accuracy                           0.84     32483
          macro avg       0.80      0.79      0.79     32483
       weighted avg       0.84      0.84      0.83     32483


--- Linear SVM Evaluation ---
Accuracy: 0.8720
Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.79      0.77      0.78      3113
   credit_reporting       0.91      0.94      0.92     18235
    debt_collection       0.81      0.73      0.77      4630
mortgages_and_loans       0.85      0.82      0.

# Step 5: Save the Model and Vectorizer

In [24]:
import joblib

# Final artifacts: a fresh vectorizer and a fresh Linear SVM (the model
# chosen here as the best one), configured the same way as during evaluation
final_vectorizer = TfidfVectorizer(max_features=5000)
final_model = LinearSVC(random_state=42)

# Refit both on the ENTIRE dataset (train + test) before saving
X_tfidf_full = final_vectorizer.fit_transform(X)
final_model.fit(X_tfidf_full, y)

# Persist the classifier and its matching vectorizer; both files are needed
# together to classify new text
artifacts = (
    (final_model, 'ticket_classifier_model.joblib'),
    (final_vectorizer, 'tfidf_vectorizer.joblib'),
)
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)

print("Model and vectorizer saved.")

Model and vectorizer saved.
