# Consumer Complaints

### Classify Consumer Complaints with Natural Language Processing

### Project Prerequisites

In [200]:
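# One-time setup: uncomment and run these two lines to download the spaCy
# English model ("en_core_web_sm") that this notebook loads further down.
# import subprocess
# print(subprocess.getoutput("python -m spacy download en_core_web_sm"))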

In [196]:
import pandas as pd
import spacy
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import plotly.express as px

### Preparing the Data

In [144]:
# Read the complaints CSV from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="complaints_processed.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [145]:
# Remove the redundant "Unnamed: 0" column (presumably a leftover row index
# from an earlier export -- confirm against the data source).
# Reassigning instead of mutating in place, together with errors="ignore",
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.head(2)

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...


In [146]:
# Row and column counts of the frame after the column drop.
df.shape

(162421, 2)

In [147]:
# Summary of both text columns. Compare the per-column `count` values:
# a lower count for `narrative` than for `product` means some narratives are missing.
df.describe()

Unnamed: 0,product,narrative
count,162421,162411
unique,5,124472
top,credit_reporting,victim identity notified collection creditor s...
freq,91179,739


### Handling Class Imbalance

In [148]:
# Number of complaints per product category, to see how unbalanced the classes are.
df["product"].value_counts()

In [149]:
# Number of rows drawn from each product category when building the balanced subset.
min_samples = 1000

In [150]:
def _sample_product(frame, product, n, seed=2022):
    """Return ``n`` randomly chosen rows of ``frame`` for one product category.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a ``"product"`` column to filter on.
    product : str
        Category value to keep.
    n : int
        Number of rows to draw (without replacement, the pandas default).
    seed : int, default 2022
        ``random_state`` passed to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    return frame[frame["product"] == product].sample(n, random_state=seed)


# Draw the same number of complaints from every product category so that no
# class dominates training. One helper replaces five copy-pasted expressions;
# the variable names, seed and resulting samples are unchanged.
credit_reporting_samples = _sample_product(df, "credit_reporting", min_samples)
debt_collection_samples = _sample_product(df, "debt_collection", min_samples)
mortgages_and_loans_samples = _sample_product(df, "mortgages_and_loans", min_samples)
credit_card_samples = _sample_product(df, "credit_card", min_samples)
retail_banking_samples = _sample_product(df, "retail_banking", min_samples)

In [155]:
# Preview the rows sampled for the credit_reporting category.
credit_reporting_samples.head()

Unnamed: 0,product,narrative
17858,credit_reporting,equifax adequately investigated inaccurate inf...
70553,credit_reporting,received copy credit report found following it...
95557,credit_reporting,looking credit report notice fraudulent accoun...
39422,credit_reporting,decided get credit pulled due couple bill cont...
62242,credit_reporting,tried multiple time dispute bankruptcy report ...


In [156]:
# Stack the per-product samples row-wise into a single balanced frame.
# Original index labels are kept (no ignore_index).
balanced_parts = [
    credit_reporting_samples,
    debt_collection_samples,
    mortgages_and_loans_samples,
    credit_card_samples,
    retail_banking_samples,
]
df = pd.concat(balanced_parts, axis=0)

In [157]:
# Per-product counts of the rebalanced frame.
df["product"].value_counts()

In [158]:
# Number of narratives in the rebalanced frame.
df["narrative"].shape

(5000,)

### Preprocessing

In [159]:
# Load the small English spaCy pipeline used by the preprocessing step.
nlp = spacy.load(name="en_core_web_sm")

In [160]:
def preprocessing(text):
    """Reduce a complaint narrative to its space-joined lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged as stop words or punctuation are skipped and the lemma of every
    remaining token is kept.

    Parameters
    ----------
    text : str
        Raw narrative. Any non-string value (``None``, a float ``NaN`` from a
        missing CSV cell, ...) is treated as an empty narrative.

    Returns
    -------
    str
        Lemmas joined by single spaces; ``""`` for non-string input.
    """
    # The source data contains a few missing narratives, which pandas stores
    # as float NaN. spaCy only accepts strings, so guard here instead of
    # letting one bad row abort the whole `.apply`.
    if not isinstance(text, str):
        return ""

    doc = nlp(text)
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct)
    )

In [161]:
# Run every narrative through `preprocessing` and store the result
# next to the original text for comparison.
cleaned_narratives = df["narrative"].apply(preprocessing)
df["New_narrative"] = cleaned_narratives
df.head()

Unnamed: 0,product,narrative,New_narrative
17858,credit_reporting,equifax adequately investigated inaccurate inf...,equifax adequately investigate inaccurate info...
70553,credit_reporting,received copy credit report found following it...,receive copy credit report find follow item er...
95557,credit_reporting,looking credit report notice fraudulent accoun...,look credit report notice fraudulent account b...
39422,credit_reporting,decided get credit pulled due couple bill cont...,decide credit pull couple bill continue receiv...
62242,credit_reporting,tried multiple time dispute bankruptcy report ...,try multiple time dispute bankruptcy report re...


In [162]:
# Shape after adding the New_narrative column.
df.shape

(5000, 3)

In [163]:
# Preview the original and preprocessed narratives side by side.
df.head()

Unnamed: 0,product,narrative,New_narrative
17858,credit_reporting,equifax adequately investigated inaccurate inf...,equifax adequately investigate inaccurate info...
70553,credit_reporting,received copy credit report found following it...,receive copy credit report find follow item er...
95557,credit_reporting,looking credit report notice fraudulent accoun...,look credit report notice fraudulent account b...
39422,credit_reporting,decided get credit pulled due couple bill cont...,decide credit pull couple bill continue receiv...
62242,credit_reporting,tried multiple time dispute bankruptcy report ...,try multiple time dispute bankruptcy report re...


### Splitting the data

In [164]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
features = df["New_narrative"]
labels = df["product"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=2022,
)

In [165]:
# Peek at the first training narratives.
X_train.head()

### Model Validation

In [198]:
# Candidate values for n_neighbors: 1 through 8.
arange = np.arange(1, 9)
n_candidates = len(arange)

# One score slot per candidate; every slot is filled by the loop below.
train_score = np.empty(n_candidates)
test_score = np.empty(n_candidates)

for position, k in enumerate(arange):
    # Bag-of-words features feeding a k-nearest-neighbours classifier.
    clf = Pipeline(
        steps=[
            ("CountVec", CountVectorizer()),
            ("Knn", KNeighborsClassifier(n_neighbors=k)),
        ]
    )
    clf.fit(X_train, y_train)

    # Record the score on the data the model saw and on the held-out data.
    train_score[position] = clf.score(X_train, y_train)
    test_score[position] = clf.score(X_test, y_test)

### Checking the model's prediction

In [199]:
# Collect both score curves in one labelled frame so the legend and axes are
# self-explanatory. The previous figure was titled "Test Values" although it
# shows train and test scores, and it had auto-generated series names.
scores = pd.DataFrame(
    {"n_neighbors": arange, "train": train_score, "test": test_score}
)
fig = px.line(
    scores,
    x="n_neighbors",
    y=["train", "test"],
    title="KNN score on train and test sets by n_neighbors",
    # "value" / "variable" are the names plotly gives the melted y-axis and
    # legend when several y columns are passed (wide-form input).
    labels={"value": "score", "variable": "split"},
)
fig.show()