# Importing Required Modules

In [39]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import nltk
import re
from nltk.corpus import stopwords
import string
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pickle

# Loading the Data

In [7]:
data = pd.read_csv('data\consumercomplaints.csv')

In [9]:
data.head()

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,0,2022-11-11,Mortgage,Conventional home mortgage,Trouble during payment process,,
1,1,2022-11-23,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
2,2,2022-11-16,Mortgage,VA mortgage,Trouble during payment process,,
3,3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
4,4,2022-11-07,Mortgage,Other type of mortgage,Trouble during payment process,,


In [10]:
data = data.drop("Unnamed: 0",axis=1)

# Exploring Data

In [14]:
data.shape

(3101969, 6)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3101969 entries, 0 to 3101968
Data columns (total 6 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Date received                 object
 1   Product                       object
 2   Sub-product                   object
 3   Issue                         object
 4   Sub-issue                     object
 5   Consumer complaint narrative  object
dtypes: object(6)
memory usage: 142.0+ MB


In [12]:
data.describe()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
count,3101969,3101969,2866675,3101969,2418614,1113992
unique,4011,18,76,165,221,973058
top,2022-05-03,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,My credit reports are inaccurate. These inaccu...
freq,3637,1432096,1415856,760403,487320,1648


In [13]:
data.isnull().sum()

Date received                         0
Product                               0
Sub-product                      235294
Issue                                 0
Sub-issue                        683355
Consumer complaint narrative    1987977
dtype: int64

### There are some null values in subproduct and Consumer Complaint narrative.The main compalint data have some null values and they are textual so any traditional data fill method will not work so for the mean time the best course of action is to drop those fields

In [15]:
data = data.dropna()

In [16]:
print(data["Product"].value_counts())

Credit reporting, credit repair services, or other personal consumer reports    507582
Debt collection                                                                 192045
Credit card or prepaid card                                                      80410
Checking or savings account                                                      54192
Student loan                                                                     32697
Vehicle loan or lease                                                            19874
Payday loan, title loan, or personal loan                                         1008
Name: Product, dtype: int64


### As we can see the product column in the dataset contains the labels. Here the labels represent the nature of the complaints reported by the consumers.

# Preprocessing the textual data 

### to remove stopwords( is,the, or etc.) punctuaions special symbols we are gonne use nltk and regex

In [17]:
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AYUSH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [18]:
def clean(text): # function to do the preprocessing on the text
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [19]:
data["Consumer complaint narrative"] = data["Consumer complaint narrative"].apply(clean)# applying the function

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Consumer complaint narrative"] = data["Consumer complaint narrative"].apply(clean)# applying the function


# Splitting the Data Based on Dependent and Independent Features

In [20]:
data = data[["Consumer complaint narrative", "Product"]]
x = np.array(data["Consumer complaint narrative"])
y = np.array(data["Product"])

# Splitting the data Based on training and testing

In [25]:
cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.33,  random_state=42)

# Training The Model

In [26]:
sgdmodel = SGDClassifier()
sgdmodel.fit(X_train,y_train)

# Testing the Model

In [29]:
model_accuracy=round((sgdmodel.score(X_test,y_test)*100),2)
model_accuracy# accuracy of the model

86.2

In [32]:
y_predict=sgdmodel.predict(X_test)

In [36]:
print(classification_report(y_test,y_predict))

                                                                              precision    recall  f1-score   support

                                                 Checking or savings account       0.87      0.84      0.85     17766
                                                 Credit card or prepaid card       0.80      0.76      0.78     26332
Credit reporting, credit repair services, or other personal consumer reports       0.88      0.94      0.91    167824
                                                             Debt collection       0.82      0.78      0.80     63496
                                   Payday loan, title loan, or personal loan       0.20      0.00      0.01       328
                                                                Student loan       0.89      0.76      0.82     10581
                                                       Vehicle loan or lease       0.79      0.48      0.60      6650

                                                      

In [40]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = sgdmodel.predict(data)
print(output)

Enter a Text:  On XXXX/XXXX/2022, I called Citi XXXX XXXX XXXX XXXX XXXX Customer Service at XXXX. I did not want to pay {$99.00} for the next year membership and wanted to cancel my card account. A customer service representative told me if I pay the {$99.00} membership fee and spending {$1000.00} in 3 months, I can get XXXX mileage reward points of XXXX XXXX. I believed what he said and paid {$99.00} membership fee on XXXX/XXXX/2022.   I spent more than {$1000.00} in 3 months since XXXX/XXXX/2022. On XXXX/XXXX/2022, I called the card Customer Service about my reward mileage points. I was total the reward mileage points are NOT XXXX. I can only get XXXX mileage points instead. I believe that the Citi XXXX XXXX XXXX XXXX XXXX Customer Service cheated me. This is business fraud!
['Credit card or prepaid card']


# Saving The Model

In [41]:
pickle.dump(sgdmodel,open('model.pkl','wb'))