In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import nltk
import re
from nltk.corpus import stopwords
import string

In [6]:
data = pd.read_csv("/content/drive/MyDrive/consumercomplaints.csv")

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,0,2022-11-11,Mortgage,Conventional home mortgage,Trouble during payment process,,
1,1,2022-11-23,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
2,2,2022-11-16,Mortgage,VA mortgage,Trouble during payment process,,
3,3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
4,4,2022-11-07,Mortgage,Other type of mortgage,Trouble during payment process,,


In [8]:
data.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
Date received,0
Product,0
Sub-product,235294
Issue,0
Sub-issue,683355
Consumer complaint narrative,1987977


In [11]:
data= data.drop("Unnamed: 0",axis=1)

In [12]:
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,2022-11-11,Mortgage,Conventional home mortgage,Trouble during payment process,,
1,2022-11-23,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
2,2022-11-16,Mortgage,VA mortgage,Trouble during payment process,,
3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
4,2022-11-07,Mortgage,Other type of mortgage,Trouble during payment process,,


In [13]:
data.describe()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
count,3101969,3101969,2866675,3101969,2418614,1113992
unique,4011,18,76,165,221,973058
top,2022-05-03,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,My credit reports are inaccurate. These inaccu...
freq,3637,1432096,1415856,760403,487320,1648


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3101969 entries, 0 to 3101968
Data columns (total 6 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Date received                 object
 1   Product                       object
 2   Sub-product                   object
 3   Issue                         object
 4   Sub-issue                     object
 5   Consumer complaint narrative  object
dtypes: object(6)
memory usage: 142.0+ MB


In [15]:
data.dropna()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
11,2022-11-09,Debt collection,Other debt,False statements or representation,Indicated you were committing crime by not pay...,XXXX is attempting to collect funds for Valuat...
15,2022-11-14,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,Today I called to get my balance and reset my ...
51,2022-10-12,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,The Federal Trade Commission Bureau of Consume...
72,2022-10-09,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Difficulty submitting a dispute or getting inf...,Ive mailed police report called been hung up o...
...,...,...,...,...,...,...
3101950,2017-03-04,Debt collection,I do not know,Disclosure verification of debt,Not given enough info to verify debt,I have received calls and notices in regards t...
3101955,2017-01-19,Student loan,Non-federal student loan,Can't repay my loan,Can't decrease my monthly payments,"Insanely high monthly payments, with "" no opti..."
3101956,2017-01-22,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Need information about my balance/terms,My loans have an extraordinarily high interest...
3101958,2017-01-26,Debt collection,Auto,Communication tactics,Called after sent written cease of comm,Received cease and desist letter from them sho...


In [16]:
data=data.dropna()

In [17]:
print(data["Product"].value_counts())

Product
Credit reporting, credit repair services, or other personal consumer reports    507582
Debt collection                                                                 192045
Credit card or prepaid card                                                      80410
Checking or savings account                                                      54192
Student loan                                                                     32697
Vehicle loan or lease                                                            19874
Payday loan, title loan, or personal loan                                         1008
Name: count, dtype: int64


**Training the model **

In [20]:
#cleaning the data first
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["Consumer complaint narrative"] = data["Consumer complaint narrative"].apply(clean)

  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)


**splitting the data into trianing and testing sets**

In [22]:
data = data[["Consumer complaint narrative", "Product"]]
x = np.array(data["Consumer complaint narrative"])
y = np.array(data["Product"])

cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)

In [23]:
sgdmodel = SGDClassifier()
sgdmodel.fit(X_train,y_train)

In [25]:
#checking accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(sgdmodel.predict(X_train),y_train))

0.864854050982548
