In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [2]:
# Load only the three columns this notebook works with from the cleaned complaints file.
consumer = pd.read_csv(
    '../data/Consumer_Clean.csv',
    usecols=['Product', 'Consumer_Complaint', 'Product_id'],
)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
consumer.head(n=5)

Unnamed: 0,Product,Consumer_Complaint,Product_id
0,Debt collection,Midland Funding had posted a debt under a diff...,7
1,Debt collection,XXXX XXXX XXXX and Nationstar Mortgage has bee...,7
2,Debt collection,Complaint against - XXXX XXXX -- - A company c...,7
3,Debt collection,"The bill should only be XXXX. XXXX in XXXX, FL...",7
4,Debt collection,I have been disputing an account with XXXX XXX...,7


In [4]:
# Collapse the very long credit-reporting product label into a short identifier;
# every other product label is left untouched.
consumer['Product'] = consumer['Product'].replace(
    {
        'Credit reporting, credit repair services, or other personal consumer reports':
            'Credit_Reporting-Credit_Repair',
    }
)

## Training on the Multinomial (word-count) dataset using MultinomialNB

In [5]:
# Bag-of-words vectorizer: drops English stop words and keeps per-document
# term counts (binary=False) rather than 0/1 presence flags.
cv_counts = CountVectorizer(
    stop_words='english',
    binary=False,
)

In [6]:
# Learn the vocabulary from the complaint texts and build the document-term
# count matrix.
#
# The matrix is deliberately kept in the scipy sparse form returned by
# CountVectorizer instead of being densified with .toarray(): at
# 5000 x 14177 a dense int64 array needs roughly 0.5 GB even though almost
# every entry is zero. train_test_split, MultinomialNB, BernoulliNB and the
# `X_counts > 0` comparison used for the binary dataset all accept sparse
# input, so the rest of the notebook runs unchanged.
X_counts = cv_counts.fit_transform(consumer['Consumer_Complaint'])

In [7]:
# Dimensions of the document-term matrix: (number of complaints, vocabulary size).
X_counts.shape

(5000, 14177)

In [20]:
# Hold out 20% of the complaints for evaluation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_counts,
    consumer['Product'],
    test_size=0.2,
    random_state=123,
)

In [21]:
# Naive Bayes classifier for count features; alpha=1.0 is the library default
# smoothing, spelled out here for the reader.
clf_Multinomial = MultinomialNB(alpha=1.0)

In [22]:
# Fit the count-based model on the training split; the fitted estimator is
# the cell's displayed value.
clf_Multinomial.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Mean accuracy of the count-based model on the data it was fitted on.
print(
    'The Train score for Multinomial is {0}'.format(
        clf_Multinomial.score(X_train, y_train)
    )
)

The Train score for Multinomial is 0.87075


In [24]:
# Mean accuracy of the count-based model on the held-out split.
print(
    'The Test score for Multinomial is {0}'.format(
        clf_Multinomial.score(X_test, y_test)
    )
)

The Test score for Multinomial is 0.745


### Verifying with an example 

In [26]:
# Spot check: vectorize one unseen complaint with the already-fitted vocabulary
# and ask the count-based model for its product label.
clf_Multinomial.predict(
    cv_counts.transform(
        [
            "This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."
        ]
    )
)

array(['Debt collection'], dtype='<U30')

In [45]:
# Second spot check for the count-based model, using a longer complaint about
# a credit report.
clf_Multinomial.predict(
    cv_counts.transform(
        [
            "I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"
        ]
    )
)

array(['Credit reporting'], dtype='<U30')

## Training on the Binary (word-presence) dataset using BernoulliNB

In [34]:
# Presence/absence view of the same features: True wherever a term occurs at
# least once in a complaint, regardless of how many times.
X_counts_binary = (X_counts > 0)

In [35]:
# Same 80/20 split and seed as for the count features, applied to the binary
# features.
# NOTE: this rebinds X_train / X_test / y_train / y_test, so the Multinomial
# scoring cells earlier in the notebook must not be re-run after this cell
# without first re-running the count-based split.
X_train, X_test, y_train, y_test = train_test_split(
    X_counts_binary,
    consumer['Product'],
    test_size=0.2,
    random_state=123,
)

In [36]:
# Naive Bayes classifier for binary (present/absent) features, with the
# library's default settings.
clf_binary = BernoulliNB()

In [37]:
# Fit the Bernoulli model on the binary training split, then report its mean
# accuracy on that same split.
clf_binary.fit(X=X_train, y=y_train)
print(
    'The Train score for Binary is {0}'.format(
        clf_binary.score(X_train, y_train)
    )
)

The Train score for Binary is 0.76625


In [38]:
# Mean accuracy of the Bernoulli model on the held-out binary split.
print(
    'The Test score for Binary is {0}'.format(
        clf_binary.score(X_test, y_test)
    )
)

The Test score for Binary is 0.643


### Verifying with an example 

In [46]:
# Spot check for the Bernoulli model on the same debt complaint used above.
# cv_counts.transform yields raw counts here; BernoulliNB is documented to
# threshold its input at 0 by default (binarize=0.0), which would make this
# equivalent to passing presence flags - TODO confirm for the installed version.
clf_binary.predict(
    cv_counts.transform(
        [
            "This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."
        ]
    )
)

array(['Credit reporting'], dtype='<U30')

In [47]:
# Second spot check for the Bernoulli model, using the same credit-report
# complaint that was given to the count-based model.
clf_binary.predict(
    cv_counts.transform(
        [
            "I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"
        ]
    )
)

array(['Credit reporting'], dtype='<U30')

### Observation: 

> We observe that both the Train and Test accuracy are higher with the Multinomial (word-count) dataset than with the Binary (word-presence) dataset.
It is too early to conclude, but as a first impression it seems possible that the frequency of occurrence of words plays a role in the classification.