In [124]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [107]:
df = pd.read_csv('complaints.csv')
df.shape

(162421, 3)

df_label = df.Label
df_Text = df.Text

df_1 = pd.concat([df_label, df_Text], axis = 1)

df_1.head()

In [108]:
df['product'].value_counts()

product
credit_reporting       91179
debt_collection        23150
mortgages_and_loans    18990
credit_card            15566
retail_banking         13536
Name: count, dtype: int64

In [109]:
min_value = 13536

credit_reporting = df[df['product'] == 'credit_reporting'].sample(min_value, random_state = 150)
debt_collection = df[df['product'] == 'debt_collection'].sample(min_value, random_state = 150)
mortgages_and_loans = df[df['product'] == 'mortgages_and_loans'].sample(min_value, random_state = 150)
credit_card = df[df['product'] == 'credit_card'].sample(min_value, random_state = 150)
retail_banking = df[df['product'] == 'retail_banking'].sample(min_value, random_state = 150)

In [110]:
df = pd.concat([credit_reporting, debt_collection, mortgages_and_loans, credit_card, retail_banking], axis = 0)
df['product'].value_counts()

product
credit_reporting       13536
debt_collection        13536
mortgages_and_loans    13536
credit_card            13536
retail_banking         13536
Name: count, dtype: int64

In [111]:
df.shape

(67680, 3)

In [112]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,product,narrative
2870,2870,retail_banking,account closed chase via phone account service...
79989,79989,retail_banking,customer bank switched pnc investing wanted pn...
121384,121384,credit_card,received usbank reliacard upon research indica...
33285,33285,mortgages_and_loans,writing regarding timely processing disburseme...
149784,149784,retail_banking,happened bank purchased bunk bed child receive...
122664,122664,credit_reporting,request free annual credit report believe cons...
17164,17164,credit_reporting,easter account service reporting fraudulent ac...
98505,98505,credit_card,separated continued share finance assist raisi...
1734,1734,credit_reporting,debt collection lvnv funding llc dated appeare...
146402,146402,retail_banking,trying connect bank account easily able cash m...


In [113]:
df = df[['product', 'narrative']]

In [114]:
df.head()

Unnamed: 0,product,narrative
52684,credit_reporting,trying get updated info year long one item cre...
160635,credit_reporting,mi last name date birth transunion consumer so...
124138,credit_reporting,since battled credit bureau remove correct sta...
153887,credit_reporting,transunion reporting day late partial account ...
160785,credit_reporting,xxxxdispute xxxxdispute xxxxdispute xxxxdisput...


In [115]:
df['product_map_value'] = df['product'].map({
    'credit_reporting' : 0,       
    'debt_collection' : 1,     
    'mortgages_and_loans' : 2,    
    'credit_card' : 3,  
    'retail_banking' : 4,
})

In [116]:
df.isna().sum()

product              0
narrative            5
product_map_value    0
dtype: int64

In [117]:
df = df.dropna()

In [118]:
df.isna().sum()

product              0
narrative            0
product_map_value    0
dtype: int64

In [119]:
df.sample(10)

Unnamed: 0,product,narrative,product_map_value
40650,retail_banking,explain right government grant scam authorized...,4
111046,credit_card,made purchase paypal using mastercard debit ca...,3
139087,retail_banking,sba loan thats granted told sba agent use payp...,4
137594,credit_card,complaint regard chase closing credit account ...,3
13579,credit_card,credit card continues bill merchandise purchas...,3
92604,debt_collection,ive victim identity theft since paying debt is...,1
70915,debt_collection,received letter chase credit card account endi...,1
6226,mortgages_and_loans,american credit acceptance make extremely hard...,2
130730,retail_banking,previous choice system applicable happened spo...,4
80124,retail_banking,victim identity theft checking account fraudul...,4


In [152]:
X_train, X_test, Y_train, Y_test = train_test_split(df.narrative, df.product_map_value, 
                                                    test_size = 0.2, 
                                                    random_state = 150, 
                                                    stratify=df.product_map_value,
                                                    shuffle = True)

In [153]:
print(f'X_train Shape:- {X_train.shape}')
print(f'X_test Shape:- {X_test.shape}')

X_train Shape:- (54140,)
X_test Shape:- (13535,)


In [154]:
pipe = Pipeline([
    ('TF', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

In [155]:
pipe.fit(X_train, Y_train)

Pipeline(steps=[('TF', TfidfVectorizer()), ('KNN', KNeighborsClassifier())])

In [156]:
Y_pred = pipe.predict(X_test)

print(classification_report(Y_pred, Y_test))

              precision    recall  f1-score   support

           0       0.84      0.73      0.78      3122
           1       0.72      0.81      0.76      2429
           2       0.79      0.82      0.80      2580
           3       0.76      0.75      0.76      2742
           4       0.82      0.83      0.82      2662

    accuracy                           0.78     13535
   macro avg       0.78      0.79      0.78     13535
weighted avg       0.79      0.78      0.78     13535



In [157]:
X_test[:10]

10178              ca connect also used work like month ago
34096     writing today regard account kia motor finance...
101143    name date birth social reached regard account ...
55606     filed dispute regard incorrect item credit rep...
55012     disputed account reached many time failed acco...
29675     busy schedule noticed certain debit charge acc...
113693    received credit monitoring alert collection ag...
144766    whomever may concern first foremost submitting...
90951     received offer upgrade hsbc choice checking ac...
93122     using month since use feature called take rema...
Name: narrative, dtype: object

In [158]:
print(Y_test[:10])
print('')
print('''credit_reporting' : 0 | debt_collection' : 1 | mortgages_and_loans' : 2 | credit_card' : 3 | retail_banking' : 4,''')

10178     4
34096     2
101143    1
55606     0
55012     1
29675     4
113693    1
144766    1
90951     4
93122     4
Name: product_map_value, dtype: int64

credit_reporting' : 0 | debt_collection' : 1 | mortgages_and_loans' : 2 | credit_card' : 3 | retail_banking' : 4,


In [159]:
Y_pred[:10]

array([4, 3, 1, 0, 1, 3, 1, 1, 4, 4], dtype=int64)