In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [3]:
# Load the complaints dataset from data.csv in the working directory.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df = pd.read_csv("data.csv",low_memory=False)

In [4]:
# Preprocess: replace missing issue/sub_issue values with empty strings so the
# two columns can later be concatenated into a single text feature.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form can
# operate on a temporary copy and leave `df` unchanged (pandas warns about it and
# it has no effect under Copy-on-Write).
df['issue'] = df['issue'].fillna('')
df['sub_issue'] = df['sub_issue'].fillna('')

In [5]:
# Display the frame to inspect it after the missing-value fill.
df

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id,Category
0,12-05-2014,Debt collection,Mortgage,Disclosure verification of debt,,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30005,,,Referral,12-12-2014,Untimely response,No,No,1144671,0
1,11-10-2014,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",DE,19803,,,Referral,11/19/2014,Untimely response,No,No,1109287,1
2,08/26/2015,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30014,,,Referral,09-08-2015,Untimely response,No,No,1536776,1
3,01/16/2014,Debt collection,Mortgage,Disclosure verification of debt,,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30087,,,Referral,02-11-2014,Untimely response,No,No,671539,0
4,06/25/2015,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,My mortgage company has misrepresented themsel...,,"1st 2nd Mortgage Company Of NJ, Inc.",NJ,074XX,,Consent provided,Web,07/22/2015,Closed,Yes,No,1437506,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555952,01/26/2014,Debt collection,Non-federal student loan,Improper contact or sharing of info,,,,Zwicker & Associates,MN,55428,,,Web,01/27/2014,Closed with non-monetary relief,Yes,No,685904,0
555953,01/26/2016,Debt collection,Non-federal student loan,Cont'd attempts collect debt not owed,,,,Zwicker & Associates,NJ,070XX,Older American,Consent provided,Web,02-10-2016,Closed with non-monetary relief,Yes,No,1759548,0
555954,03/31/2016,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,,,,Zwicker & Associates,FL,33837,,,Referral,04-04-2016,Closed with explanation,Yes,No,1859430,0
555955,10/13/2015,Debt collection,Credit card,Disclosure verification of debt,,,,Zwicker & Associates,FL,33308,,,Phone,10/13/2015,Closed with non-monetary relief,Yes,No,1603745,0


In [6]:
# List the distinct product names; these become the target classes below.
df["product"].unique()

array(['Debt collection', 'Mortgage', 'Consumer Loan',
       'Bank account or service', 'Credit reporting', 'Payday loan',
       'Other financial service', 'Student loan', 'Money transfers',
       'Prepaid card', 'Credit card'], dtype=object)

In [7]:
# Encode every product name as an integer label, numbered in the order the
# names first appear in the `product` column, and store it in `Category`.
category_mapping = dict(
    (product_name, label)
    for label, product_name in enumerate(df['product'].unique())
)
df['Category'] = df['product'].map(category_mapping)

In [8]:
# Write the current frame to Clean_category_product.csv without the row index.
df.to_csv("Clean_category_product.csv",index=False)

In [9]:
# Target: the integer product label. Feature: issue and sub_issue text joined
# by a single space. 20% of the rows are held out for evaluation; the seed is
# fixed so the split is repeatable.
y = df['Category']
X = df['issue'] + ' ' + df['sub_issue']
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:

# NOTE(review): this instance is never fitted; the next cell creates a fresh
# TfidfVectorizer under the same name before fitting, so this cell is redundant.
vectorizer = TfidfVectorizer()

In [11]:
# Convert the text into numerical TF-IDF features.
# The vectorizer is fitted on the training text only; the evaluation text is
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_eval_vectorized = vectorizer.transform(X_eval)

In [12]:
import pickle

In [12]:
# Persist the fitted vectorizer to vectorizer.pkl.
# The context manager guarantees the file is flushed and closed; passing a bare
# open(...) straight to pickle.dump leaves the handle open.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [28]:
# Show the feature names (vocabulary terms) of the fitted vectorizer.
vectorizer.get_feature_names_out()

array(['8am', '9pm', 'about', 'abusive', 'account', 'acct', 'action',
       'adding', 'advance', 'advertising', 'after', 'agree', 'alerts',
       'amount', 'amt', 'an', 'and', 'annual', 'application', 'applied',
       'apply', 'apr', 'arbitration', 'are', 'arrest', 'as', 'asked',
       'atm', 'attempt', 'attempted', 'attempts', 'attorney', 'available',
       'bad', 'balance', 'bank', 'bankruptcy', 'being', 'better',
       'billing', 'broker', 'by', 'called', 'calls', 'can', 'cancelling',
       'card', 'cash', 'caused', 'cease', 'changes', 'charged', 'charges',
       'check', 'checks', 'closing', 'collect', 'collected', 'collection',
       'comm', 'committed', 'communication', 'company', 'consent', 'cont',
       'contact', 'contacted', 'convenience', 'costs', 'crc', 'credit',
       'credited', 'crime', 'customer', 'damaged', 'day', 'dealing',
       'debit', 'debt', 'decision', 'decrease', 'delay', 'deleted',
       'delinquent', 'deposits', 'destroyed', 'determination', 'did

In [13]:
# Train a logistic-regression classifier on the TF-IDF training features.
# max_iter is raised to 1000 (scikit-learn's default is 100) — presumably so the
# solver has room to converge; TODO confirm.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

In [14]:
# Predict a category label for every row of the evaluation set
y_pred = model.predict(X_eval_vectorized)

In [15]:
# Peek at the predicted labels.
y_pred

array([ 1,  3,  1, ..., 10,  0,  1], dtype=int64)

In [None]:
# Evaluate the model: compare the predictions in y_pred against the held-out
# labels in y_eval and print the resulting accuracy score.
accuracy = accuracy_score(y_eval, y_pred)
print(f"Accuracy: {accuracy}")

In [5]:
def predict_category(Issue, Subissue):
    """Predict the product category for an issue / sub-issue pair.

    The two strings are joined with a single space, passed through
    ``vectorizer.transform`` and classified with ``model.predict``. The
    predicted label is then translated back to its category name through
    ``category_mapping``.

    This function reads the notebook-level names ``vectorizer``, ``model`` and
    ``category_mapping``; all three must exist in the kernel before it is
    called (a ``NameError`` is raised otherwise).

    Args:
        Issue: Issue text. ``None`` is treated as an empty string.
        Subissue: Sub-issue text. ``None`` is treated as an empty string.

    Returns:
        The first key of ``category_mapping`` whose value equals the predicted
        label.

    Raises:
        IndexError: If no entry of ``category_mapping`` matches the predicted
            label.
    """
    # Missing text is treated as empty instead of failing on None + str.
    example_text = (Issue or "") + " " + (Subissue or "")

    example_text_vectorized = vectorizer.transform([example_text])
    predicted_label = model.predict(example_text_vectorized)[0]

    # Reverse lookup that stops at the first match instead of building a list.
    for category, index in category_mapping.items():
        if index == predicted_label:
            return category
    raise IndexError(
        f"predicted label {predicted_label!r} has no entry in category_mapping"
    )

In [None]:
# Example issue / sub-issue text — presumably meant as arguments for
# predict_category (no call is shown in this notebook).
Issue = "disclosure verification of debt"
Subissue = "not given enough info to verify debt"

In [1]:
import pickle

In [2]:
# Persist the trained classifier to model.pkl.
# The context manager closes the file handle, which a bare open(...) passed
# straight to pickle.dump does not.
# NOTE(review): `model` must already exist in the kernel (i.e. the training
# cells must have been run in this session) or this raises NameError.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

NameError: name 'model' is not defined

In [3]:
# Restore the trained classifier from model.pkl; the context manager closes
# the file handle that a bare open(...) would leave open.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load a model.pkl that you produced yourself.
# NOTE(review): predict_category also reads `vectorizer` and `category_mapping`,
# which this cell does not restore in a fresh kernel.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)



In [4]:
import gradio as gr

In [10]:


# Wrap predict_category in a Gradio form with two text boxes (issue and
# sub-issue), a text output and two clickable examples, then launch it.
demo = gr.Interface(
    fn=predict_category,
    title="Predict Category Using Issue and Subissue",
    inputs=[
        gr.Textbox(lines=2, placeholder="Please Enter Your Issue"),
        gr.Textbox(lines=2, placeholder="Please Enter Your SubIssue"),
    ],
    outputs="text",
    examples=[
        ["My money is debited but not received", "it happened 2 hours ago"],
        ["I want to go USA and I need money", ""],
    ],
    allow_flagging="never",
)

demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\Chandrashekhar\.conda\envs\tf-gpu\lib\site-packages\gradio\routes.py", line 394, in run_predict
    output = await app.get_blocks().process_api(
  File "c:\Users\Chandrashekhar\.conda\envs\tf-gpu\lib\site-packages\gradio\blocks.py", line 1075, in process_api
    result = await self.call_function(
  File "c:\Users\Chandrashekhar\.conda\envs\tf-gpu\lib\site-packages\gradio\blocks.py", line 884, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "c:\Users\Chandrashekhar\.conda\envs\tf-gpu\lib\site-packages\anyio\to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "c:\Users\Chandrashekhar\.conda\envs\tf-gpu\lib\site-packages\anyio\_backends\_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "c:\Users\Chandrashekhar\.conda\envs\tf-gpu\lib\site-packages\anyio\_backends\_asyncio.py", line 807, in run
    result = context.run(func, *args)


In [None]:
# Train a random-forest classifier on the same TF-IDF training features as the
# logistic-regression model.
# random_state is fixed to the same seed used for train_test_split: the forest
# is built with randomized sampling, so without a seed the fitted model and its
# accuracy differ from run to run.
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train_vectorized, y_train)

In [None]:
# Predict on the evaluation set with the random forest.
# NOTE(review): this rebinds y_pred, replacing the logistic-regression predictions.
y_pred = model2.predict(X_eval_vectorized)

In [None]:
# Accuracy of whatever predictions y_pred currently holds against the held-out labels.
accuracy_score(y_eval, y_pred)

In [None]:
def predict_category_using_RFC(Issue, Subissue):
    """Predict the product category for an issue / sub-issue pair with ``model2``.

    The two strings are joined with a single space, passed through
    ``vectorizer.transform`` and classified with ``model2.predict``. The
    predicted label is then translated back to its category name through
    ``category_mapping``.

    This function reads the notebook-level names ``vectorizer``, ``model2`` and
    ``category_mapping``; all three must exist in the kernel before it is
    called (a ``NameError`` is raised otherwise).

    Args:
        Issue: Issue text. ``None`` is treated as an empty string.
        Subissue: Sub-issue text. ``None`` is treated as an empty string.

    Returns:
        The first key of ``category_mapping`` whose value equals the predicted
        label.

    Raises:
        IndexError: If no entry of ``category_mapping`` matches the predicted
            label.
    """
    # Missing text is treated as empty instead of failing on None + str.
    example_text = (Issue or "") + " " + (Subissue or "")

    example_text_vectorized = vectorizer.transform([example_text])
    predicted_label = model2.predict(example_text_vectorized)[0]

    # Reverse lookup that stops at the first match instead of building a list.
    for category, index in category_mapping.items():
        if index == predicted_label:
            return category
    raise IndexError(
        f"predicted label {predicted_label!r} has no entry in category_mapping"
    )

# use new model and then predict

In [16]:
# NOTE(review): `LAYERS` is not assigned anywhere in the visible notebook, so
# this cell raises NameError; it looks like a leftover and is a candidate for removal.
LAYERS

NameError: name 'LAYERS' is not defined