# FINANCE COMPLAINT PROJECT

### Feature Engineering & Model Training

##### Importing required libraries

In [2]:
#!pip install plotly
#!pip install nltk
#!pip install scikit-learn
#!pip install xgboost
#!pip install catboost
#!pip install category_encoders
#!pip install imbalanced-learn
!pip install hyperopt




In [3]:
#Basic

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")


# For Text Processing

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


#For Classification Model Selection

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score, roc_curve


# For data-preprocessing 
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders.binary import BinaryEncoder
from imblearn.combine import SMOTETomek

#For Hyperparameter Tuning
from hyperopt import tpe,hp,Trials,space_eval
from hyperopt.fmin import fmin
from hyperopt.pyll import scope


### Import the data from source

In [4]:
import os
os.getcwd()

'd:\\Bala\\Project_Inprogress\\Finance\\notebook'

In [5]:
os.chdir("D:\Bala\Project_Inprogress\Finance")

In [6]:
df = pd.read_parquet("input.parquet")
df.head()

Unnamed: 0,company,company_public_response,company_response,complaint_id,complaint_what_happened,consumer_consent_provided,consumer_disputed,date_received,date_sent_to_company,issue,product,state,sub_issue,sub_product,submitted_via,tags,timely,zip_code
0,Portfolio Recovery Associates,,,ID000000,Yes,Consent not provided,Yes,2023-08-03,2023-01-26,Billing disputes,Retail store card,TX,,Store credit card,Postal mail,,Yes,
1,Fifth Third Bank,,Company believes it acted appropriately,ID000001,Yes,Consent not provided,No,2024-01-09,2024-06-13,Payment processing issues,Auto loan,PA,Improper reporting,,Phone,Unresolved complaint,Yes,
2,Bank of America,,,ID000002,Yes,Consent not provided,No,2023-06-14,2022-05-13,Unauthorized charges,Insurance,CO,,,Web,,Yes,
3,Chase Bank,,,ID000003,Yes,Consent not provided,No,2022-02-13,2024-08-31,Payment processing issues,Personal loan,WA,,Other bank product/service,Web,,Yes,12468.0
4,American Express Company,,,ID000004,Yes,Consent not provided,Yes,2023-10-03,2022-02-26,Service not received,Auto loan,TX,Application denied,General-purpose credit card,Web,,No,


### As per Final EDA report we can remove some features

In [7]:
missing = df.isnull().sum().div(df.shape[0]).mul(100).to_frame().sort_values(by=0,ascending=False)
missing[:8]

Unnamed: 0,0
sub_issue,78.0828
sub_product,78.0524
tags,77.3056
company_response,77.1952
company_public_response,77.1548
zip_code,75.002
submitted_via,4.9192
product,0.0


In [8]:
drop_columns = ['tags', 'complaint_what_happened', 'company_public_response', 'sub_issue', 'sub_product', 'zip_code', 'complaint_id', 'company']
df.drop(columns=drop_columns, inplace=True)

In [9]:
missing = df.isnull().sum().div(df.shape[0]).mul(100).to_frame().sort_values(by=0,ascending=False)
missing[:8]

Unnamed: 0,0
company_response,77.1952
submitted_via,4.9192
consumer_consent_provided,0.0
consumer_disputed,0.0
date_received,0.0
date_sent_to_company,0.0
issue,0.0
product,0.0


# Feature Extraction

In [10]:
df[['date_received','date_sent_to_company']].head(3)

Unnamed: 0,date_received,date_sent_to_company
0,2023-08-03,2023-01-26
1,2024-01-09,2024-06-13
2,2023-06-14,2022-05-13


In [11]:
df['days_to_forward_complaint']=pd.to_datetime(df['date_sent_to_company'])-pd.to_datetime(df['date_received'])

df['days_to_forward_complaint'] = df['days_to_forward_complaint'].dt.days

In [12]:
df.drop(['date_received','date_sent_to_company'],axis=1,inplace=True)

### For Model to reduce computation time we can use sample of the data for model

In [13]:
#sample data

df1 = df.groupby("consumer_disputed").sample(n=50000)
df1.reset_index(inplace=True)
df1.head()

Unnamed: 0,index,company_response,consumer_consent_provided,consumer_disputed,issue,product,state,submitted_via,timely,days_to_forward_complaint
0,92142,Investigation ongoing,Consent not provided,No,Service not received,Mortgage,OR,Web,Yes,560
1,31566,Closed with monetary relief,Consent not provided,No,Problem with a credit reporting company's inve...,Credit reporting,NC,Web,Yes,682
2,239128,,Consent provided,No,Identity theft issues,Student loan,VA,Web,Yes,-611
3,97276,,Consent not provided,No,Loan application denied,Credit reporting,AZ,Web,No,512
4,100792,,Consent not provided,No,Unauthorized charges,Credit reporting,IN,Postal mail,No,804


# Text Processing

### For Vectorization
1. TFIDF -> term frequency - inverse document frequency
2. CountVectorizer
3. NLTK/Scipy Library
4. Pretrained Glove
 
 -> here we can use TFIDF to process

### Steps for text preprocessing
1. Remove Punctuation
2. Remove Stop Words
3. Lower Case
4. Tokenization
5. Stemming/Lemmatization

For this "issue" column has text which needs to be processed

### Create a list of Stop words which has to be removed

In [14]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
stopwords_list = stopwords.words('english') + list(string.punctuation)

### Create function to tokenize and lemmatize text column

In [16]:
# Function to tokenize data and remove stopwords

def process_text(issue):

    #create tokens
    tokens = nltk.word_tokenize(issue)

    #remove common stopwords
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]

    #remove stopwords including few punctuation
    stopwords_removed = [word for word in stopwords_removed if word.isalpha() ]
    
    return stopwords_removed

# Concat the strings
def concat_the_strings(words_list):
    concat_words = ""
    for word in words_list:
        concat_words+= word + ' '

    return concat_words

# function to lemmatize words and merge eeach complaint into a single space-separated string
lemm = WordNetLemmatizer()

def lemmatizer_concat(words_list):
    #remove any NaN's
    list_of_words = [i for i in words_list if i is not np.nan]

    # lemmatize each word
    lemmatized_list = []
    for idx, word in enumerate(words_list):
        lemmatized_list.append(lemm.lemmatize(word))

    # make the list into a single string with the words separated by ' '
    final_string = concat_the_strings(lemmatized_list)

    return final_string    

### Prepare data with Text processing

In [17]:
# NLTK stopwords pre requisite import

import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\balaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [18]:
for i in range(len(df1)):
    text = process_text(df1['issue'].loc[i])
    final_texts = lemmatizer_concat(text)
    df1['issue'].loc[i] = final_texts
    if i % 5000 == 0:
        print(f'Processed Row Number {i}')

Processed Row Number 0
Processed Row Number 5000
Processed Row Number 10000
Processed Row Number 15000
Processed Row Number 20000
Processed Row Number 25000
Processed Row Number 30000
Processed Row Number 35000
Processed Row Number 40000
Processed Row Number 45000
Processed Row Number 50000
Processed Row Number 55000
Processed Row Number 60000
Processed Row Number 65000
Processed Row Number 70000
Processed Row Number 75000
Processed Row Number 80000
Processed Row Number 85000
Processed Row Number 90000
Processed Row Number 95000


### Vectorize the processed texts

In [19]:
tfidv = TfidfVectorizer(max_features=None, strip_accents='unicode',analyzer='word',ngram_range=(1,2))

#Get data after vectorizing issue column
df_vect = tfidv.fit_transform(df1['issue'])

feature_names = tfidv.get_feature_names_out()

# Data Preprocessing

Concat old data with vectorized data from issue text column

In [20]:
df1 = pd.concat([df1, pd.DataFrame(df_vect.toarray())], axis=1)

### After processing issue column as vectors, Now issue column can be removed

In [21]:
df1.drop(['issue', 'index'], axis=1, inplace=True)

In [22]:
from sklearn.model_selection import train_test_split 
X = df1.drop(['consumer_disputed'], axis=1)
y = df1['consumer_disputed']

In [23]:
#check shape of Train data
X.shape

(100000, 62)

### Initialize features for transformation

In [24]:
# for binary encoder
binary_features = ['product', 'state', 'submitted_via', 'company_response'] 
# for onehot encoder
onhot_features = ['consumer_consent_provided', 'timely', 'state']

In [27]:
df1.head()

Unnamed: 0,company_response,consumer_consent_provided,consumer_disputed,product,state,submitted_via,timely,days_to_forward_complaint,0,1,...,45,46,47,48,49,50,51,52,53,54
0,Investigation ongoing,Consent not provided,No,Mortgage,OR,Web,Yes,560,0.0,0.0,...,0.57735,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0
1,Closed with monetary relief,Consent not provided,No,Credit reporting,NC,Web,Yes,682,0.0,0.0,...,0.0,0.0,0.350825,0.350825,0.0,0.0,0.0,0.0,0.0,0.0
2,,Consent provided,No,Student loan,VA,Web,Yes,-611,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.472621,0.472621,0.0,0.0
3,,Consent not provided,No,Credit reporting,AZ,Web,No,512,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,Consent not provided,No,Credit reporting,IN,Postal mail,No,804,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735


### Create columntrasnformer for transformation

In [None]:
onehot_encoder_pipeline = Pipeline(steps=[
    ('SimpleImputer', SimpleImputer (strategy = 'most_frequent')), 
    ('OneHot_encoder', OneHotEncoder())
])

binary_encoder_pipeline = Pipeline(steps=[
    ('SimpleImputer', SimpleImputer (strategy = 'most_frequent')), 
    ('BinaryEncoder', BinaryEncoder())
])

# getting data pre processor object
preprocessor = ColumnTransformer(
    [
        ("Categorical_Pipeline", onehot_encoder_pipeline, onhot_features), 
        ("Binary_encoder_pipeline", binary_encoder_pipeline, binary_features), 
        ("Numeric_Pipeline", RobustScaler (), numerical_feature)
    ],
    remainder='passthrough'
)

### Transforming the data for Modelling

In [None]:
#Fit transform the train data

X = preprocessor.fit_transform(X)

### Manually Encoding Target Feature

In [None]:
### Manually encoding "Yes" as 0 and "No" as 1
y = np.where(y.values == 'Yes', 0,1 )

# Handling Imbalanced Dataset

### Handling Imbalanced Dataset Handling Imbalanced Target Variable.

 • Synthetic Minority Oversampling Technique or SMOTE is another technique to oversample the minority class.Simply adding duplicate records of minority class often don't add any new information to the model

 • SMOTE is one of the famous oversampling techniques and is very effective in handling class imbalance. The idea is to combine SMOTE with some undersampling techniques (ENN, Tomek) to increase the effectiveness of handling the imbalanced class.

In [None]:
#Resampling the minority class. The strategy can be changed as required. 
smt = SMOTETomek (random_state=42, sampling_strategy='minority', n_jobs=-1) 

# Fit the model to generate the data. 
X_res, y_res = smt.fit_resample(X, y)

# Model Selection

Here should understand the Various Classification models with default values from these models we can choose top 4 with Highest Accuracy score and proceed with HyperParameter Tuning

In [28]:
# Function which returns all evaluation metrics for classification model

def evaluate_clf(true, predicted):
    acc = accuracy_score(true, predicted) # Calculate Accuracy
    f1 = f1_score(true, predicted) # Calculate F1-score
    precision = precision_score(true, predicted) # Calculate Precision
    recall = recall_score(true, predicted) # Calculate Recall
    roc_auc = roc_auc_score(true, predicted) #Calculate Roc
    return acc, f1 , precision, recall, roc_auc

In [29]:
# Initialize models which are required for model selection

models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(), "XGBClassifier": XGBClassifier(),
    #"CatBoosting Classifier": CatBoostClassifier(verbose=False), 
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [31]:
# Create a function which can evaluate models and return a report in Dataframe

def evaluate_models (x, y, models):
    '''
    This function takes in X and y and models dictionary as input
    It splits the data into Train Test split
    Iterates through the given model dictionary and evaluates the metrics 
    Returns: Dataframe which contains report of all models metrics with cost
    '''
    # separate dataset into train and test X_train, X_test, y_train, y_test
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

    models_list = []
    accuracy_list = []
    auc = []
    
    for i in range(len(list (models))):
        model = list(models.values()) [i]
        model.fit(X_train, y_train) # Train model
        
        # Make predictions
        y_train_pred=model.predict(X_train)
        y_test_pred=model.predict(X_test)
        
        # Training set performance
        model_train_accuracy, model_train_f1, model_train_precision, \
        model_train_recall, model_train_rocauc_score = evaluate_clf(y_train,y_train_pred)
        
        # Test set performance
        model_test_accuracy, model_test_f1, model_test_precision, \
        model_test_recall, model_test_rocauc_score = evaluate_clf(y_test,y_test_pred)

        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print('Model Performance for training Set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1))
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

        print('---------------------')

        print('Model Performance for test Set')
        print("- Accuracy: {:.4f}".format(model_test_accuracy))
        accuracy_list.append(model_test_accuracy)
        print('- F1 score: {:.4f}'.format(model_test_f1))
        print('- Precision: {:.4f}'.format(model_test_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
        auc.append(model_test_rocauc_score)
        print('='*35)
        print('\n')

    report = pd.DataFrame(list(zip(models_list, accuracy_list)),
                          columns=['Model Name', 'Accuracy']).sort_values(by=['Accuracy'],ascending=False)
    
    return report

### Base report of all models  with default parameters

In [None]:
base_report = evaluate_models(X=X_res, y=y_res, models=models)

# Report in DataFrame

base_report


Here we can use CatBoost Classifier, XGBClassifier for Hyper Parameter Tuning

## Hyperopt: Distributed Hyperparameter Optimization
    • Hyperopt is a powerful python library for hyperparameter optimization developed by Jarnes Bergstra. Hyperopt uses a form of Bayesian optimization for parameter tuning that allows you to get the best parameters for a given model.
    • Grid Search is exhaustive in case of Resources usage.
    • Random Search, is random, so could miss the most important values. However, there is a superior method available through the Hyperopt package

## Search space is where Hyperopt really gives you a many of sampling options:
    • for categorical parameters you have hp.choice
    • for integers you get hp.randit, hp.quniform, hp.qloguniform and hp.qlognormal
    • for floats we have hp.normal, hp.uniform, hp.lognormal and hp.loguniform
    • It is the most extensive sampling functionality out there.

You define your search space before you run optimization but you can create very complex parameter spaces:

# Hyperparameter Tuning for XGBoost Model

This is a function to minimize that receives hyperparameters values as input from the search space and returns the loss

In [32]:
# Create an objective function for hyperopt
def XGB_objective(params):
    model = XGBClassifier(**params, n_jobs=1)
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res,test_size=0.2, random_state=42)
    model = model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [None]:
# Define the search phase

search_phase = {'max_depth': hp.quniform("max_depth", 3, 10, 1),
                'gamma': hp.uniform('gamma',1,9),
                'colsample_bytree': hp.uniform('colsample_bytree',0,5,1),
                'min_child_weight': hp.quniform('min_child_weight',0,10,1),
                'n_estimators': 180,
                'seed': 0
                }

xgb_trials = Trials()

#using Fmin function to get best xgb_objective

best_xgb = fmin(
    fn = XGB_objective,
    space = search_phase,
    algo = tpe.suggest,
    trials=xgb_trials,
    max_evals=10,
    rstate=np.random.default_rng()
)

# Hyperparameter Tuning for CatBoost Model

In [33]:
# Create an objective function for hyperopt
def CatBoost_objective(params):
    model = CatBoostClassifier(**params, verbose=False, thread_count=-1)
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res,test_size=0.2, random_state=42)
    model = model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [None]:
# Define the search phase

search_phase = {
    'learning_rate': hp.uniform('learning_rate',0.001,1),
    'depth': scope.int(hp.quniform('max_depth',2,10,1)),
    'iterations': scope.int(hp.quniform('iterations',50,1000,50)),
    'l2_leaf_reg': scope.int(hp.quniform('l2_leaf_reg',1,50,1))
}

cat_trials = Trials()

#using Fmin function to get best xgb_objective

best_cat = fmin(
    fn = CatBoost_objective,
    space = search_phase,
    algo = tpe.suggest,
    trials=cat_trials,
    max_evals=10,
    rstate=np.random.default_rng()
)

### Best Parameters for CatBoost Model

In [None]:
best_cat

# FINAL MODEL

In [None]:
model = CatBoostClassifier(**params, verbose=False, thread_count=-1)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res,test_size=0.2, random_state=42)

model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

matrix = confusion_matrix(y_test,y_pred)
cm = ConfusionMatrixDisplay(matrix)