In [6]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
data = pd.read_csv('DataForPipeline.csv')

In [4]:
data.isnull().sum().sum()

5

In [5]:
cat = data.select_dtypes(exclude = ['int', 'float']).columns

In [8]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for col in cat:
    data[col] = encoder.fit_transform(data[col]) 

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd

In [10]:
sns.set()
import warnings
warnings.filterwarnings("ignore")

In [11]:
data.dropna(inplace=True)

In [12]:
data.isnull().sum().sum()

0

In [13]:
X = data.drop(columns=['Status', 'LoanTenure', 'EMI', 'ELA'])
y_class = data['Status']
y_reg1 = data['LoanTenure']
y_reg2 = data['EMI']
y_reg3 = data['ELA']

In [14]:
X_train, X_test, y_class_train, y_class_test = train_test_split(X, y_class, test_size=0.2, random_state=42)
y_reg1_train, y_reg1_test = train_test_split(y_reg1, test_size=0.2, random_state=42)
y_reg2_train, y_reg2_test = train_test_split(y_reg2, test_size=0.2, random_state=42)
y_reg3_train, y_reg3_test = train_test_split(y_reg3, test_size=0.2, random_state=42)

In [15]:
# Train RandomForestClassifier for classification
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_class_train)

In [16]:
# Train RandomForestRegressor for LoanTenure
reg1 = RandomForestRegressor(random_state=42)
reg1.fit(X_train, y_reg1_train)

In [17]:
# Train RandomForestRegressor for EMI
reg2 = RandomForestRegressor(random_state=42)
reg2.fit(X_train, y_reg2_train)

In [18]:
# Train RandomForestRegressor for ELA
reg3 = RandomForestRegressor(random_state=42)
reg3.fit(X_train, y_reg3_train)

In [19]:
importances_clf = clf.feature_importances_

In [20]:
importances_reg1 = reg1.feature_importances_

In [21]:
importances_reg2 = reg2.feature_importances_

In [22]:
importances_reg3 = reg3.feature_importances_

In [23]:
top_feature_df_clf = pd.DataFrame({'Feature': X.columns, 'Importance': importances_clf})
top_feature_df_reg1 = pd.DataFrame({'Feature': X.columns, 'Importance': importances_reg1})
top_feature_df_reg2 = pd.DataFrame({'Feature': X.columns, 'Importance': importances_reg2})
top_feature_df_reg3 = pd.DataFrame({'Feature': X.columns, 'Importance': importances_reg3})

In [24]:
print("Classification Feature Importances:")
print(top_feature_df_clf .sort_values(by='Importance', ascending=False).head(10)['Feature'].values.tolist())

print("\nLoanTenure Feature Importances:")
print(top_feature_df_reg1.sort_values(by='Importance', ascending=False).head(10)['Feature'].values.tolist())

print("\nEMI Feature Importances:")
print(top_feature_df_reg2.sort_values(by='Importance', ascending=False).head(10)['Feature'].values.tolist())

print("\nELA Feature Importances:")
print(top_feature_df_reg3.sort_values(by='Importance', ascending=False).head(10)['Feature'].values.tolist())

Classification Feature Importances:
['LoanCurrentDaysDelinquent', 'LP_CustomerPrincipalPayments', 'LP_CustomerPayments', 'LP_NonPrincipalRecoverypayments', 'DateCreditPulled', 'LoanNumber', 'LoanMonthsSinceOrigination', 'ListingNumber', 'LP_CollectionFees', 'MonthlyLoanPayment']

LoanTenure Feature Importances:
['LoanOriginalAmount', 'MonthlyLoanPayment', 'EstimatedReturn', 'LoanNumber', 'DateCreditPulled', 'LP_ServiceFees', 'BorrowerAPR', 'LP_CustomerPrincipalPayments', 'ListingNumber', 'LenderYield']

EMI Feature Importances:
['LP_CustomerPrincipalPayments', 'LenderYield', 'BorrowerRate', 'LP_CustomerPayments', 'BorrowerAPR', 'LoanOriginalAmount', 'LP_ServiceFees', 'EstimatedReturn', 'MonthlyLoanPayment', 'EstimatedEffectiveYield']

ELA Feature Importances:
['LoanOriginalAmount', 'MonthlyLoanPayment', 'BorrowerRate', 'LenderYield', 'EstimatedReturn', 'StatedMonthlyIncome', 'EstimatedEffectiveYield', 'BorrowerAPR', 'DateCreditPulled', 'LP_CustomerPrincipalPayments']


In [25]:
top_features_clf = [
    'LoanCurrentDaysDelinquent', 'LoanStatus', 'LP_NonPrincipalRecoverypayments', 
    'LP_CustomerPrincipalPayments', 'LP_CustomerPayments', 'LoanMonthsSinceOrigination', 
    'ListingNumber', 'LoanNumber', 'DateCreditPulled', 'LP_CollectionFees'
]

top_features_reg1 = [
    'LoanOriginalAmount', 'MonthlyLoanPayment', 'EstimatedReturn', 'LoanNumber', 
    'DateCreditPulled', 'LP_ServiceFees', 'BorrowerAPR', 'LP_CustomerPrincipalPayments', 
    'ListingNumber', 'EstimatedEffectiveYield'
]

top_features_reg2 = [
    'LP_CustomerPrincipalPayments', 'LenderYield', 'BorrowerRate', 'LP_CustomerPayments', 
    'BorrowerAPR', 'LoanOriginalAmount', 'EstimatedReturn', 'LP_ServiceFees', 
    'MonthlyLoanPayment', 'EstimatedEffectiveYield'
]

top_features_reg3 = [
    'LoanOriginalAmount', 'MonthlyLoanPayment', 'BorrowerRate', 'LenderYield', 
    'EstimatedReturn', 'StatedMonthlyIncome', 'EstimatedEffectiveYield', 'BorrowerAPR', 
    'DateCreditPulled', 'LP_CustomerPrincipalPayments'
]

# Combine all important features into a set to avoid duplicates
important_features = set(top_features_clf + top_features_reg1 + top_features_reg2 + top_features_reg3)

In [26]:
important_features = list(important_features)

In [27]:
print(important_features)

['LP_NonPrincipalRecoverypayments', 'StatedMonthlyIncome', 'LoanCurrentDaysDelinquent', 'BorrowerRate', 'LP_CustomerPrincipalPayments', 'LP_CustomerPayments', 'LP_CollectionFees', 'MonthlyLoanPayment', 'DateCreditPulled', 'LenderYield', 'LoanNumber', 'EstimatedReturn', 'LP_ServiceFees', 'BorrowerAPR', 'LoanMonthsSinceOrigination', 'EstimatedEffectiveYield', 'LoanOriginalAmount', 'LoanStatus', 'ListingNumber']


In [45]:
selected = ['BorrowerAPR', 'MonthlyLoanPayment', 'LoanOriginalAmount', 
    'BorrowerRate', 'LoanNumber','LP_ServiceFees','EstimatedEffectiveYield','EstimatedReturn', 'LoanCurrentDaysDelinquent', 'StatedMonthlyIncome']

In [46]:
X_filtered = X[selected]

# Split filtered data into training and testing sets
X_train_filtered, X_test_filtered, y_class_train, y_class_test = train_test_split(X_filtered, y_class, test_size=0.2, random_state=42)
y_reg1_train, y_reg1_test = train_test_split(y_reg1, test_size=0.2, random_state=42)
y_reg2_train, y_reg2_test = train_test_split(y_reg2, test_size=0.2, random_state=42)
y_reg3_train, y_reg3_test = train_test_split(y_reg3, test_size=0.2, random_state=42)

In [47]:
X.columns

Index(['ListingKey', 'ListingNumber', 'BorrowerAPR', 'BorrowerRate',
       'LenderYield', 'EstimatedEffectiveYield', 'EstimatedLoss',
       'EstimatedReturn', 'ProsperRating (numeric)', 'ProsperRating (Alpha)',
       'ProsperScore', 'ListingCategory (numeric)', 'BorrowerState',
       'EmploymentStatusDuration', 'IsBorrowerHomeowner', 'CurrentlyInGroup',
       'DateCreditPulled', 'CreditScoreRangeLower', 'CreditScoreRangeUpper',
       'CurrentCreditLines', 'OpenCreditLines', 'OpenRevolvingAccounts',
       'OpenRevolvingMonthlyPayment', 'InquiriesLast6Months', 'TotalInquiries',
       'AmountDelinquent', 'DelinquenciesLast7Years', 'RevolvingCreditBalance',
       'BankcardUtilization', 'AvailableBankcardCredit', 'TotalTrades',
       'TradesNeverDelinquent (percentage)', 'TradesOpenedLast6Months',
       'DebtToIncomeRatio', 'IncomeVerifiable', 'StatedMonthlyIncome',
       'LoanCurrentDaysDelinquent', 'LoanMonthsSinceOrigination', 'LoanNumber',
       'LoanOriginalAmount', 'LoanO

In [48]:
# Train the final models with the filtered dataset
clf_final = RandomForestClassifier(random_state=42)
clf_final.fit(X_train_filtered, y_class_train)

In [49]:
reg1_final = RandomForestRegressor(random_state=42)
reg1_final.fit(X_train_filtered, y_reg1_train)

In [50]:
reg2_final = RandomForestRegressor(random_state=42)
reg2_final.fit(X_train_filtered, y_reg2_train)

In [51]:
reg3_final = RandomForestRegressor(random_state=42)
reg3_final.fit(X_train_filtered, y_reg3_train)

In [52]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib

In [53]:
class CombinedPipeline:
    def __init__(self, clf, reg1, reg2, reg3):
        self.clf = clf
        self.reg1 = reg1
        self.reg2 = reg2
        self.reg3 = reg3

    def fit(self, X, y_class, y_reg1, y_reg2, y_reg3):
        self.clf.fit(X, y_class)
        X_filtered = X[y_class == 1]  # Filter entries where class is 'Current' (assuming 1 represents 'Current')
        self.reg1.fit(X_filtered, y_reg1[y_class == 1])
        self.reg2.fit(X_filtered, y_reg2[y_class == 1])
        self.reg3.fit(X_filtered, y_reg3[y_class == 1])
        return self

    def predict(self, X):
        y_class_pred = self.clf.predict(X)
        X_filtered = X[y_class_pred == 1]
        y_reg1_pred = self.reg1.predict(X_filtered) if len(X_filtered) > 0 else [None]
        y_reg2_pred = self.reg2.predict(X_filtered) if len(X_filtered) > 0 else [None]
        y_reg3_pred = self.reg3.predict(X_filtered) if len(X_filtered) > 0 else [None]
        return y_class_pred, y_reg1_pred, y_reg2_pred, y_reg3_pred

In [54]:
# Save the final models to a file
pipeline = CombinedPipeline(clf_final, reg1_final, reg2_final, reg3_final)
joblib.dump(pipeline, 'Pipeline.pkl')

print("Final pipeline saved as 'Pipeline.pkl'")

Final pipeline saved as 'Pipeline.pkl'


In [None]:
pipeline = joblib.load('Pipeline.pkl')

In [53]:
print(important_features)

['BorrowerAPR', 'MonthlyLoanPayment', 'LP_NonPrincipalRecoverypayments', 'LenderYield', 'ListingNumber', 'EstimatedReturn', 'EstimatedEffectiveYield', 'LP_CustomerPrincipalPayments', 'LoanCurrentDaysDelinquent', 'LoanNumber', 'LoanStatus', 'DateCreditPulled', 'LP_ServiceFees', 'LP_CustomerPayments', 'LoanMonthsSinceOrigination', 'LP_CollectionFees', 'LoanOriginalAmount', 'StatedMonthlyIncome', 'BorrowerRate']


In [61]:
selected = ['BorrowerAPR', 'MonthlyLoanPayment', 'LoanOriginalAmount', 
    'BorrowerRate', 'LoanNumber','LP_ServiceFees','EstimatedEffectiveYield','EstimatedReturn', 'LoanCurrentDaysDelinquent', 'StatedMonthlyIncome']