In [2]:
import warnings

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from mypipess import *

In [3]:
warnings.filterwarnings('ignore')

In [4]:
# NOTE(review): these are absolute, machine-specific Windows paths; the cell
# fails on any machine that does not have the files at exactly this location.
train_file = r"D:\IITK Data Analytics\Python Data Analytics\Course work\Python-Pipelines\loan_data_train.csv"
test_file = r"D:\IITK Data Analytics\Python Data Analytics\Course work\Python-Pipelines\loan_data_test.csv"

# Read both CSV files with pandas' default parsing options.
ld_train = pd.read_csv(filepath_or_buffer=train_file)
ld_test = pd.read_csv(filepath_or_buffer=test_file)

In [5]:
ld_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              2199 non-null   float64
 1   Amount.Requested                2199 non-null   object 
 2   Amount.Funded.By.Investors      2199 non-null   object 
 3   Interest.Rate                   2200 non-null   object 
 4   Loan.Length                     2199 non-null   object 
 5   Loan.Purpose                    2199 non-null   object 
 6   Debt.To.Income.Ratio            2199 non-null   object 
 7   State                           2199 non-null   object 
 8   Home.Ownership                  2199 non-null   object 
 9   Monthly.Income                  2197 non-null   float64
 10  FICO.Range                      2200 non-null   object 
 11  Open.CREDIT.Lines               2196 non-null   object 
 12  Revolving.CREDIT.Balance        21

In [6]:
ld_train.head()
#don't need ID. Interest.Rate is the output feature. So remove these two columns

#conversion to numeric and impute missing values with median

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,79542.0,25000,25000.0,18.49%,60 months,debt_consolidation,27.56%,VA,MORTGAGE,8606.56,720-724,11,15210,3.0,5 years
1,75473.0,19750,19750.0,17.27%,60 months,debt_consolidation,13.39%,NY,MORTGAGE,6737.5,710-714,14,19070,3.0,4 years
2,67265.0,2100,2100.0,14.33%,36 months,major_purchase,3.50%,LA,OWN,1000.0,690-694,13,893,1.0,< 1 year
3,80167.0,28000,28000.0,16.29%,36 months,credit_card,19.62%,NV,MORTGAGE,7083.33,710-714,12,38194,1.0,10+ years
4,17240.0,24250,17431.82,12.23%,60 months,credit_card,23.79%,OH,MORTGAGE,5833.33,730-734,6,31061,2.0,10+ years


In [7]:
# don't need: ID, Interest.Rate (output feature)

# Amount requested : V1
# Amount.Funded.By.Investors : V2 
# Open.CREDIT.Lines : V3
# Revolving.CREDIT.Balance : V4
# convert it to numeric , and then impute missing values with median

# Debt.To.Income.Ratio : V5
# remove percentage sign, convert to numeric and then impute with median

# Loan.Length ,Loan.Purpose,State,Home.Ownership,Employment.Length
# V6-V10
# create dummies for these, using a frequency cutoff of 20;
# otherwise we would take into account data which doesn't help the model's performance

# Monthly.Income : V11,
# Inquiries.in.the.Last.6.Months :V12
# impute missing values with median

# FICO.Range : V13
# split the column at '-', convert the resulting columns to numeric(say a ,b) , 
# then create new column = (a+b)/2 


In [8]:

# pipe_name=pdPipeline([
#     ('name of the process',call to process),
#     ('name of the process', call to process),
#     .....
# ])


In [9]:
# Sub-pipeline for the four amount / credit columns.
# Steps run in order: select the columns, convert them to numeric,
# then fill in missing values.
p1 = pdPipeline(steps=[
    (
        'var_select',
        VarSelector(feature_names=[
            'Amount.Requested',
            'Amount.Funded.By.Investors',
            'Open.CREDIT.Lines',
            'Revolving.CREDIT.Balance',
        ]),
    ),
    ('convert_to_numeric', convert_to_numeric()),
    ('missing_trt', DataFrameImputer()),
])



In [10]:
# Sub-pipeline for Debt.To.Income.Ratio.
# The '%' sign is replaced with an empty string before the numeric
# conversion; missing values are filled in last.
p2 = pdPipeline(steps=[
    ('var_select', VarSelector(feature_names=['Debt.To.Income.Ratio'])),
    ('string_clean', string_clean(replace_it='%', replace_with='')),
    ('convert_to_numeric', convert_to_numeric()),
    ('missing_trt', DataFrameImputer()),
])

In [11]:
# Sub-pipeline for the five categorical columns: fill in missing values
# first, then create dummy columns.
# The 20 passed to get_dummies_Pipe is presumably the frequency cutoff
# mentioned in the notebook's notes -- TODO confirm against mypipess.
p3 = pdPipeline(steps=[
    (
        'var_select',
        VarSelector(feature_names=[
            'Loan.Length',
            'Loan.Purpose',
            'State',
            'Home.Ownership',
            'Employment.Length',
        ]),
    ),
    ('missing_trt', DataFrameImputer()),
    ('create_dummies', get_dummies_Pipe(20)),
])

In [12]:
# Sub-pipeline for the two columns that only need selection and
# missing-value treatment (no conversion step).
p4 = pdPipeline(steps=[
    (
        'var_select',
        VarSelector(feature_names=[
            'Monthly.Income',
            'Inquiries.in.the.Last.6.Months',
        ]),
    ),
    ('missing_trt', DataFrameImputer()),
])

In [13]:
# Sub-pipeline for FICO.Range.
# custom_fico presumably turns the "low-high" range into a single number
# (the notebook's notes describe averaging the two bounds) -- verify in mypipess.
p5 = pdPipeline(steps=[
    ('var_select', VarSelector(feature_names=['FICO.Range'])),
    ('custom_fico', custom_fico()),
    ('missing_trt', DataFrameImputer()),
])

In [14]:
# Combine the five sub-pipelines; FeatureUnion applies each one to the same
# input and concatenates their outputs column-wise, in the order listed here.
data_pipe = FeatureUnion(transformer_list=[
    ('obj_to_num', p1),
    ('dtir', p2),
    ('obj_to_dum', p3),
    ('num', p4),
    ('fico', p5),
])

In [15]:
# Fit all five sub-pipelines on the training data; the fitted data_pipe is
# then reused to transform both ld_train and ld_test.
data_pipe.fit(ld_train)

FeatureUnion(transformer_list=[('obj_to_num',
                                pdPipeline(steps=[('var_select',
                                                   VarSelector(feature_names=['Amount.Requested',
                                                                              'Amount.Funded.By.Investors',
                                                                              'Open.CREDIT.Lines',
                                                                              'Revolving.CREDIT.Balance'])),
                                                  ('convert_to_numeric',
                                                   convert_to_numeric()),
                                                  ('missing_trt',
                                                   DataFrameImputer())])),
                               ('dtir',
                                pdPipeline(steps=[('var_select',
                                                   VarSelector(feature_names=...
 

In [16]:
len(data_pipe.get_feature_names())

61

In [17]:
data_pipe.transform(ld_train).shape

(2200, 61)

In [18]:
# Wrap the transformed training data in a DataFrame, labelling the columns
# with the feature names reported by the fitted union.
x_train = pd.DataFrame(
    data_pipe.transform(ld_train),
    columns=data_pipe.get_feature_names(),
)

In [19]:
# Apply the already-fitted union to the test data (no refit) and label the
# columns with the same feature names.
x_test = pd.DataFrame(
    data_pipe.transform(ld_test),
    columns=data_pipe.get_feature_names(),
)

In [20]:
x_train.shape

(2200, 61)

In [21]:
x_test.shape

(300, 61)

# basic example of pipeline building

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
class DataFrameImputer(BaseEstimator,TransformerMixin):
    """Fill missing values in a pandas DataFrame, one fill value per column.

    ``fit`` learns the fill values: the string ``'missing'`` for object-dtype
    columns and the column mean for every other column. ``transform`` applies
    those learned values, so statistics learned on one frame can be reused to
    fill another frame with the same columns.

    NOTE(review): notes elsewhere in this file talk about median imputation;
    this class uses the mean.

    Attributes
    ----------
    impute_dict : dict
        Maps column name -> fill value. Empty until ``fit`` is called.
    feature_names : list or pandas.Index
        Columns seen by the most recent ``fit`` (an empty list before that).
    """

    def __init__(self):
        # Start empty so both attributes can be inspected before fitting.
        self.impute_dict={}
        self.feature_names=[]

    def fit(self,x,y=None):
        """Learn the fill value for every column of ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame to learn the fill values from.
        y : ignored
            Accepted so the signature matches ``fit(X, y)`` call sites.

        Returns
        -------
        DataFrameImputer
            ``self``, so calls can be chained.
        """
        # Build into a fresh dict instead of updating the existing one:
        # refitting on a frame with different columns must not keep fill
        # values learned from an earlier fit.
        learned={}
        for col in x.columns:
            if x[col].dtype=='O':     # object column -> constant label
                learned[col]='missing'
            else:
                learned[col]=x[col].mean()

        self.impute_dict=learned
        self.feature_names=x.columns
        return self

    def transform(self,x):
        """Return a copy of ``x`` with missing values filled.

        Only columns that have an entry in ``impute_dict`` are filled; the
        input frame itself is not modified.
        """
        return x.fillna(self.impute_dict)

    def get_feature_names(self):
        """Return the column names recorded by the most recent ``fit``."""
        return self.feature_names

In [8]:
import pandas as pd 
import numpy as np

In [9]:
# Small demo frame: one numeric and one text column, each with a single
# missing value.
d = pd.DataFrame(
    dict(
        age=[20, 15, np.nan, 30, 35, 40],
        city=['delhi', np.nan, 'pune', 'delhi', 'hyderabad', 'agra'],
    )
)

In [10]:
# Create an unfitted imputer; impute_dict and feature_names stay empty
# until fit() is called.
a=DataFrameImputer()

In [11]:
a.feature_names

[]

In [12]:
a.get_feature_names()

[]

In [13]:
a.impute_dict

{}

In [14]:
# Learn the fill value for each column of d (stored in a.impute_dict).
a.fit(d)

DataFrameImputer()

In [15]:
a.impute_dict

{'age': 28.0, 'city': 'missing'}

In [16]:
a.feature_names

Index(['age', 'city'], dtype='object')

In [18]:
a.get_feature_names()

Index(['age', 'city'], dtype='object')

In [19]:
# Second frame with the same columns, used to check that the values learned
# from the first frame are applied to new data.
d1 = pd.DataFrame(
    dict(
        age=[10, 15, np.nan],
        city=[np.nan, 'delhi', 'chennai'],
    )
)

In [20]:
a.transform(d1)

Unnamed: 0,age,city
0,10.0,missing
1,15.0,delhi
2,28.0,chennai


In [21]:
a.transform(d)

Unnamed: 0,age,city
0,20.0,delhi
1,15.0,missing
2,28.0,pune
3,30.0,delhi
4,35.0,hyderabad
5,40.0,agra
