In [29]:
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier


In [30]:
data = pd.read_csv("creditcard.csv")

In [2]:
data.head(2)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0


In [3]:
data.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [4]:
data.shape

(284807, 31)

In [5]:
data_nona=data.copy()

In [6]:
data_nona=data_nona.dropna()

In [7]:
data_nona.shape

(284807, 31)

In [8]:
#feature selections
data_ff=data.copy()
d_clm=['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
data_ff=data_ff[d_clm]
data_ff.head(2)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724


In [9]:
feature_data=data_ff.columns

In [10]:
feature_data.shape

(28,)

In [11]:
data_ff.shape

(284807, 28)

In [12]:
X = data_ff.copy()
Y = data['Class']

In [13]:
print(len(X),len(Y))

284807 284807


<b>PEARSON</b>

In [47]:
feature_name = list(X.columns)
# no of maximum features we need to select
num_feats=28

import numpy as np
def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = cor_selector(X, Y,num_feats)
print(str(len(cor_feature)), 'selected features')

28 selected features


In [48]:
cor_feature

['V22',
 'V23',
 'V25',
 'V15',
 'V26',
 'V13',
 'V24',
 'V28',
 'V27',
 'V8',
 'V20',
 'V19',
 'V21',
 'V6',
 'V2',
 'V5',
 'V9',
 'V1',
 'V18',
 'V4',
 'V11',
 'V7',
 'V3',
 'V16',
 'V10',
 'V12',
 'V14',
 'V17']

<b>Chi2</b>

In [49]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

X_norm = MinMaxScaler().fit_transform(X)

chi_selector = SelectKBest(chi2, k=num_feats)

In [50]:
chi_selector.fit(X_norm, Y)

SelectKBest(k=28, score_func=<function chi2 at 0x000001AF51C24840>)

In [51]:
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

28 selected features


In [52]:
chi_feature

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28']

### Recursive Feature Elemination

In [53]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, Y)



RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
  n_features_to_select=28, step=10, verbose=5)

In [54]:
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

28 selected features


In [55]:
rfe_feature

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28']

### Lasso: SelectFromModel

In [56]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), max_features=num_feats)
embeded_lr_selector.fit(X_norm, Y)



SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
        max_features=28, norm_order=1, prefit=False, threshold=None)

In [57]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

15 selected features


In [59]:
embeded_lr_feature

['V1',
 'V3',
 'V4',
 'V5',
 'V8',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V16',
 'V19',
 'V21',
 'V22',
 'V24']

#### Random forest

In [60]:
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
embeded_rf_selector.fit(X, Y)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        max_features=28, norm_order=1, prefit=False, threshold=None)

In [61]:
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

7 selected features


In [62]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 
                                     'Pearson':cor_support, 
                                     'Chi-2':chi_support, 
                                     'RFE':rfe_support, 
                                     'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(5)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,Total
1,V16,True,True,True,True,True,5
2,V14,True,True,True,True,True,5
3,V12,True,True,True,True,True,5
4,V11,True,True,True,True,True,5
5,V10,True,True,True,True,True,5


In [63]:
import seaborn as sns
import matplotlib.pyplot as plt

new_df = data.copy()
new_df = new_df[['V16','V14','V12','V11','V10','Class']]

correlation = data.corr()#dataset.corr()
plt.figure(figsize=(20,20))
sns.heatmap(correlation.loc[['Class'],:],square=True,annot=True)


<matplotlib.axes._subplots.AxesSubplot at 0x1af03716d68>

## 2. Analyse the data

In [85]:
gb_data = data.copy()
gb_data.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [86]:
columns_gb = ['Time','Amount','Class']
gb_data = gb_data[columns_gb]

In [88]:
gb_data_dum = pd.get_dummies(gb_data['Class'])

In [89]:
gb_data_dum.head(5)

Unnamed: 0,0,1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [90]:
gb_data_join = gb_data.join(gb_data_dum)

In [94]:
gb_data_join = gb_data_join[['Time','Amount',0,1]]

In [95]:
gb_data_join.head(5)

Unnamed: 0,Time,Amount,0,1
0,0.0,149.62,1,0
1,0.0,2.69,1,0
2,1.0,378.66,1,0
3,1.0,123.5,1,0
4,2.0,69.99,1,0


In [99]:
 gb_data_join_true = gb_data_join[gb_data_join[1]==1]

In [100]:
gb_data_join_true

Unnamed: 0,Time,Amount,0,1
541,406.0,0.0,0,1
623,472.0,529.0,0,1
4920,4462.0,239.93,0,1
6108,6986.0,59.0,0,1
6329,7519.0,1.0,0,1
6331,7526.0,1.0,0,1
6334,7535.0,1.0,0,1
6336,7543.0,1.0,0,1
6338,7551.0,1.0,0,1
6427,7610.0,1.0,0,1
