In [1]:
#loading libraries
import pandas as pd 
import featuretools as ft
import featuretools.primitives

In [2]:
# Read the pre-transformed dataset (relative path, one directory up) into a DataFrame.
csv_path = '../data_transformed.csv'
data_transformed = pd.read_csv(csv_path)

In [3]:
# Build the working frame from the raw load: name the 'Unnamed: 0' column
# "unique_id" and recode Class from 0/1 to the string labels 'NF'/'F'.
class_labels = {0: 'NF', 1: 'F'}
df = (
    data_transformed
    .rename(columns={'Unnamed: 0': 'unique_id'})
    .assign(Class=lambda frame: frame['Class'].map(class_labels))
)



In [4]:
# Checking the variance contribution of each principal component.
# Keep only the component columns by removing the three non-component columns
# (assumes every remaining column is a principal component — TODO confirm).
# The labels are passed directly via `columns=`; the previous form built a
# throwaway three-column sub-frame only so drop() could iterate its column names.
df_pcvar = df.drop(columns=['unique_id', 'value', 'Class'])


In [5]:
# Total variance: add up the per-column variances of the component frame.
component_variances = df_pcvar.var()
total_var = component_variances.sum()

In [6]:
# Express each column's variance as a percentage share of the total variance.
per_pcvar = df_pcvar.var().div(total_var).mul(100)


In [7]:
# Convert the percentage Series into a one-column DataFrame named 'Var'.
# The isinstance guard keeps this cell re-runnable: once per_pcvar has been
# rebound to a DataFrame, calling .to_frame() on it again would raise
# AttributeError.
if isinstance(per_pcvar, pd.Series):
    per_pcvar = per_pcvar.to_frame(name='Var')

In [8]:
# Add the cumulative percentage of variance as column 'cum_per'.
# NOTE(review): the running total follows the existing row order; it only reads
# as "top-k components" if rows are already in descending variance — confirm.
per_pcvar = per_pcvar.assign(cum_per=per_pcvar['Var'].cumsum())

In [9]:
# Show the components whose cumulative variance percentage is at most 80.
within_80 = per_pcvar['cum_per'].le(80)
per_pcvar.loc[within_80]

Unnamed: 0,Var,cum_per
T1,12.48161,12.48161
T2,8.871778,21.353388
T3,7.481918,28.835305
T4,6.519593,35.354898
T5,6.204762,41.55966
T6,5.771936,47.331596
T7,4.990695,52.322292
T8,4.648705,56.970997
T9,3.926609,60.897605
T10,3.860441,64.758047


In [10]:
# Remove the component columns from position 14 of df_pcvar onward (T15-T28 per
# the original note) from the working frame, using a single drop call.
# errors='ignore' keeps the cell re-runnable: the previous per-column in-place
# drop raised KeyError on a second run because the columns were already gone.
df = df.drop(columns=df_pcvar.columns[14:], errors='ignore')
    


In [11]:
# Create the 'fraud_detect' EntitySet, then register df as its entity 'data',
# using the unique_id column as the index.
es = ft.EntitySet(id="fraud_detect")
es.entity_from_dataframe(
    entity_id="data",
    dataframe=df,
    index="unique_id",
)



Entityset: fraud_detect
  Entities:
    data [Rows: 281959, Columns: 17]
  Relationships:
    No relationships

In [31]:
# Run deep feature synthesis on the 'data' entity with two numeric transform
# primitives, 'subtract_numeric' and 'divide_numeric'.
# The result is bound to its own names because a later ft.dfs call in this
# notebook assigns feature_matrix / feature_defs; sharing those names meant this
# result was silently overwritten when the notebook ran top to bottom.
trans_feature_matrix, trans_feature_defs = ft.dfs(
    entityset=es,
    target_entity='data',
    trans_primitives=['subtract_numeric', 'divide_numeric'],
)
trans_feature_matrix.shape

(281959, 331)

In [26]:
# Seed feature: compares the 'value' variable of the 'data' entity against the
# mean of df's value column (greater-than).
value_mean = df['value'].mean()
mean_value = ft.Feature(es["data"]["value"]) > value_mean

In [27]:
# Second DFS run on the 'data' entity, requesting the percent_true aggregation
# primitive and supplying mean_value as a seed feature.
# NOTE(review): the EntitySet has no relationships; aggregation primitives
# presumably need a related child entity to produce features — confirm that
# percent_true contributes anything here.
dfs_kwargs = dict(
    entityset=es,
    target_entity="data",
    agg_primitives=["percent_true"],
    seed_features=[mean_value],
)
feature_matrix, feature_defs = ft.dfs(**dfs_kwargs)

In [29]:
# Inspect the (rows, columns) shape of whatever is currently bound to
# feature_matrix, to check how many feature columns the DFS run produced.
feature_matrix.shape

(281959, 17)