<a href="https://colab.research.google.com/github/AtharKharal/Data_science_with_python/blob/dev-branch/featuretools_and_TPOT_workflow_ver_0p3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Run following line just once at the start of a new colab session
#!pip install tpot

In [0]:
import numpy as np
import pandas as pd
import featuretools as ft
from featuretools import variable_types as vtypes # importing vtypes to classify variables
import sklearn as sk
import sklearn.model_selection
import tpot

#Reading Data

In [0]:
# Go to the CSV file in Google Drive, right-click on it and select 
# “Get shareable link”. The link will be copied into your clipboard. 
# Paste this link into following a string variable i.e. link:

link = 'https://drive.google.com/open?id=10JNCo2Opquuq2cwGiyGgvlds8U2wz3mW'  # The shareable link

# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


fluff, id = link.split('=')   # Verify that you have everything after '=' by using print(id)

# Finally reading the data into a dataframe

downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('dummy_file_name.csv')  
df = pd.read_csv('dummy_file_name.csv')

In [0]:
df.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
0,1,1,18,4,2,1049,1,2,4,2,...,4,2,21,3,1,1,3,1,1,1
1,1,1,9,4,0,2799,1,3,2,3,...,2,1,36,3,1,2,3,2,1,1
2,1,2,12,2,9,841,2,4,2,2,...,4,1,23,3,1,1,2,1,1,1
3,1,1,12,4,0,2122,1,3,3,3,...,2,1,39,3,1,2,2,2,1,2
4,1,1,12,4,0,2171,1,3,4,3,...,4,2,38,1,2,2,2,1,1,2


In [0]:
target = df.Creditability
df = df.drop(columns = target.name)
#df['df_index'] = range(0,len(df))

In [0]:
df.columns

Index(['Account Balance', 'Duration of Credit (month)',
       'Payment Status of Previous Credit', 'Purpose', 'Credit Amount',
       'Value Savings/Stocks', 'Length of current employment',
       'Instalment per cent', 'Sex & Marital Status', 'Guarantors',
       'Duration in Current address', 'Most valuable available asset',
       'Age (years)', 'Concurrent Credits', 'Type of apartment',
       'No of Credits at this Bank', 'Occupation', 'No of dependents',
       'Telephone', 'Foreign Worker'],
      dtype='object')

#Feature Augmentation

In [0]:
es = ft.EntitySet("mydata")

#vari_types = {'vari_name': vtypes.Categorical, ...}

# Create an entity from the dataframe df
# This dataframe must have an index col (and if required, a time index col) 
#      namely 'df_index' (and time_index = 'dt_time_col')

es = es.entity_from_dataframe(entity_id = 'my_entity_id', dataframe = df, index = 'df_index')

#, time_index = 'dt_time_col', variable_types = vari_types



# Perform deep feature synthesis without specifying primitives
features, feature_names = ft.dfs(entityset=es, target_entity='my_entity_id', 
                                 trans_primitives=["add","subtract"],
                                 max_depth = 2
                                )
#trans_primitives = ["absolute", "cum_count","subtract","cum_sum","haversine","or","mod","characters","cum_max","multiply","add"]
# max_features = 100, max_depth = 2

#Pipeline Optimization

In [0]:
X, y = df, target

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

tpot = tpot.TPOTClassifier(generations=100, population_size=500, verbosity=2, 
                           n_jobs = -1, cv=5
                          )

# many more parameters may be set as follows:
# scoring = 'neg_mean_absolute_error', max_time_mins = 480, n_jobs = -1,
# cv = 5


tpot.fit(X_train, y_train)
y_hat = tpot.predict(X_test)

tpot.fitted_pipeline_

Optimization Progress:   2%|▏         | 1000/50500 [08:25<9:54:04,  1.39pipeline/s]

Generation 1 - Current best internal CV score: 0.7733333333333334


Optimization Progress:   3%|▎         | 1500/50500 [14:51<9:11:12,  1.48pipeline/s]

Generation 2 - Current best internal CV score: 0.7733333333333334


Optimization Progress:   4%|▍         | 2000/50500 [22:10<10:10:36,  1.32pipeline/s]

Generation 3 - Current best internal CV score: 0.7733333333333334


Optimization Progress:   5%|▍         | 2500/50500 [35:56<9:52:28,  1.35pipeline/s]

Generation 4 - Current best internal CV score: 0.7733333333333334






TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.35000000000000003, min_samples_leaf=3, min_samples_split=7, n_estimators=100)


In [0]:
tpot.export('solution_to_german_credit_12.py')

#Check if and how the following line may be used


True

In [0]:
print(tpot.score(X_test, y_test))

0.772


In [0]:
tpot_data = pd.concat([X,y.rename("target")], axis = 1)
tpot_data.head()

Unnamed: 0,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,Guarantors,...,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker,df_index,target
0,1,18,4,2,1049,1,2,4,2,1,...,21,3,1,1,3,1,1,1,0,1
1,1,9,4,0,2799,1,3,2,3,1,...,36,3,1,2,3,2,1,1,1,1
2,2,12,2,9,841,2,4,2,2,1,...,23,3,1,1,2,1,1,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,39,3,1,2,2,2,1,2,3,1
4,1,12,4,0,2171,1,3,4,3,1,...,38,1,2,2,2,1,1,2,4,1


In [0]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from tpot.builtins import StackingEstimator

# from the exported file by tpot.export copy two lines viz training_features and exported_pipeline

tpot_data = pd.concat([X,y.rename("target")], axis = 1)

features = tpot_data.drop('target', axis=1).values


training_features, testing_features, training_target, testing_target = \
            sklearn.model_selection.train_test_split(features, tpot_data['target'].values, random_state=42)


exported_pipeline = make_pipeline(
  
  StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.5, 
  max_depth=10, max_features=0.55, min_samples_leaf=18, min_samples_split=8, 
  n_estimators=100, subsample=0.8500000000000001) ),
  
  ExtraTreesClassifier(bootstrap=False, 
  criterion="gini", max_features=0.3, min_samples_leaf=16, 
  min_samples_split=16, n_estimators=100)
  
)

exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('stackingestimator', StackingEstimator(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=10,
              max_features=0.55, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=N...tors=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [0]:
#Show the features with each used coefficient:

important_coefs = pd.Series(data=exported_pipeline.steps[1][1].coef_, 
                            index=X.columns)

sorted_coef = important_coefs.sort_values(ascending=False)

sorted_coef

AttributeError: ignored

In [0]:
exported_pipeline.get_params()

{'extratreesclassifier': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=16, min_samples_split=16,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'extratreesclassifier__bootstrap': False,
 'extratreesclassifier__class_weight': None,
 'extratreesclassifier__criterion': 'gini',
 'extratreesclassifier__max_depth': None,
 'extratreesclassifier__max_features': 0.3,
 'extratreesclassifier__max_leaf_nodes': None,
 'extratreesclassifier__min_impurity_decrease': 0.0,
 'extratreesclassifier__min_impurity_split': None,
 'extratreesclassifier__min_samples_leaf': 16,
 'extratreesclassifier__min_samples_split': 16,
 'extratreesclassifier__min_weight_fraction_leaf': 0.0,
 'extratreesclassifier__n_estimators': 100,