In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import libraries and packages

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pylab as ply
import seaborn as sns
import math
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
# Ref: week 6 lecture material

In [5]:
# this configuration removes the restriction on the number of columns displayed
pd.options.display.max_columns = None

# Task 1. Retrieve and Prepare the Data

## Load file

In [6]:
# The data set is available as a CSV file.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0. Malformed rows are still dropped and a
# warning is still emitted for each of them, so nothing is skipped silently.
df_orig = pd.read_csv(
    '../input/transactions/TransactionsDataset.csv',
    sep=',',
    decimal='.',
    encoding='latin-1',
    on_bad_lines='warn',
)

# Keep the raw frame untouched; all further work happens on a deep copy.
df_clean = df_orig.copy(deep=True)

## Observe data

In [7]:
df_clean.columns = ["ID", "AccountID", "Value", "Comment"]

In [8]:
# check that the number of rows/columns is as expected,
df_clean.shape

In [9]:
# get column names
df_clean.columns

In [10]:
# get descriptive statistics
df_clean.describe()

In [11]:
# Visual null check: draw the boolean isnull() mask of the working frame as a
# heatmap, save it to disk and display it.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(df_clean.isnull(), cbar=False, ax=ax)
ax.set_title("Missing values (shown as white marks)")
fig.savefig('heatmap.png')
plt.show()

In [12]:
# From the heatmap we can see that there are no missing or null values

## Data Preparation

In [13]:
#Columns don't need to be cleaned, dataset is reliable
df_clean['AccountID'].describe()

In [14]:
df_clean['Comment'].describe()

In [15]:
df_clean['Value'].describe()

In [16]:
# Boolean masks over the Value column: strictly negative and strictly positive.
# A Value of exactly 0 matches neither mask.
is_negative = df_clean['Value'].lt(0)
is_positive = df_clean['Value'].gt(0)
print(is_negative.head())

In [17]:
negativeValues = df_clean[is_negative]
positiveValues = df_clean[is_positive]

In [18]:
negativeValues.head()

In [19]:
negativeValues.describe()

In [20]:
positiveValues.head()

In [21]:
positiveValues.describe()

# Task 2. Data Exploration

## Explore the data set as a whole

In [22]:
#Summary stats for each
#positiveValues.groupby('AccountID')['Value'].value_counts().unstack().fillna(0)
positiveValues.boxplot(column='Value', by='AccountID', figsize=(8,8))
plt.title('Value')
plt.xlabel('AccountId')
plt.show()

In [23]:
negativeValues.boxplot(column='Value', by='AccountID', figsize=(8,8))
plt.title('Value')
plt.xlabel('AccountId')
plt.show()

## Summarise Datasets

### Summarise Positives

In [24]:
positiveTest = positiveValues[['AccountID','Value']]

In [25]:
#Summarise Positive Average
positiveValuesAverage = positiveTest.groupby(['AccountID']).mean()
positiveValuesAverage.head()

In [26]:
# Per-account total of the Value column.
# NOTE(review): this is computed from positiveTest, i.e. from positive values only,
# although the variable is called "overall" -- confirm whether negative values
# were meant to be included.
overallSum = positiveTest.groupby('AccountID').sum()
overallSum.head()

In [27]:
#Summarise Positive Count
positiveValuesCount = positiveTest.groupby(['AccountID']).count()
positiveValuesCount.head()

### Summarise Negatives

In [28]:
negativeTest = negativeValues[['AccountID','Value']]

In [29]:
#Summarise Negative Average
negativeValuesAverage = negativeTest.groupby(['AccountID']).mean()
negativeValuesAverage.head()

In [30]:
#Summarise Negative Median
#negativeValuesMedian = negativeTest.groupby(['AccountID']).median()
#negativeValuesMedian.head()

In [31]:
#Summarise Negative Count
negativeValuesCount = negativeTest.groupby(['AccountID']).count()
negativeValuesCount.head()

### Combine Positives & Negatives

In [32]:
# Assemble the per-account summary as a NEW frame.
# Binding summarisedData directly to positiveValuesAverage would make both names
# point at the same object, so every column added or renamed here would also
# change positiveValuesAverage and the cell could not be re-run safely.
# rename() and assign() each return a fresh frame; assign() aligns the other
# per-account series on the AccountID index of positiveValuesAverage, so accounts
# without negative values get NaN in NegMean / NegCount.
# NOTE(review): 'Sum' comes from overallSum, which is built from the positive
# values only -- confirm that this is the intended quantity.
summarisedData = (
    positiveValuesAverage
    .rename(columns={"Value": "PosMean"})
    .assign(
        PosCount=positiveValuesCount["Value"],
        NegMean=negativeValuesAverage["Value"],
        NegCount=negativeValuesCount["Value"],
        Sum=overallSum["Value"],
    )
)
summarisedData.head(1)

In [33]:
summarisedData.head(20)

# Create a Correlation Matrix

In [34]:
# A correlation matrix is used to explore the relation between pairs of features
# in the data set (Task 2.2).
# Only numeric columns are correlated, so the cell still runs if it is re-executed
# after the text-valued 'class' column has been added to summarisedData.
df_corr = summarisedData.select_dtypes(include='number').corr()
df_corr.head()

#### Standard Deviation

In [67]:
# Label every account by where its 'Sum' lies relative to the mean of 'Sum' and
# one standard deviation either side of it:
#   Sum <= mean - sd            -> Poor
#   mean - sd < Sum <  mean     -> Moderately Poor
#   mean      <= Sum <= mean+sd -> Moderately Rich
#   Sum >  mean + sd            -> Rich
st_dev = np.std(summarisedData["Sum"])
sum_mean = np.mean(summarisedData["Sum"])
print(st_dev)
print(sum_mean)

lower_bound = sum_mean - st_dev
upper_bound = sum_mean + st_dev

# The four bands are contiguous: a Sum exactly equal to the mean belongs to
# 'Moderately Rich' instead of matching no band at all.
conditions = [
    (summarisedData['Sum'] <= lower_bound),
    (summarisedData['Sum'] > lower_bound) & (summarisedData['Sum'] < sum_mean),
    (summarisedData['Sum'] >= sum_mean) & (summarisedData['Sum'] <= upper_bound),
    (summarisedData['Sum'] > upper_bound),
]

# label assigned for each condition, in the same order
values = ['Poor', 'Moderately Poor', 'Moderately Rich', 'Rich']

# np.select's built-in fallback is the number 0, which is not a valid label and
# cannot be combined with string choices on recent NumPy versions. A string
# fallback is supplied instead; it is only reached when 'Sum' is missing (NaN).
summarisedData['class'] = np.select(conditions, values, default='Unknown')

summarisedData.head(20)

## Model 1 - Decision Tree

In [56]:
# Feature matrix: the four per-account summary columns.
X = summarisedData.loc[:, ["PosMean", "PosCount", "NegMean", "NegCount"]]

# Target: the class label, kept as a one-column DataFrame.
y = summarisedData.loc[:, ["class"]]

In [57]:
# Prediction model using a decision tree.
def Predict_DT(X_data, y_data, test_size, min_samples_split, min_samples_leaf, max_depth, criterion, output):
    """Fit a decision-tree classifier on a hold-out split and report on it.

    Parameters
    ----------
    X_data : features, passed to train_test_split
    y_data : target, passed to train_test_split
    test_size : forwarded to train_test_split as ``test_size``
    min_samples_split : forwarded to DecisionTreeClassifier
    min_samples_leaf : forwarded to DecisionTreeClassifier
    max_depth : forwarded to DecisionTreeClassifier
    criterion : split criterion, e.g. 'gini' or 'entropy'
    output : 'Y' to print the feature scores, accuracy, confusion matrix and
        classification report and to draw the tree; any other value runs silently

    Returns
    -------
    list of int
        Positional indices (column positions in ``X_data``) of the features whose
        importance score is greater than 0. Returned for every value of ``output``.
    """
    # split data set into train and test set; the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=test_size, random_state=0
    )

    # ref: https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680 for
    #      hyperparameter tuning of decision trees
    # - splitter='best' searches for the best split at every node; with only a few
    #   features the extra cost is minimal
    # - max_features='sqrt' considers sqrt(n_features) features per split; this is
    #   what the former 'auto' alias meant for classifiers, and 'auto' is no longer
    #   accepted by current scikit-learn releases
    # - criterion, max_depth, min_samples_split and min_samples_leaf come from the caller
    dtc = DecisionTreeClassifier(
        criterion=criterion,
        splitter='best',
        max_features='sqrt',
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
        random_state=0,
    )
    # fit the model
    model = dtc.fit(X_train, y_train)

    # predict results and get the accuracy score (% value)
    y_predict = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_predict, normalize=True) * 100

    # feature importance, for features which have a score > 0
    # https://machinelearningmastery.com/calculate-feature-importance-with-python/
    importance = model.feature_importances_
    verbose = (output == 'Y')

    feature_list = []
    for i, v in enumerate(importance):
        if v > 0:
            feature_list.append(i)
            # when called with output, display the feature score
            if verbose:
                print('Feature: %0d, Score: %.5f' % (i, v))

    # display metrics when called with output
    if verbose:
        print('Decision Tree Accuracy : ' + str(accuracy))

        # Full confusion matrix with one row/column per class known to the model.
        # The target can have more (or fewer) than two classes, so the matrix is
        # not indexed as a fixed 2x2 TN/FP/FN/TP table.
        # https://scikit-learn.org/stable/modules/model_evaluation.html#classification-report
        labels = model.classes_
        cm = confusion_matrix(y_test, y_predict, labels=labels)
        print('Confusion matrix (rows = actual, columns = predicted):')
        print(pd.DataFrame(cm, index=labels, columns=labels))

        # display classification report
        print(metrics.classification_report(y_test, y_predict))

        # display the already-fitted tree (no second fit needed)
        plt.figure(figsize=(10,10))
        tree.plot_tree(model)

    # returned outside the output branch so silent runs also get the feature list
    return feature_list

In [65]:
# Holds the previous feature list; only referenced by the disabled loop below.
prev_ft=[]
# parameters to use for prediction
test_size = 0.33
min_samples_split = 2
min_samples_leaf = 2
max_depth = 3
criterion = 'gini'
# Silent run (output flag 'N'); ft is only consumed by the disabled loop below.
ft = Predict_DT(X, y, test_size, min_samples_split, min_samples_leaf, max_depth, criterion, 'N')

# Disabled feature-reduction loop, kept for reference:
# predict using the decision tree until there is no change in the features returned;
# no change in the returned feature list means that the data set is reduced to the
# least number of features required to predict with the best accuracy
#while (True):
    # predict and get the list of features which have an importance score
#    ft = Predict_DT(X, y, test_size, min_samples_split, min_samples_leaf, max_depth, criterion, 'N')
    # check if the features list is the same as the previous list
#    if (ft != prev_ft):
        # create a subset of the data set with the features list from the prediction
#        X  = X.iloc[: , ft].copy()
        # store features list
#        prev_ft = ft
#    else:
        # no change in features list
#        break

# Predict with the current data set and produce output (output flag 'Y').
# Despite its name, X_dt receives Predict_DT's return value, i.e. the list of
# feature indices, not a data set.
X_dt = Predict_DT(X, y, test_size, min_samples_split, min_samples_leaf, max_depth, criterion, 'Y')

In [None]:
# Split the data set into features and target variable.
# summarisedData has no 'value' column; the label produced earlier in the notebook
# is 'class'. 'Sum' is excluded from the features as well, because the class
# labels are derived directly from it. This leaves the same four feature columns
# that the earlier X / y cell selects.
X = summarisedData.drop(columns=['class', 'Sum'])  # features
y = summarisedData['class']  # target

In [66]:
# NOTE(review): the remarks below mention a 'time' feature, DEATH_EVENT and columns
# such as 'age' and 'serum_creatinine', none of which exist in this notebook's data
# (ID, AccountID, Value, Comment and the summary columns derived from them). They
# appear to have been carried over from a different analysis -- confirm and replace
# them with the results observed for this data set.
#
# model was run with various parameters for both criterion gini and entropy
# with the optimal parameters gini returns accuracy score of 86, entropy returns accuracy score of 82
# both cases returned the same set of features
# experiments were also carried out without the 'time' feature which returned an accuracy score of 73.73%
# and features 'age', 'creatinine_phosphokinase', 'ejection_fraction', 'serum_creatinine'
# these experiments prove the hypothesis that the decision tree model works best with
# the inclusion of the 'time' feature
# the confusion matrix shows that:
# 65% guessed DEATH_EVENT 1 correctly
# 21% guessed DEATH_EVENT 0 correctly
# 3%  guessed DEATH_EVENT 1 incorrectly
# 10% guessed DEATH_EVENT 0 incorrectly
# the classification report shows that the model has a greater ability to guess survival (DEATH_EVENT=0) better
# Ref: https://muthu.co/understanding-the-classification-report-in-sklearn/

# Print the column names of the current feature frame X.
print('Selected features for Decision Tree {}:'.format(X.columns.to_list()))