## Importing Packages and Data

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, accuracy_score, log_loss

In [97]:
test_df = pd.read_csv('test.csv', error_bad_lines=False)
train_df = pd.read_csv('training.csv')

b'Skipping line 17772: expected 15 fields, saw 24\nSkipping line 35549: expected 15 fields, saw 18\nSkipping line 47486: expected 15 fields, saw 24\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
sample_df = pd.read_csv('sample_submission.csv', error_bad_lines=False)
xente_df = pd.read_csv('Xente_Variable_Definitions.csv')

In [99]:
sample_df.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,
1,TransactionId_95109,
2,TransactionId_47357,
3,TransactionId_28185,
4,TransactionId_22140,


In [100]:
xente_df

Unnamed: 0,Column Name,Definition
0,TransactionId,Unique �transaction identifier on platform
1,BatchId,Unique number assigned to a batch of transacti...
2,AccountId,Unique number identifying the customer on plat...
3,SubscriptionId,Unique number identifying the customer subscri...
4,CustomerId,Unique identifier attached to Account
5,CurrencyCode,Country currency
6,CountryCode,Numerical geographical code of country
7,ProviderId,Source provider of Item �bought.
8,ProductId,Item name being bought.
9,ProductCategory,ProductIds are organized into these broader pr...


In [101]:
print(train_df.shape)
print(test_df.shape)

(95662, 16)
(78175, 15)


In [102]:
train_df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [103]:
test_df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,TransactionId_50600,BatchId_35028,AccountId_2441,SubscriptionId_4426,CustomerId_2857,UGX,256,ProviderId_5,ProductId_3,airtime,ChannelId_3,1000.0,1000.0,2019-02-13T10:01:40Z,4.0
1,TransactionId_95109,BatchId_45139,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_5,ProductId_15,financial_services,ChannelId_3,2000.0,2000.0,2019-02-13T10:02:12Z,2.0
2,TransactionId_47357,BatchId_74887,AccountId_4841,SubscriptionId_3829,CustomerId_2857,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50.0,2019-02-13T10:02:30Z,2.0
3,TransactionId_28185,BatchId_11025,AccountId_2685,SubscriptionId_4626,CustomerId_3105,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,3000.0,3000.0,2019-02-13T10:02:38Z,4.0
4,TransactionId_22140,BatchId_29804,AccountId_4841,SubscriptionId_3829,CustomerId_3105,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-60.0,60.0,2019-02-13T10:02:58Z,2.0


## **Preprocessing**


In [104]:
train_df['FraudResult'].value_counts()

0    95469
1      193
Name: FraudResult, dtype: int64

In [105]:
train_df['FraudResult'].unique()

array([0, 1])

In [106]:
train_df.isnull().sum()

TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
dtype: int64

In [107]:
test_df.isnull().sum()

TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             1
ProviderId              1
ProductId               1
ProductCategory         2
ChannelId               3
Amount                  4
Value                   6
TransactionStartTime    6
PricingStrategy         6
dtype: int64

In [0]:
test_df['CountryCode'].fillna(method='backfill', inplace=True)
test_df['ProviderId'].fillna(method='backfill', inplace=True)
test_df['ProductId'].fillna(method='backfill', inplace=True)
test_df['ProductCategory'].fillna(method='backfill', inplace=True)
test_df['ChannelId'].fillna(method='backfill', inplace=True)
test_df['Amount'].fillna(method='backfill', inplace=True)
test_df['Value'].fillna(method='backfill', inplace=True)
test_df['TransactionStartTime'].fillna(method='backfill', inplace=True)
test_df['PricingStrategy'].fillna(method='backfill', inplace=True)

In [109]:
test_df.isnull().sum()

TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
dtype: int64

Creating Sample set from data

In [0]:
trainSample_df = train_df.sample(n=9500, random_state=42)
testSample_df = test_df.sample(n=7800, random_state=42)

In [0]:
totalSample_df = pd.concat([trainSample_df, testSample_df], axis=0, sort=True)
total_df = pd.concat([train_df, test_df], axis=0, sort=True)

In [112]:
# totalSample_df.columns
total_df.columns

Index(['AccountId', 'Amount', 'BatchId', 'ChannelId', 'CountryCode',
       'CurrencyCode', 'CustomerId', 'FraudResult', 'PricingStrategy',
       'ProductCategory', 'ProductId', 'ProviderId', 'SubscriptionId',
       'TransactionId', 'TransactionStartTime', 'Value'],
      dtype='object')

In [113]:
def display_all(df):
    with pd.option_context("display.max_rows", 500, "display.max_columns", 500): 
        display(df)

        
display_all(total_df.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
AccountId,173837,4844.0,AccountId_4841,52529.0,,,,,,,
Amount,173837,,,,5792.28,111365.0,-2500000.0,-50.0,1000.0,3800.0,9880000.0
BatchId,173837,139495.0,BatchId_67019,28.0,,,,,,,
ChannelId,173837,7.0,ChannelId_3,125351.0,,,,,,,
CountryCode,173837,6.0,256,108301.0,,,,,,,
CurrencyCode,173837,6.0,UGX,173831.0,,,,,,,
CustomerId,173837,7484.0,CustomerId_7343,4091.0,,,,,,,
FraudResult,95662,,,,0.00201752,0.0448717,0.0,0.0,0.0,0.0,1.0
PricingStrategy,173837,,,,2.26249,0.742391,0.0,2.0,2.0,2.0,4.0
ProductCategory,173837,13.0,financial_services,82684.0,,,,,,,


### Replacing Erroneous Data Values

**CountryCode**

In [114]:
total_df['CountryCode'].value_counts()

256                   108301
256                    65531
financial_services         2
16000                      1
ChannelId_3                1
-250                       1
Name: CountryCode, dtype: int64

In [0]:
total_df['CountryCode'] = '256'

In [116]:
total_df['CountryCode'].value_counts()

256    173837
Name: CountryCode, dtype: int64

**CurrencyCode**

In [117]:
total_df['CurrencyCode'].value_counts()

UGX                   173831
ProductId_6                2
16000                      1
2                          1
financial_services         1
ChannelId_3                1
Name: CurrencyCode, dtype: int64

In [0]:
total_df['CurrencyCode'] = 'UGX'

In [119]:
total_df['CurrencyCode'].value_counts()

UGX    173837
Name: CurrencyCode, dtype: int64

**ProductCategory**

In [120]:
total_df['ProductCategory'].value_counts()

financial_services      82684
airtime                 81779
data_bundles             3235
utility_bill             3206
tv                       2067
movies                    299
retail                    291
ticket                    239
transport                  31
50                          2
other                       2
2019-02-15T17:29:44Z        1
2                           1
Name: ProductCategory, dtype: int64

In [0]:
total_df[total_df['ProductCategory'] == '50']['ProductCategory'] = 'financial_services'
total_df[total_df['ProductCategory'] == '2']['ProductCategory']  = 'financial_services'
total_df[total_df['ProductCategory'] == '2019-02-15T17:29:44Z']['ProductCategory']  = 'airtime'

In [122]:
total_df['ProductCategory'].value_counts()

financial_services      82684
airtime                 81779
data_bundles             3235
utility_bill             3206
tv                       2067
movies                    299
retail                    291
ticket                    239
transport                  31
50                          2
other                       2
2019-02-15T17:29:44Z        1
2                           1
Name: ProductCategory, dtype: int64

**ProviderId**

In [123]:
total_df['ProviderId'].value_counts()

ProviderId_4            69683
ProviderId_6            61630
ProviderId_5            26710
ProviderId_1             9879
ProviderId_3             5901
ProviderId_2               29
ChannelId_3                 2
2019-03-07T19:28:45Z        1
250                         1
10000                       1
Name: ProviderId, dtype: int64

In [0]:
total_df[total_df['ProviderId'] == 'ChannelId_3']['ProviderId']  = 'ProviderId_4'
total_df[total_df['ProviderId'] == '2019-03-07T19:28:45Z']['ProviderId'] = 'ProviderId_6'
total_df[total_df['ProviderId'] == '250']['ProviderId'] = 'ProviderId_6'
total_df[total_df['ProviderId'] == '10000']['ProviderId'] = 'ProviderId_4'
#total_df[total_df['ProviderId'] == 'airtime'] = 'ProviderId_4'
#total_df[total_df['ProviderId'] == 'financial_services'] = 'ProviderId_6'

In [125]:
total_df['ProviderId'].value_counts()

ProviderId_4            69683
ProviderId_6            61630
ProviderId_5            26710
ProviderId_1             9879
ProviderId_3             5901
ProviderId_2               29
ChannelId_3                 2
2019-03-07T19:28:45Z        1
250                         1
10000                       1
Name: ProviderId, dtype: int64

**ProductId**

In [32]:
total_df['ProductId'].value_counts()

ProductId_6     58326
ProductId_3     42309
ProductId_10    30434
ProductId_15    22637
ProductId_1      5029
ProductId_4      3907
ProductId_11     2916
ProductId_21     2504
ProductId_14     1639
ProductId_19     1529
ProductId_13      471
ProductId_8       327
ProductId_2       307
ProductId_24      299
ProductId_26      291
ProductId_20      239
ProductId_22      231
ProductId_7       163
ProductId_16       80
ProductId_25       62
ProductId_27       48
ProductId_5        31
ProductId_9        20
ProductId_18       14
ProductId_17       12
ProductId_23        5
ProviderId_6        4
ProductId_12        2
ProviderId_4        1
Name: ProductId, dtype: int64

In [0]:
total_df[total_df['ProductId'] == '-50'] = 'ProductId_6'
total_df[total_df['ProductId'] == '10000'] = 'ProductId_6'
total_df[total_df['ProductId'] == '2'] = 'ProductId_3'
total_df[total_df['ProductId'] == '2019-02-21T00:13:20Z'] = 'ProductId_10'

In [34]:
total_df['ProductId'].value_counts()

ProductId_6     58326
ProductId_3     42309
ProductId_10    30434
ProductId_15    22637
ProductId_1      5029
ProductId_4      3907
ProductId_11     2916
ProductId_21     2504
ProductId_14     1639
ProductId_19     1529
ProductId_13      471
ProductId_8       327
ProductId_2       307
ProductId_24      299
ProductId_26      291
ProductId_20      239
ProductId_22      231
ProductId_7       163
ProductId_16       80
ProductId_25       62
ProductId_27       48
ProductId_5        31
ProductId_9        20
ProductId_18       14
ProductId_17       12
ProductId_23        5
ProviderId_6        4
ProductId_12        2
ProviderId_4        1
Name: ProductId, dtype: int64

In [25]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

        
display_all(total_df.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
AccountId,173837,4844.0,AccountId_4841,52529.0,,,,,,,
Amount,173837,,,,5792.28,111365.0,-2500000.0,-50.0,1000.0,3800.0,9880000.0
BatchId,173837,139495.0,BatchId_67019,28.0,,,,,,,
ChannelId,173837,7.0,ChannelId_3,125351.0,,,,,,,
CountryCode,173837,1.0,256,173837.0,,,,,,,
CurrencyCode,173837,1.0,UGX,173837.0,,,,,,,
CustomerId,173837,7484.0,CustomerId_7343,4091.0,,,,,,,
FraudResult,95662,,,,0.00201752,0.0448717,0.0,0.0,0.0,0.0,1.0
PricingStrategy,173837,,,,2.26249,0.742391,0.0,2.0,2.0,2.0,4.0
ProductCategory,173837,13.0,financial_services,82684.0,,,,,,,


In [26]:
total_df['FraudResult'].value_counts()

0.0    95469
1.0      193
Name: FraudResult, dtype: int64

In [0]:
total_df[total_df['FraudResult'] == 'ProviderId_6'] = 0.0
total_df[total_df['FraudResult'] == 'ProviderId_4'] = 0.0

### Encoding Categorical Variables

In [0]:
# choosing categorical variables that need to be encoded
categorical = ['ChannelId', 'ProviderId', 'ProductId', 'ProductCategory']

for var in categorical:
    total_df = pd.concat([total_df, 
                    pd.get_dummies(total_df[var], prefix=var)], axis=1)
    del total_df[var]

In [0]:
categoricalSample = ['ChannelId', 'ProviderId', 'ProductId', 'ProductCategory']

for var in categoricalSample:
    totalSample_df = pd.concat([totalSample_df, 
                    pd.get_dummies(totalSample_df[var], prefix=var)], axis=1)
    del totalSample_df[var]

In [29]:
total_df.head()

Unnamed: 0,AccountId,Amount,BatchId,CountryCode,CurrencyCode,CustomerId,FraudResult,PricingStrategy,SubscriptionId,TransactionId,TransactionStartTime,Value,ChannelId_2,ChannelId_2019-03-04T09:10:00Z,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_4,ChannelId_ChannelId_5,ProviderId_10000,ProviderId_2019-03-07T19:28:45Z,ProviderId_250,ProviderId_ChannelId_3,ProviderId_ProviderId_1,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,ProductId_-50,ProductId_10000,ProductId_2,ProductId_2019-02-21T00:13:20Z,ProductId_ProductId_1,ProductId_ProductId_10,ProductId_ProductId_11,ProductId_ProductId_12,ProductId_ProductId_13,ProductId_ProductId_14,ProductId_ProductId_15,ProductId_ProductId_16,ProductId_ProductId_17,ProductId_ProductId_18,ProductId_ProductId_19,ProductId_ProductId_2,ProductId_ProductId_20,ProductId_ProductId_21,ProductId_ProductId_22,ProductId_ProductId_23,ProductId_ProductId_24,ProductId_ProductId_25,ProductId_ProductId_26,ProductId_ProductId_27,ProductId_ProductId_3,ProductId_ProductId_4,ProductId_ProductId_5,ProductId_ProductId_6,ProductId_ProductId_7,ProductId_ProductId_8,ProductId_ProductId_9,ProductCategory_2,ProductCategory_2019-02-15T17:29:44Z,ProductCategory_50,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_retail,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
0,AccountId_3957,1000.0,BatchId_36123,256,UGX,CustomerId_4406,0.0,2.0,SubscriptionId_887,TransactionId_76871,2018-11-15T02:18:49Z,1000.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,AccountId_4841,-20.0,BatchId_15642,256,UGX,CustomerId_4406,0.0,2.0,SubscriptionId_3829,TransactionId_73770,2018-11-15T02:19:08Z,20.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,AccountId_4229,500.0,BatchId_53941,256,UGX,CustomerId_4683,0.0,2.0,SubscriptionId_222,TransactionId_26203,2018-11-15T02:44:21Z,500.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,AccountId_648,20000.0,BatchId_102363,256,UGX,CustomerId_988,0.0,2.0,SubscriptionId_2185,TransactionId_380,2018-11-15T03:32:55Z,21800.0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,AccountId_4841,-644.0,BatchId_38780,256,UGX,CustomerId_988,0.0,2.0,SubscriptionId_3829,TransactionId_28195,2018-11-15T03:34:21Z,644.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


Drop unnecessary columns

In [0]:
total_df.drop(['AccountId', 'BatchId', 'CustomerId', 'CurrencyCode', 'SubscriptionId', 'TransactionId', 'TransactionStartTime'], axis=1, inplace=True)
totalSample_df.drop(['AccountId', 'BatchId', 'CustomerId', 'CurrencyCode', 'SubscriptionId', 'TransactionId', 'TransactionStartTime'], axis=1, inplace=True)

In [67]:
total_df.head()

Unnamed: 0,Amount,CountryCode,FraudResult,PricingStrategy,Value,ChannelId_0.0,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_4,ChannelId_ChannelId_5,ProviderId_0.0,ProviderId_ProviderId_1,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,ProductId_0.0,ProductId_ProductId_1,ProductId_ProductId_10,ProductId_ProductId_11,ProductId_ProductId_12,ProductId_ProductId_13,ProductId_ProductId_14,ProductId_ProductId_15,ProductId_ProductId_16,ProductId_ProductId_17,ProductId_ProductId_18,ProductId_ProductId_19,ProductId_ProductId_2,ProductId_ProductId_20,ProductId_ProductId_21,ProductId_ProductId_22,ProductId_ProductId_23,ProductId_ProductId_24,ProductId_ProductId_25,ProductId_ProductId_26,ProductId_ProductId_27,ProductId_ProductId_3,ProductId_ProductId_4,ProductId_ProductId_5,ProductId_ProductId_6,ProductId_ProductId_7,ProductId_ProductId_8,ProductId_ProductId_9,ProductCategory_0.0,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_retail,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
0,1000,256,0,2,1000,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,-20,256,0,2,20,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,500,256,0,2,500,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,20000,256,0,2,21800,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,-644,256,0,2,644,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0


## Model Building

Train/Test Split

In [0]:
sampleTrain = totalSample_df[pd.notnull(totalSample_df['FraudResult'])]
sample_X_test = totalSample_df[pd.isnull(totalSample_df['FraudResult'])].drop(['FraudResult'], axis=1)

train = total_df[pd.notnull(total_df['FraudResult'])]
X_test = total_df[pd.isnull(total_df['FraudResult'])].drop(['FraudResult'], axis=1)

Validation Set

In [0]:
sample_X_train, sample_X_val, sample_y_train, sample_y_val = train_test_split(
    sampleTrain.drop(['FraudResult'], axis=1),
    sampleTrain['FraudResult'],
    test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(
    train.drop(['FraudResult'], axis=1),
    train['FraudResult'],
    test_size=0.2, random_state=42)

In [33]:
for i in [X_train, X_val, X_test]:
    print(type(i))

#print(type(y_train))
#print(type(y_val))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [34]:
X_train.head()

Unnamed: 0,Amount,CountryCode,PricingStrategy,Value,ChannelId_2,ChannelId_2019-03-04T09:10:00Z,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_4,ChannelId_ChannelId_5,ProviderId_10000,ProviderId_2019-03-07T19:28:45Z,ProviderId_250,ProviderId_ChannelId_3,ProviderId_ProviderId_1,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,ProductId_-50,ProductId_10000,ProductId_2,ProductId_2019-02-21T00:13:20Z,ProductId_ProductId_1,ProductId_ProductId_10,ProductId_ProductId_11,ProductId_ProductId_12,ProductId_ProductId_13,ProductId_ProductId_14,ProductId_ProductId_15,ProductId_ProductId_16,ProductId_ProductId_17,ProductId_ProductId_18,ProductId_ProductId_19,ProductId_ProductId_2,ProductId_ProductId_20,ProductId_ProductId_21,ProductId_ProductId_22,ProductId_ProductId_23,ProductId_ProductId_24,ProductId_ProductId_25,ProductId_ProductId_26,ProductId_ProductId_27,ProductId_ProductId_3,ProductId_ProductId_4,ProductId_ProductId_5,ProductId_ProductId_6,ProductId_ProductId_7,ProductId_ProductId_8,ProductId_ProductId_9,ProductCategory_2,ProductCategory_2019-02-15T17:29:44Z,ProductCategory_50,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_retail,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
59096,-100.0,256,2.0,100.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
35991,5000.0,256,2.0,5000.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
69457,1000.0,256,2.0,1000.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
58498,-70.0,256,2.0,70.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
88790,20000.0,256,2.0,20000.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [35]:
for i in [X_train, X_val, X_test]:
    print(i.shape)

(76529, 65)
(19133, 65)
(78175, 65)


### Random Forest Classifier

In [0]:
rfc1 = RandomForestClassifier(random_state=42)

Training the data

In [37]:
y_val.value_counts()

0.0    19097
1.0       36
Name: FraudResult, dtype: int64

In [38]:
rfc1.fit(sample_X_train, sample_y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [40]:
rfc1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [0]:
'''#sample_y_hat = rfc1.predict(sample_X_test)

y_hat = rfc1.predict(X_test)'''

In [0]:
'''#sample_y_proba = rfc1.predict_proba(sample_X_test)

y_proba = rfc1.predict_proba(X_test)'''

In [0]:
'''y_hat_star = np.where(y_proba[:,1] > 0.004, 1, 0)'''

In [46]:
'''count = 0
for i in y_hat_star:
  if y_hat_star[i] == 1:
    count += 1
    
print(count)'''

0


Testing the model

In [47]:
accuracy_score(y_val, rfc1.predict(X_val))

0.9997909371243402

### Cross Validation

In [0]:
X_train = pd.concat([X_train, X_val])
y_train = pd.concat([y_train, y_val])

In [49]:
y_train.shape

(95662,)

In [52]:
rfc2 = RandomForestClassifier(n_estimators=10, random_state=42)
cross_val_score(rfc2, X_train, y_train, cv=10)

array([0.99895474, 0.9995819 , 0.99968642, 0.99926824, 0.99937278,
       0.99958185, 0.99937278, 0.99926824, 0.99958185, 0.99989545])

In [53]:
cross_val_score(rfc2, X_train, y_train, cv=5).mean()

0.9994146113813652

### Hyper-parameter tuning

In [0]:
# creating the grid for GridSearch optimisor
n_estimators = [10, 25, 50, 100, 1000, 2000]
max_depth = [None, 5, 10, 15, 25]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

In [0]:
rfc3 = RandomForestClassifier(random_state=42)

In [0]:
# perform grid search to output the best possible estimator with best possible parameters
grid = GridSearchCV(estimator=rfc3, 
                    param_grid=param_grid,
                    cv=3,
                    verbose=2,
                    n_jobs=-1)

In [61]:
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 17.6min finished


In [62]:
print(grid_result.best_estimator_)
print(grid_result.best_params_)
print(grid_result.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
{'max_depth': 10, 'n_estimators': 50}
0.9994668729485062


## Predicting Results for test data

In [0]:
rfc4 = grid_result.best_estimator_

In [64]:
# testSample CV score
cross_val_score(rfc4, sample_X_train, sample_y_train, cv=5).mean()


0.9992106125961702

In [65]:
# testFull CV score
cross_val_score(rfc4, X_train, y_train, cv=5).mean()

0.9994041582375823

In [0]:
# saving preditcions to new column in test dataframes

#testSample_df['FraudResult'] = rfc4.predict(sample_X_test)
test_df['FraudResult'] = rfc4.predict(X_test)


In [0]:
testSample_df['FraudResult'].value_counts()

In [69]:
test_df['FraudResult'].value_counts()

0.0    78072
1.0      103
Name: FraudResult, dtype: int64

In [70]:
test_df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_50600,BatchId_35028,AccountId_2441,SubscriptionId_4426,CustomerId_2857,UGX,256,ProviderId_5,ProductId_3,airtime,ChannelId_3,1000.0,1000.0,2019-02-13T10:01:40Z,4.0,0.0
1,TransactionId_95109,BatchId_45139,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_5,ProductId_15,financial_services,ChannelId_3,2000.0,2000.0,2019-02-13T10:02:12Z,2.0,0.0
2,TransactionId_47357,BatchId_74887,AccountId_4841,SubscriptionId_3829,CustomerId_2857,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50.0,2019-02-13T10:02:30Z,2.0,0.0
3,TransactionId_28185,BatchId_11025,AccountId_2685,SubscriptionId_4626,CustomerId_3105,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,3000.0,3000.0,2019-02-13T10:02:38Z,4.0,0.0
4,TransactionId_22140,BatchId_29804,AccountId_4841,SubscriptionId_3829,CustomerId_3105,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-60.0,60.0,2019-02-13T10:02:58Z,2.0,0.0


## Preparing for Submission

In [0]:
submission_df = test_df.drop(['BatchId', 'AccountId',	'SubscriptionId',	'CustomerId',	'CurrencyCode',	'CountryCode',	'ProviderId',	'ProductId',	'ProductCategory',	'ChannelId',	'Amount',	'Value',	'TransactionStartTime',	'PricingStrategy'], axis = 1)

In [81]:
submission_df.head() #[submission_df['TransactionId']=='']

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,0.0
1,TransactionId_95109,0.0
2,TransactionId_47357,0.0
3,TransactionId_28185,0.0
4,TransactionId_22140,0.0


In [0]:
# manually inputting 3 missing data points
df_append = pd.DataFrame({"TransactionId":['TransactionId_45455', 'TransactionId_33520','TransactionId_10014'], "FraudResult":[0.0, 0.0, 0.0]}) 

In [94]:
df_append

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_45455,0.0
1,TransactionId_33520,0.0
2,TransactionId_10014,0.0


In [0]:
submissionUpdated = submission_df.append(df_append, ignore_index = True)

In [79]:
sample_df.shape

(45019, 2)

In [0]:
submissionUpdated.to_csv('damonSubmission4.csv', index=False)