In [47]:
import pandas as pd 
import numpy as np

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

plt.style.use('ggplot')

In [32]:
#download and explore dataset
df = pd.read_csv('https://dl.dropboxusercontent.com/s/4fh13p7t6v4rmuh/datasets_20470_26463_creditcardcsvpresent.csv?dl=0')
df.head()

Unnamed: 0,Merchant_id,Transaction date,Average Amount/transaction/day,Transaction_amount,Is declined,Total Number of declines/day,isForeignTransaction,isHighRiskCountry,Daily_chargeback_avg_amt,6_month_avg_chbk_amt,6-month_chbk_freq,isFradulent
0,3160040998,,100.0,3000.0,N,5,Y,Y,0,0.0,0,Y
1,3160040998,,100.0,4300.0,N,5,Y,Y,0,0.0,0,Y
2,3160041896,,185.5,4823.0,Y,5,N,N,0,0.0,0,Y
3,3160141996,,185.5,5008.5,Y,8,N,N,0,0.0,0,Y
4,3160241992,,500.0,26000.0,N,0,Y,Y,800,677.2,6,Y


In [33]:
df.columns

Index(['Merchant_id', 'Transaction date', 'Average Amount/transaction/day',
       'Transaction_amount', 'Is declined', 'Total Number of declines/day',
       'isForeignTransaction', 'isHighRiskCountry', 'Daily_chargeback_avg_amt',
       '6_month_avg_chbk_amt', '6-month_chbk_freq', 'isFradulent'],
      dtype='object')

In [34]:
#Remove the transaction_date column

df = df.drop('Transaction date',axis=1)

In [35]:
#Now, encode the values of N/Y to 0/1 using LabelEncoder

encode_cols = ['Is declined', 'isForeignTransaction','isHighRiskCountry', 'isFradulent']

#Reference to the df with these 4 specific columns only
#df[encode_cols] will access all the rows but only these 4 columns. apply the LabelEncoder which automatically encodes all strings to a number.
#Must fit the encoder before being able to use it to transform 
df[encode_cols] = df[encode_cols].apply(LabelEncoder().fit_transform)

In [36]:
df[encode_cols]

Unnamed: 0,Is declined,isForeignTransaction,isHighRiskCountry,isFradulent
0,0,1,1,1
1,0,1,1,1
2,1,0,0,1
3,1,0,0,1
4,0,1,1,1
...,...,...,...,...
3070,1,0,0,0
3071,1,0,0,0
3072,1,0,0,0
3073,1,1,1,0


In [60]:
#This sets y to be the values in column 'IsFradulent', while deleting this column in the original dataframe
y = df.pop('isFradulent')

KeyError: ignored

In [63]:
#Drop the Merchant_id column as it is not an important feature. 
#Including this feature may lower accuracy, as the model is training on data that does not determine if something is fradulant in the first place (Merchant_id)

x = df.drop('Merchant_id', axis=1)

In [64]:
x_train, x_test , y_train , y_test = train_test_split(x,y,test_size=0.3)

In [65]:
model = LogisticRegression()

model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
score = model.score(x_test,y_test)
#This is the same as:
#accuracy_score(y_test,model.predict(x_test))
#Just that it's easier to do it this way


print(f'Score is: {(score*100).round(5)}')

Score is: 98.37486


In [103]:
inputs = {}

for i in x.columns:
  val = input(f'What is the {i}:')
  #Must convert to float so the model can predict it (Since the model was trained on float values)
  inputs[i] = float(val)

What is the Average Amount/transaction/day:100
What is the Transaction_amount:500000
What is the Is declined:1
What is the Total Number of declines/day:0
What is the isForeignTransaction:1
What is the isHighRiskCountry:1
What is the Daily_chargeback_avg_amt:010
What is the 6_month_avg_chbk_amt:12
What is the 6-month_chbk_freq:2


In [104]:
inputs

{'6-month_chbk_freq': 2.0,
 '6_month_avg_chbk_amt': 12.0,
 'Average Amount/transaction/day': 100.0,
 'Daily_chargeback_avg_amt': 10.0,
 'Is declined': 1.0,
 'Total Number of declines/day': 0.0,
 'Transaction_amount': 500000.0,
 'isForeignTransaction': 1.0,
 'isHighRiskCountry': 1.0}

In [105]:
own_eg = []

for i in inputs.values():
  own_eg.append(i)

own_eg

[100.0, 500000.0, 1.0, 0.0, 1.0, 1.0, 10.0, 12.0, 2.0]

In [107]:
own_eg_pred = model.predict([own_eg])[0]

if own_eg_pred == 0:
  print('It is not fradulant!')
else:
  print('Fradulent!')

Fradulent!
