In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import datetime as dt
import io

# Let wide frames render up to 500 columns instead of being truncated.
pd.options.display.max_columns = 500

In [2]:
# Read the training CSV (parsing `dob` as a date) and discard the listed columns
# in a single chained expression.
data = pd.read_csv("train.csv", parse_dates=["dob"]).drop(
    columns=[
        "trans_date_trans_time",
        "cc_num",
        "first",
        "last",
        "street",
        "lat",
        "long",
        "trans_num",
        "unix_time",
    ]
)

In [3]:
# Reference "current time" used for both the century fix and the age calculation.
now = pd.Timestamp("now")

# Birth dates that land in the future are shifted back one century
# (presumably two-digit years in the CSV were parsed into the wrong century —
# verify against the raw file).
# pd.DateOffset is used because year-unit timedeltas such as
# np.timedelta64(100, 'Y') are ambiguous durations and are rejected by pandas >= 2.0.
dob = data["dob"]
dob = dob.where(dob < now, dob - pd.DateOffset(years=100))

# Age in whole years, kept as a float column. Casting a timedelta column to
# '<m8[Y]' is not supported by pandas >= 2.0, so divide the elapsed days by the
# mean Gregorian year length (365.2425 days, the length NumPy uses for 'Y').
data["age"] = np.floor((now - dob).dt.days / 365.2425)
data.drop(columns=["dob"], inplace=True)
data.head()

Unnamed: 0,id_txn,merchant,category,amt,gender,city,state,zip,city_pop,job,merch_lat,merch_long,is_fraud,age
0,TX000881306,fraud_Parker-Kunde,personal_care,55.78,F,Jordanville,NY,13361,824,Travel agency manager,41.918721,-74.367377,0,35.0
1,TX001044619,fraud_Schmitt Inc,gas_transport,58.46,M,Burbank,OK,74633,471,Sub,36.919963,-97.675321,0,79.0
2,TX000885984,fraud_Bednar Group,misc_net,3.9,M,Roseland,NE,68973,463,"Nurse, mental health",40.036737,-97.720023,0,72.0
3,TX000393423,fraud_Thiel-Thiel,entertainment,54.23,F,Gadsden,AL,35901,67082,Physiological scientist,33.374831,-86.541588,0,57.0
4,TX001009052,fraud_Predovic Inc,shopping_net,41.28,M,Arcadia,MI,49613,680,"Designer, jewellery",45.429028,-85.833936,0,54.0


In [4]:
# Columns pulled from the prepared frame: the transaction id plus the numeric inputs.
var = [
    "id_txn",
    "amt",
    "city_pop",
    "age",
]

In [5]:
# Target labels and the candidate columns (id included).
y = data["is_fraud"]
xtemp = data[var]

# The transaction id is not a model input: keep every other selected column.
x2 = list(filter(lambda col: col != "id_txn", xtemp.columns))
x = data[x2]
x.head()

Unnamed: 0,amt,city_pop,age
0,55.78,824,35.0
1,58.46,471,79.0
2,3.9,463,72.0
3,54.23,67082,57.0
4,41.28,680,54.0


## Splitting data

In [6]:
# Hold out 30% of the rows for evaluation.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts; stratifying on y keeps the class ratio of
# is_fraud the same in the training and evaluation splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=42, stratify=y
)

## Standard Scaler

In [7]:
# Scaler instance with default settings; it is fitted on the training split below.
sc = StandardScaler()

In [8]:
# Learn the scaling statistics from the training split only, then apply that same
# fitted transform to the training and the held-out features.
sc.fit(x_train)
xs, xt = (sc.transform(split) for split in (x_train, x_test))

## Perceptron

In [9]:
# Train a perceptron on the standardized training features.
# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the estimator repr as the cell output.
ppn = Perceptron(n_jobs=-1).fit(xs, y_train)
ppn

Perceptron(n_jobs=-1)

In [10]:
# Predicted labels for the held-out split, using its standardized features (xt).
y_pred = ppn.predict(xt)

In [11]:
# Share of held-out rows classified correctly, shown as a percentage.
# NOTE(review): if is_fraud is heavily imbalanced, accuracy alone says little about
# how well fraud is detected — consider also reporting precision/recall.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("accuracy: {0:.2f}%".format(accuracy_pct))

accuracy: 99.27%


In [12]:
# Estimator.score() on a classifier reports mean accuracy on (xt, y_test), so this
# is the same figure as the accuracy printed above as a fraction — despite its
# name, `y_prob` does not hold a probability.
y_prob = ppn.score(xt,y_test)
y_prob

0.9926992824143781

## With test data

In [13]:
# Reference time for the age calculation on the unseen data.
now = pd.Timestamp("now")

# Read the test CSV (parsing `dob` as a date) and discard the listed columns
# in a single chained expression.
test = pd.read_csv("test.csv", parse_dates=["dob"]).drop(
    columns=[
        "trans_date_trans_time",
        "cc_num",
        "first",
        "last",
        "street",
        "lat",
        "long",
        "trans_num",
        "unix_time",
    ]
)

In [14]:
# Birth dates that land in the future are shifted back one century
# (presumably two-digit years in the CSV were parsed into the wrong century —
# verify against the raw file).
# pd.DateOffset is used because year-unit timedeltas such as
# np.timedelta64(100, 'Y') are ambiguous durations and are rejected by pandas >= 2.0.
test_dob = test["dob"]
test_dob = test_dob.where(test_dob < now, test_dob - pd.DateOffset(years=100))

# Age in whole years, kept as a float column. Casting a timedelta column to
# '<m8[Y]' is not supported by pandas >= 2.0, so divide the elapsed days by the
# mean Gregorian year length (365.2425 days, the length NumPy uses for 'Y').
test["age"] = np.floor((now - test_dob).dt.days / 365.2425)
test.drop(columns=["dob"], inplace=True)

In [15]:
# Same column selection as for the training data: the id plus the numeric inputs.
var = ["id_txn", "amt", "city_pop", "age"]
xtemp = test[var]

# Model inputs are every selected column except the transaction id.
# `columns=` already names the axis, so no `axis` argument is passed (the
# previous `axis=True` was a boolean where an axis is expected and was ignored).
x = xtemp.drop(columns=["id_txn"])
x.head()

Unnamed: 0,amt,city_pop,age
0,56.93,23805,50.0
1,9.4,1656,51.0
2,6.02,516,54.0
3,21.52,3688,47.0
4,269.34,237282,59.0


In [16]:
# The perceptron was trained on standardized features (sc.transform output), so
# the unseen data must pass through the same fitted scaler before predicting;
# feeding raw amounts/populations would put the inputs on a different scale
# than the learned weights expect.
testing_pred = ppn.predict(sc.transform(x))

In [17]:
# DataFrame.to_csv returns None when given a path, so chaining it onto the
# constructor left `prediction` bound to None. Build the frame first, then write
# it out in a separate step so `prediction` stays inspectable.
prediction = pd.DataFrame(testing_pred, columns=['predictions'])
prediction.to_csv('prediction.csv')