In [2]:
import seaborn as sns
import pandas as pd
import numpy as np

fraud = pd.read_csv("/content/Fraud_Data.csv")
location = pd.read_csv("/content/IpAddress_to_Country.csv")
fraud.head(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [3]:
#convert scientific notation and float to readable integer
def sci_to_int(d):
  iplist = []
  for i in range(len(d)):
    iplist.append(int(d[i]))
  return iplist

In [4]:
fraud['ip_address'] = sci_to_int(fraud['ip_address'])
location['lower_bound_ip_address'] = sci_to_int(location['lower_bound_ip_address'])

In [5]:
def get_country(ip) :
  try:
    return location.country[(location.lower_bound_ip_address <= ip) & (location.upper_bound_ip_address >= ip)].iloc[0]
  except IndexError :
    "empty"

In [6]:
fraud["country"] = fraud.ip_address.apply(get_country)

In [7]:
fraud.head(10)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443,0,
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,0,United States
5,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315199,0,Canada
6,50116,2015-08-01 22:40:52,2015-08-27 03:37:57,11,IWKVZHJOCLPUR,Ads,Chrome,F,19,3987484328,0,
7,360585,2015-04-06 07:35:45,2015-05-25 17:21:14,27,HPUCUYLMJBYFW,Ads,Opera,M,34,1692458727,0,United States
8,159045,2015-04-21 23:38:34,2015-06-02 14:01:54,30,ILXYDOZIHOOHT,SEO,IE,F,43,3719094257,0,China
9,182338,2015-01-25 17:49:49,2015-03-23 23:05:42,62,NRFFPPHZYFUVC,Ads,IE,M,31,341674739,0,United States


In [17]:
#calculer temps entre l'inscription et l'achat
import time as timelibrary
import datetime

def get_delta(end,start):
  list_duration = []
  for i in range(len(end)):
    end_date = timelibrary.mktime(datetime.datetime.strptime(end[i], "%Y-%m-%d %H:%M:%S").timetuple())
    start_date = timelibrary.mktime(datetime.datetime.strptime(start[i], "%Y-%m-%d %H:%M:%S").timetuple())
    list_duration.append(end_date - start_date)
  return list_duration

In [23]:
# Prepare X axis
X = fraud.drop(['signup_time','purchase_time','device_id','user_id','device_id','class'],axis="columns")
X['signup_purchase_delta'] = sci_to_int(get_delta(fraud['purchase_time'],fraud['signup_time']))
# Transform valuable strings data

X['browser'] = fraud['browser'].astype('category').cat.codes
X['sex'] = fraud['sex'].astype('category').cat.codes
X['source'] = fraud['source'].astype('category').cat.codes
X['country'] = fraud['country'].astype('category').cat.codes
X.head(5)

Unnamed: 0,purchase_value,source,browser,sex,age,ip_address,country,signup_purchase_delta
0,34,2,0,1,39,732758368,84,4506682
1,16,0,0,0,53,350311387,171,17944
2,15,2,3,1,53,2621473820,171,1
3,44,2,4,1,41,3840542443,-1,492085
4,39,0,4,1,45,415583117,171,4361461


In [25]:
# Prepare Target
Y = fraud['class']
Y.head()

0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64

In [35]:
# Split train test
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=55, stratify=Y)

In [36]:
rf = rfc()
rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [30]:
# Accuracy
def accuracy(preds, target):
    M = target.shape[0] # Nombre de lignes
    total_correctes = (preds == target).sum()
    return total_correctes / M # Accuracy

In [37]:
predictions = rf.predict(X_train)
accuracy(predictions, Y_train)

0.9999117652249104