In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from sklearn.metrics import f1_score
from sklearn.svm import SVC

In [3]:
fraud_df = pd.read_csv('https://assets-datascientest.s3-eu-west-1.amazonaws.com/de/total/fraud.csv', 
                 parse_dates = ['signup_time','purchase_time'])
fraud_df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,is_fraud
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [4]:
fraud_df = fraud_df[fraud_df['purchase_time']>'2015-01-16'].reset_index(drop=True)
fraud_df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,is_fraud
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
3,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0
4,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0


In [5]:
def process_df(original_df):
    df = original_df.copy()
    df = df.drop(columns=['device_id', 'ip_address', 'user_id'])
    df['delay_signup_purchase'] = (df['signup_time'] - df['purchase_time']).dt.seconds
    for cat_col in ['source','browser','sex']:
        cat_mapper = dict(zip(df[cat_col].unique(), range(1, len(df[cat_col].unique())+1)))
        df[cat_col] = df[cat_col].replace(cat_mapper)
    for col_prefix in ['signup', 'purchase']:
        #df[f'{col_prefix}_year'] = df[f'{col_prefix}_time'].dt.year
        df[f'{col_prefix}_month'] = df[f'{col_prefix}_time'].dt.month
        #df[f'cos_{col_prefix}_month'] = np.cos(2.*np.pi*df[f'{col_prefix}_month']/12.)
        #df[f'sin_{col_prefix}_month'] = np.sin(2.*np.pi*df[f'{col_prefix}_month']/12.)
        df[f'{col_prefix}_hour'] = df[f'{col_prefix}_time'].dt.hour
        #df[f'cos_{col_prefix}_hour'] = np.cos(2.*np.pi*df[f'{col_prefix}_hour']/24.)
        #df[f'sin_{col_prefix}_hour'] = np.sin(2.*np.pi*df[f'{col_prefix}_hour']/24.)
        #df[f'{col_prefix}_minute'] = df[f'{col_prefix}_time'].dt.minute
        #df[f'cos_{col_prefix}_minute'] = np.cos(2.*np.pi*df[f'{col_prefix}_minute']/60.)
        #df[f'sin_{col_prefix}_minute'] = np.sin(2.*np.pi*df[f'{col_prefix}_minute']/60.)
        df[f'{col_prefix}_weekDay'] = df[f'{col_prefix}_time'].dt.dayofweek
        #df[f'cos_{col_prefix}_weekDay'] = np.cos(2.*np.pi*df[f'{col_prefix}_weekDay']/7.)
        #df[f'sin_{col_prefix}_weekDay'] = np.sin(2.*np.pi*df[f'{col_prefix}_weekDay']/7.)
        df = df.drop(columns=[f'{col_prefix}_time'])
    return df

In [6]:
processed_fraud_df = process_df(fraud_df)

In [7]:
target = fraud_df["is_fraud"]

In [8]:
x_train, x_test, y_train, y_test= train_test_split(processed_fraud_df, target, test_size=.2, random_state=123)
x_train = x_train.drop("is_fraud", axis = 1)
x_test = x_test.drop("is_fraud", axis = 1)

In [9]:
cols = ['purchase_value', 'age']

sc = StandardScaler()
x_train[cols] = sc.fit_transform(x_train[cols])
x_test[cols] = sc.transform(x_test[cols])

In [10]:
target.value_counts(normalize = True)

0    0.954348
1    0.045652
Name: is_fraud, dtype: float64

In [11]:
svm = SVC(gamma = 'scale')
svm.fit(x_train, y_train)

print('Score sur ensemble test', svm.score(x_test, y_test))

Score sur ensemble test 0.9549366734308306


In [12]:
y_pred = svm.predict(x_test)

print(pd.crosstab(y_test, y_pred, colnames= ['Predictions']))

Predictions      0
is_fraud          
0            27294
1             1288


In [13]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      1.00      0.00      0.98      0.00      0.00     27294
          1       0.00      0.00      1.00      0.00      0.00      0.00      1288

avg / total       0.91      0.95      0.05      0.93      0.00      0.00     28582



  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
rOs = RandomOverSampler()
X_ro, y_ro = rOs.fit_resample(x_train, y_train)
print("Classes échantillon oversampled :", dict(pd.Series(y_ro).value_counts()))

#SMOTE
smo = SMOTE()
X_sm, y_sm = smo.fit_resample(x_train, y_train)
print("Classes échantillon SMOTE :", dict(pd.Series(y_sm).value_counts()))

Classes échantillon oversampled : {0: 109090, 1: 109090}
Classes échantillon SMOTE : {0: 109090, 1: 109090}


In [None]:
svm = SVC(gamma='scale')
svm.fit(X_ro, y_ro)

y_pred = svm.predict(x_test)
print(pd.crosstab(y_test, y_pred))

print(classification_report_imbalanced(y_test, y_pred))

In [None]:
svm = SVC(gamma='scale')
svm.fit(X_sm, y_sm)

y_pred = svm.predict(x_test)
print(pd.crosstab(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))