In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

In [2]:
# Read the labelled transaction table from disk; the trailing expression
# displays its (rows, columns) shape as the cell output.
labeled_df = pd.read_csv(filepath_or_buffer='labeled_df.csv')
labeled_df.shape

(90960, 16)

In [3]:
# 'Unnamed: 0' is presumably the row index that an earlier to_csv() wrote into
# the file (TODO confirm against the producer of labeled_df.csv); it carries no
# information, so it is removed.
# errors='ignore' keeps this cell idempotent: it overwrites its own input, so
# without it a second run of the cell would raise KeyError because the column
# is already gone.
labeled_df = labeled_df.drop(columns='Unnamed: 0', errors='ignore')
labeled_df.head()

Unnamed: 0,customerId,timestamp,country,amount,businessName,userAgent_isPc,userAgent_browserFamily,userAgent_osFamily,IBAN_countryCode,IBAN_bankCode,SIA,CAP,SAE,RAE,companyType
0,93000801,2021-09-15 16:13:35.147,6,150.0,c7543b6f04850eae288aa52783b33363,1,4,5,17,699,2013,144,19,113,5
1,93000801,2021-08-24 10:34:24.487,5,3200.0,c7543b6f04850eae288aa52783b33363,1,4,5,17,699,2013,144,19,113,5
2,93000801,2021-09-14 16:21:43.33,6,8350.0,c7543b6f04850eae288aa52783b33363,1,4,5,17,699,2013,144,19,113,5
3,93001001,2021-06-30 09:07:16.733,6,500.0,3f241dfcc38064261fec0f303c2784e5,1,0,5,17,123,1781,90,8,99,10
4,93001001,2021-07-12 16:36:02.857,6,1895.68,a5d1eaffa8e325d370665f43624339a1,1,0,5,17,285,1781,90,8,99,10


In [4]:
from sklearn.ensemble import IsolationForest

# Feature matrix for the detector: a copy of the labelled table without the
# 'businessName', 'customerId' and 'timestamp' columns. Working on a copy
# leaves `labeled_df` untouched.
X = labeled_df.copy().drop(columns=['businessName', 'customerId', 'timestamp'])

# Local Outlier Factor with a 20-neighbour neighbourhood. The trailing
# expression fits the model on X and displays the per-row labels it returns.
clf = LocalOutlierFactor(n_neighbors=20)
clf.fit_predict(X)

array([1, 1, 1, ..., 1, 1, 1])

In [5]:
import pickle

# Load the per-column encoders used to turn the integer codes in the table back
# into their original values. The file is opened in a `with` block so the
# handle is closed deterministically instead of being left to the garbage
# collector.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load 'enc_dict.pkl' when it comes from a trusted source.
with open('enc_dict.pkl', 'rb') as enc_file:
    encoder_dict = pickle.load(enc_file)

# Columns whose integer codes are decoded through `encoder_dict`.
columns_to_label = ['country', 'userAgent_browserFamily', 'userAgent_osFamily', 'IBAN_countryCode', 'IBAN_bankCode','CAP', 'SIA', 'SAE', 'RAE','companyType']

In [6]:
def inverse_and_scores(if_df,labeled_df,enc_dict,columns,model):
    """Fit ``model`` on ``if_df`` and attach its outlier results to a decoded copy of ``labeled_df``.

    Parameters
    ----------
    if_df : array-like or DataFrame
        Feature matrix handed to ``model.fit_predict``. It must contain one
        row per row of ``labeled_df``, in the same order, because the results
        are attached positionally.
    labeled_df : pandas.DataFrame
        Table holding the encoded columns. It is copied and never modified.
    enc_dict : mapping
        Maps each name in ``columns`` to an object exposing
        ``inverse_transform``.
    columns : iterable of str
        Columns of ``labeled_df`` to decode through ``enc_dict``.
    model : estimator
        Object exposing ``fit_predict`` and, once fitted, a
        ``negative_outlier_factor_`` attribute (e.g. scikit-learn's
        ``LocalOutlierFactor``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``labeled_df`` with ``columns`` decoded plus two new columns:
        ``'scores'`` (the model's ``negative_outlier_factor_``) and
        ``'label'`` (the value returned by ``fit_predict``).
    """
    df_copy = labeled_df.copy()

    # Fit on the matrix that was passed in. This used to read the notebook
    # global `X`, which silently ignored the `if_df` argument and made the
    # function depend on hidden kernel state.
    label = model.fit_predict(if_df)
    # Only available after fitting, so it must be read after fit_predict.
    scores = model.negative_outlier_factor_

    # Replace the integer codes with their original values, column by column.
    for clmn in columns:
        df_copy[clmn] = enc_dict[clmn].inverse_transform(df_copy[clmn])

    df_copy['scores'] = scores
    df_copy['label'] = label

    return df_copy

# Refit the detector on the feature matrix and build the decoded table with the
# per-row 'scores' and 'label' columns attached.
df_scores = inverse_and_scores(
    if_df=X,
    labeled_df=labeled_df,
    enc_dict=encoder_dict,
    columns=columns_to_label,
    model=clf,
)

df_scores.head()

Unnamed: 0,customerId,timestamp,country,amount,businessName,userAgent_isPc,userAgent_browserFamily,userAgent_osFamily,IBAN_countryCode,IBAN_bankCode,SIA,CAP,SAE,RAE,companyType,scores,label
0,93000801,2021-09-15 16:13:35.147,Italy,150.0,c7543b6f04850eae288aa52783b33363,1,Firefox,Windows,IT,5856,CLFUY,33077,492,630,SAS,-1.008318,1
1,93000801,2021-08-24 10:34:24.487,Greece,3200.0,c7543b6f04850eae288aa52783b33363,1,Firefox,Windows,IT,5856,CLFUY,33077,492,630,SAS,-1.047175,1
2,93000801,2021-09-14 16:21:43.33,Italy,8350.0,c7543b6f04850eae288aa52783b33363,1,Firefox,Windows,IT,5856,CLFUY,33077,492,630,SAS,-1.019783,1
3,93001001,2021-06-30 09:07:16.733,Italy,500.0,3f241dfcc38064261fec0f303c2784e5,1,Chrome,Windows,IT,2008,CEFGV,31039,430,505,SRL,-1.090301,1
4,93001001,2021-07-12 16:36:02.857,Italy,1895.68,a5d1eaffa8e325d370665f43624339a1,1,Chrome,Windows,IT,3069,CEFGV,31039,430,505,SRL,-1.006947,1


In [7]:
# Keep only the rows labelled -1 and order them by ascending score, so the most
# negative scores are listed first.
(
    df_scores
    .loc[df_scores['label'].eq(-1)]
    .sort_values('scores')
)

Unnamed: 0,customerId,timestamp,country,amount,businessName,userAgent_isPc,userAgent_browserFamily,userAgent_osFamily,IBAN_countryCode,IBAN_bankCode,SIA,CAP,SAE,RAE,companyType,scores,label
27552,2814040702,2021-09-06 11:04:53.9,Italy,388.00,91db8a441bec53b138f729d3f8eeee5e,1,Chrome,Windows,IT,8749,CLMWZ,31040,430,231,SRL,-1.007498e+12,-1
10911,11276952,2021-07-22 09:26:33.55,Italy,183.00,215aeb868bb6b98f67c945bfbde88acb,1,Chrome,Windows,IT,8749,CKZYT,31038,430,314,SRL,-8.370701e+11,-1
52053,58140407,2021-07-02 10:40:59.86,Italy,300.00,ef3a64279e80b976c722908182588b52,1,Chrome,Mac OS X,BE,967,CKMJR,31020,430,161,SRL,-8.301205e+11,-1
3479,4094604,2021-08-03 09:46:18.273,Italy,260.00,a56e10b47d580b91c1472ecdab10f863,1,Chrome,Windows,IT,8904,CLKXT,35010,430,314,SRL,-7.522633e+11,-1
68488,76176189,2021-08-05 11:03:05.137,Italy,320.00,98e7daa3bcad19fc71ede7e3a5f548ee,1,Chrome,Mac OS X,IT,8728,CM5CR,30174,492,614,SAS,-7.247043e+11,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68091,76175189,2021-09-06 17:30:32.343,Italy,130.50,539c0972470ff80a630a68e29d404c75,1,Chrome,Windows,IT,6045,AWEMQ,39040,430,505,SRL,-1.501008e+00,-1
71839,8127180601,2021-07-16 16:23:43.93,Italy,1599.00,d400355f609ee2ed39a27a6a42681eb8,1,Firefox,Windows,IT,36081,BH4SZ,36020,430,353,SRL,-1.500807e+00,-1
20813,2093556604,2021-08-21 22:48:11.467,Italy,364.90,9221b60d2049a76e9e1ca98773b6e58c,1,Edge,Windows,IT,5142,AVJHW,39100,430,617,SRL,-1.500609e+00,-1
42334,44152022,2021-09-15 13:30:50.57,Italy,1724.00,b26be0aca65be9abbddeff2c9a20a777,1,Chrome,Windows,IT,5856,B2XJX,36028,430,723,SRL,-1.500333e+00,-1


In [73]:
# Persist the scored table. The row index is written as well (pandas' default,
# spelled out here), so it comes back as an unnamed first column on reload.
df_scores.to_csv('LOF_scores.csv', index=True)