#LightGBM
##- XGBoost 이후로 나온 최신 부스팅 모델
##- 트리 기반 최신 알고리즘

#장점
##- XGBoost보다 빠르고 높은 정확도
##- 예측에 영향을 미친 변수의 중요도 확인 가능
##- 변수 종류가 많고 데이터가 클수록 상대적으로 뛰어난 성능

#단점
##- 복잡한 모델인만큼, 해석에 어려움이 있음
##- 하이퍼파라미터 튜닝이 까다로움

#정형 데이터
##- CatBoost
##- LightGBM
##- XGBoost

#XGBoost vs LightGBM
##LightGBM의 장점
##1. 빠른 학습 및 예측
##2. 더 적은 메모리 사용
##3. 데이터셋 자동 변환 및 최적 분할

#XGBoost
##균형 분할 방식
##- 좌우 노드 수가 균등함

#LightGBM
##리프 중심 트리 분할 방식
##- 좌우 노두 수가 균등하지 않고, 가지가 깊게 펼쳐짐


#미션 : 카드 거래 내역 데이터셋을 이용해 이상거래 예측
#알고리즘 : LightGBM
#종속변수 : Is_fraud(이상거래)
#문제유형 : 분류
#평가지표 : 정확도, 혼동 행렬, 분류 리포트, ROC AUC 점수


#1단계 : 문제정의
#2단계 : 라이브러리 및 데이터 불러오기
#3단계 : 전처리
##ㄴ데이터 클리닝
##ㄴ피처 엔지니어링
##4단계 : 모델링 및 평가하기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import kagglehub
import pandas as pd
import os

# 1. 데이터셋 다운로드
path = kagglehub.dataset_download("kartik2112/fraud-detection")

print("Path to dataset files:", path)

# 2. 다운로드된 경로 확인
print("Path to dataset files:", path)

# 3. 파일 목록 확인
print("Files in directory:", os.listdir(path))

Path to dataset files: /kaggle/input/fraud-detection
Path to dataset files: /kaggle/input/fraud-detection
Files in directory: ['fraudTest.csv', 'fraudTrain.csv']


In [3]:
# 4. CSV 파일 읽기
csv_file = os.path.join(path, "fraudTest.csv")
df = pd.read_csv(csv_file, encoding='latin1')  # 이 데이터는 latin1로 인코딩된 경우가 많음

# 5. 데이터 출력
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [4]:
df.drop('Unnamed: 0', axis = 1, inplace  = True)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [5]:
pd.options.display.max_columns = 22

In [6]:
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trans_date_trans_time  555719 non-null  object 
 1   cc_num                 555719 non-null  int64  
 2   merchant               555719 non-null  object 
 3   category               555719 non-null  object 
 4   amt                    555719 non-null  float64
 5   first                  555719 non-null  object 
 6   last                   555719 non-null  object 
 7   gender                 555719 non-null  object 
 8   street                 555719 non-null  object 
 9   city                   555719 non-null  object 
 10  state                  555719 non-null  object 
 11  zip                    555719 non-null  int64  
 12  lat                    555719 non-null  float64
 13  long                   555719 non-null  float64
 14  city_pop               555719 non-nu

In [8]:
df.info(show_counts = True)
#결측치는 없는듯

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trans_date_trans_time  555719 non-null  object 
 1   cc_num                 555719 non-null  int64  
 2   merchant               555719 non-null  object 
 3   category               555719 non-null  object 
 4   amt                    555719 non-null  float64
 5   first                  555719 non-null  object 
 6   last                   555719 non-null  object 
 7   gender                 555719 non-null  object 
 8   street                 555719 non-null  object 
 9   city                   555719 non-null  object 
 10  state                  555719 non-null  object 
 11  zip                    555719 non-null  int64  
 12  lat                    555719 non-null  float64
 13  long                   555719 non-null  float64
 14  city_pop               555719 non-nu

In [9]:
round(df.describe(),2)
#amt, city_pop의 경우 max값의 수치가 과하다.. -> 확인 필요

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,4.178387e+17,69.39,48842.63,38.54,-90.23,88221.89,1380679000.0,38.54,-90.23,0.0
std,1.309837e+18,156.75,26855.28,5.06,13.72,300390.89,5201104.0,5.1,13.73,0.06
min,60416210000.0,1.0,1257.0,20.03,-165.67,23.0,1371817000.0,19.03,-166.67,0.0
25%,180042900000000.0,9.63,26292.0,34.67,-96.8,741.0,1376029000.0,34.76,-96.91,0.0
50%,3521417000000000.0,47.29,48174.0,39.37,-87.48,2408.0,1380762000.0,39.38,-87.45,0.0
75%,4635331000000000.0,83.01,72011.0,41.89,-80.18,19685.0,1385867000.0,41.95,-80.26,0.0
max,4.992346e+18,22768.11,99921.0,65.69,-67.95,2906700.0,1388534000.0,66.68,-66.95,1.0


In [10]:
data = df.copy()

In [11]:
data.drop(['first','last','street','city','state','zip','trans_num','unix_time','job','merchant'],axis = 1, inplace = True)

In [12]:
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   trans_date_trans_time  555719 non-null  datetime64[ns]
 1   cc_num                 555719 non-null  int64         
 2   category               555719 non-null  object        
 3   amt                    555719 non-null  float64       
 4   gender                 555719 non-null  object        
 5   lat                    555719 non-null  float64       
 6   long                   555719 non-null  float64       
 7   city_pop               555719 non-null  int64         
 8   dob                    555719 non-null  object        
 9   merch_lat              555719 non-null  float64       
 10  merch_long             555719 non-null  float64       
 11  is_fraud               555719 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(3), 

In [14]:
#결제 금액은 범주, 범위를 고려해야 하기 때문에 정규화가 필요하다
amt_info = data.groupby('cc_num').agg(['mean','std'])['amt'].reset_index()

TypeError: agg function failed [how->mean,dtype->object]

In [15]:
amt_info = data.groupby('cc_num')['amt'].agg(['mean', 'std']).reset_index()
amt_info
#카드 번호에 따른 결제 금액의 평균과 표준편차

Unnamed: 0,cc_num,mean,std
0,60416207185,66.499484,180.015519
1,60422928733,57.433109,60.384158
2,60423098130,54.613684,60.577321
3,60427851591,96.435535,93.624727
4,60487002085,91.845732,249.454342
...,...,...,...
919,4958589671582726883,68.914378,102.371028
920,4973530368125489546,69.495222,109.905318
921,4980323467523543940,60.990196,87.641555
922,4989847570577635369,104.785625,138.977971


In [16]:
data = data.merge(amt_info, on = 'cc_num', how = 'left')

In [17]:
data.head()

Unnamed: 0,trans_date_trans_time,cc_num,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud,mean,std
0,2020-06-21 12:14:25,2291163933867244,personal_care,2.86,M,33.9659,-80.9355,333497,1968-03-19,33.986391,-81.200714,0,61.893062,158.690646
1,2020-06-21 12:14:33,3573030041201292,personal_care,29.84,F,40.3207,-110.436,302,1990-01-17,39.450498,-109.960431,0,53.596523,102.395948
2,2020-06-21 12:14:53,3598215285024754,health_fitness,41.28,F,40.6729,-73.5365,34496,1970-10-21,40.49581,-74.196111,0,83.053812,103.740968
3,2020-06-21 12:15:15,3591919803438423,misc_pos,60.05,M,28.5697,-80.8191,54767,1987-07-25,28.812398,-80.883061,0,59.207526,108.530732
4,2020-06-21 12:15:17,3526826139003047,travel,3.19,M,44.2529,-85.017,1126,1955-07-06,44.959148,-85.884734,0,55.274501,76.159193


In [18]:
data['amt_z_score'] = (data['amt'] - data['mean']) / data['std']

In [19]:
data.head()

Unnamed: 0,trans_date_trans_time,cc_num,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud,mean,std,amt_z_score
0,2020-06-21 12:14:25,2291163933867244,personal_care,2.86,M,33.9659,-80.9355,333497,1968-03-19,33.986391,-81.200714,0,61.893062,158.690646,-0.372001
1,2020-06-21 12:14:33,3573030041201292,personal_care,29.84,F,40.3207,-110.436,302,1990-01-17,39.450498,-109.960431,0,53.596523,102.395948,-0.232006
2,2020-06-21 12:14:53,3598215285024754,health_fitness,41.28,F,40.6729,-73.5365,34496,1970-10-21,40.49581,-74.196111,0,83.053812,103.740968,-0.402674
3,2020-06-21 12:15:15,3591919803438423,misc_pos,60.05,M,28.5697,-80.8191,54767,1987-07-25,28.812398,-80.883061,0,59.207526,108.530732,0.007763
4,2020-06-21 12:15:17,3526826139003047,travel,3.19,M,44.2529,-85.017,1126,1955-07-06,44.959148,-85.884734,0,55.274501,76.159193,-0.68389


In [20]:
data[['amt','mean','std','amt_z_score']].head()

Unnamed: 0,amt,mean,std,amt_z_score
0,2.86,61.893062,158.690646,-0.372001
1,29.84,53.596523,102.395948,-0.232006
2,41.28,83.053812,103.740968,-0.402674
3,60.05,59.207526,108.530732,0.007763
4,3.19,55.274501,76.159193,-0.68389


In [21]:
data.drop(['mean','std'],axis = 1, inplace=True)

In [22]:
category_info = data.groupby(['cc_num','category'])['amt'].agg(['mean','std']).reset_index()

In [23]:
category_info.head()

Unnamed: 0,cc_num,category,mean,std
0,60416207185,entertainment,43.049118,40.27719
1,60416207185,food_dining,29.134848,46.869619
2,60416207185,gas_transport,60.966087,17.625537
3,60416207185,grocery_net,51.674348,17.676489
4,60416207185,grocery_pos,101.167544,20.088836


In [24]:
data = data.merge(category_info, on=['cc_num','category'], how = 'left')

In [25]:
data['cat_z_score'] = (data['amt'] - data['mean']) / data['std']

In [26]:
data.drop(['mean','std'],axis = 1, inplace=True)

In [27]:
data.head()

Unnamed: 0,trans_date_trans_time,cc_num,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud,amt_z_score,cat_z_score
0,2020-06-21 12:14:25,2291163933867244,personal_care,2.86,M,33.9659,-80.9355,333497,1968-03-19,33.986391,-81.200714,0,-0.372001,-0.815329
1,2020-06-21 12:14:33,3573030041201292,personal_care,29.84,F,40.3207,-110.436,302,1990-01-17,39.450498,-109.960431,0,-0.232006,-0.22748
2,2020-06-21 12:14:53,3598215285024754,health_fitness,41.28,F,40.6729,-73.5365,34496,1970-10-21,40.49581,-74.196111,0,-0.402674,-0.375667
3,2020-06-21 12:15:15,3591919803438423,misc_pos,60.05,M,28.5697,-80.8191,54767,1987-07-25,28.812398,-80.883061,0,0.007763,-0.053865
4,2020-06-21 12:15:17,3526826139003047,travel,3.19,M,44.2529,-85.017,1126,1955-07-06,44.959148,-85.884734,0,-0.68389,-0.496103


In [28]:
import geopy.distance

In [29]:
data['merch_coord'] = pd.Series(zip(data['merch_lat'],data['merch_long']))#merch_lat, merch_long을 하나로 묶어서 merch_coord
data['cust_coord'] = pd.Series(zip(data['lat'],data['long']))

In [30]:
data['distance'] = data.apply(lambda x: geopy.distance.distance(x['merch_coord'],x['cust_coord']).km, axis = 1)

In [31]:
#distance가 평균보다 먼 곳에서 이루어졌는지 확인
distance_info = data.groupby('cc_num')['distance'].agg(['mean','std']).reset_index()

In [32]:
data = data.merge(distance_info, on='cc_num', how = 'left')

In [33]:
data['distance_z_score'] = (data['distance'] - data['mean'])/data['std']

In [34]:
data.drop(['mean','std'],axis = 1, inplace=True)

In [35]:
data.head()

Unnamed: 0,trans_date_trans_time,cc_num,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud,amt_z_score,cat_z_score,merch_coord,cust_coord,distance,distance_z_score
0,2020-06-21 12:14:25,2291163933867244,personal_care,2.86,M,33.9659,-80.9355,333497,1968-03-19,33.986391,-81.200714,0,-0.372001,-0.815329,"(33.986391, -81.200714)","(33.9659, -80.9355)",24.613746,-1.950022
1,2020-06-21 12:14:33,3573030041201292,personal_care,29.84,F,40.3207,-110.436,302,1990-01-17,39.450498,-109.960431,0,-0.232006,-0.22748,"(39.450498, -109.960431)","(40.3207, -110.436)",104.834043,1.003337
2,2020-06-21 12:14:53,3598215285024754,health_fitness,41.28,F,40.6729,-73.5365,34496,1970-10-21,40.49581,-74.196111,0,-0.402674,-0.375667,"(40.49581, -74.196111)","(40.6729, -73.5365)",59.204796,-0.56679
3,2020-06-21 12:15:15,3591919803438423,misc_pos,60.05,M,28.5697,-80.8191,54767,1987-07-25,28.812398,-80.883061,0,0.007763,-0.053865,"(28.812398, -80.883061)","(28.5697, -80.8191)",27.615117,-1.774967
4,2020-06-21 12:15:17,3526826139003047,travel,3.19,M,44.2529,-85.017,1126,1955-07-06,44.959148,-85.884734,0,-0.68389,-0.496103,"(44.959148, -85.884734)","(44.2529, -85.01700000000001)",104.423175,1.136519


In [36]:
data['age'] = 2025 - pd.to_datetime(data['dob']).dt.year

In [37]:
data.head()

Unnamed: 0,trans_date_trans_time,cc_num,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud,amt_z_score,cat_z_score,merch_coord,cust_coord,distance,distance_z_score,age
0,2020-06-21 12:14:25,2291163933867244,personal_care,2.86,M,33.9659,-80.9355,333497,1968-03-19,33.986391,-81.200714,0,-0.372001,-0.815329,"(33.986391, -81.200714)","(33.9659, -80.9355)",24.613746,-1.950022,57
1,2020-06-21 12:14:33,3573030041201292,personal_care,29.84,F,40.3207,-110.436,302,1990-01-17,39.450498,-109.960431,0,-0.232006,-0.22748,"(39.450498, -109.960431)","(40.3207, -110.436)",104.834043,1.003337,35
2,2020-06-21 12:14:53,3598215285024754,health_fitness,41.28,F,40.6729,-73.5365,34496,1970-10-21,40.49581,-74.196111,0,-0.402674,-0.375667,"(40.49581, -74.196111)","(40.6729, -73.5365)",59.204796,-0.56679,55
3,2020-06-21 12:15:15,3591919803438423,misc_pos,60.05,M,28.5697,-80.8191,54767,1987-07-25,28.812398,-80.883061,0,0.007763,-0.053865,"(28.812398, -80.883061)","(28.5697, -80.8191)",27.615117,-1.774967,38
4,2020-06-21 12:15:17,3526826139003047,travel,3.19,M,44.2529,-85.017,1126,1955-07-06,44.959148,-85.884734,0,-0.68389,-0.496103,"(44.959148, -85.884734)","(44.2529, -85.01700000000001)",104.423175,1.136519,70


In [38]:
data.drop(['cc_num','lat','long','merch_lat','merch_long','dob','merch_coord','cust_coord'],axis=1,inplace=True)
#필요한 데이터 확보가 끝났으니 필요 없는 데이터 제거

In [39]:
data.head()
#category, gender 데이터는 숫자로 변환 필요

Unnamed: 0,trans_date_trans_time,category,amt,gender,city_pop,is_fraud,amt_z_score,cat_z_score,distance,distance_z_score,age
0,2020-06-21 12:14:25,personal_care,2.86,M,333497,0,-0.372001,-0.815329,24.613746,-1.950022,57
1,2020-06-21 12:14:33,personal_care,29.84,F,302,0,-0.232006,-0.22748,104.834043,1.003337,35
2,2020-06-21 12:14:53,health_fitness,41.28,F,34496,0,-0.402674,-0.375667,59.204796,-0.56679,55
3,2020-06-21 12:15:15,misc_pos,60.05,M,54767,0,0.007763,-0.053865,27.615117,-1.774967,38
4,2020-06-21 12:15:17,travel,3.19,M,1126,0,-0.68389,-0.496103,104.423175,1.136519,70


In [40]:
data = pd.get_dummies(data, columns=['category','gender'], drop_first = True)

In [41]:
data.head()

Unnamed: 0,trans_date_trans_time,amt,city_pop,is_fraud,amt_z_score,cat_z_score,distance,distance_z_score,age,category_food_dining,category_gas_transport,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
0,2020-06-21 12:14:25,2.86,333497,0,-0.372001,-0.815329,24.613746,-1.950022,57,False,False,...,False,False,False,False,False,False,True,False,False,False,True
1,2020-06-21 12:14:33,29.84,302,0,-0.232006,-0.22748,104.834043,1.003337,35,False,False,...,False,False,False,False,False,False,True,False,False,False,False
2,2020-06-21 12:14:53,41.28,34496,0,-0.402674,-0.375667,59.204796,-0.56679,55,False,False,...,False,True,False,False,False,False,False,False,False,False,False
3,2020-06-21 12:15:15,60.05,54767,0,0.007763,-0.053865,27.615117,-1.774967,38,False,False,...,False,False,False,False,False,True,False,False,False,False,True
4,2020-06-21 12:15:17,3.19,1126,0,-0.68389,-0.496103,104.423175,1.136519,70,False,False,...,False,False,False,False,False,False,False,False,False,True,True


In [42]:
data.set_index('trans_date_trans_time',inplace=True)

In [43]:
data.head()

Unnamed: 0_level_0,amt,city_pop,is_fraud,amt_z_score,cat_z_score,distance,distance_z_score,age,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
trans_date_trans_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-06-21 12:14:25,2.86,333497,0,-0.372001,-0.815329,24.613746,-1.950022,57,False,False,False,False,False,False,False,False,False,True,False,False,False,True
2020-06-21 12:14:33,29.84,302,0,-0.232006,-0.22748,104.834043,1.003337,35,False,False,False,False,False,False,False,False,False,True,False,False,False,False
2020-06-21 12:14:53,41.28,34496,0,-0.402674,-0.375667,59.204796,-0.56679,55,False,False,False,False,True,False,False,False,False,False,False,False,False,False
2020-06-21 12:15:15,60.05,54767,0,0.007763,-0.053865,27.615117,-1.774967,38,False,False,False,False,False,False,False,False,True,False,False,False,False,True
2020-06-21 12:15:17,3.19,1126,0,-0.68389,-0.496103,104.423175,1.136519,70,False,False,False,False,False,False,False,False,False,False,False,False,True,True


#리프 중심 트리 분할(leaf-wise tree growth)
##-XGBoost와 LightGBM의 중요한 차이점
##- 동일한 레벨로 노드를 확장하지 않고 불규칙적으로 노드를 뻗어 나가기 때문에 더욱 빠르고 높은 예측율을 보이나 오버피팅을 유의해야 함

In [58]:
train = data[data.index < '2020-11-01']
test = data[data.index >= '2020-11-01']

In [59]:
len(test)/len(data)

0.3817990747122197

In [55]:
data.head()

Unnamed: 0_level_0,amt,city_pop,is_fraud,amt_z_score,cat_z_score,distance,distance_z_score,age,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
trans_date_trans_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-06-21 12:14:25,2.86,333497,0,-0.372001,-0.815329,24.613746,-1.950022,57,False,False,False,False,False,False,False,False,False,True,False,False,False,True
2020-06-21 12:14:33,29.84,302,0,-0.232006,-0.22748,104.834043,1.003337,35,False,False,False,False,False,False,False,False,False,True,False,False,False,False
2020-06-21 12:14:53,41.28,34496,0,-0.402674,-0.375667,59.204796,-0.56679,55,False,False,False,False,True,False,False,False,False,False,False,False,False,False
2020-06-21 12:15:15,60.05,54767,0,0.007763,-0.053865,27.615117,-1.774967,38,False,False,False,False,False,False,False,False,True,False,False,False,False,True
2020-06-21 12:15:17,3.19,1126,0,-0.68389,-0.496103,104.423175,1.136519,70,False,False,False,False,False,False,False,False,False,False,False,False,True,True


In [54]:
data.tail()

Unnamed: 0_level_0,amt,city_pop,is_fraud,amt_z_score,cat_z_score,distance,distance_z_score,age,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
trans_date_trans_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-12-31 23:59:07,43.77,519,0,-0.181437,-0.041261,77.032467,0.073641,59,False,False,False,False,True,False,False,False,False,False,False,False,False,True
2020-12-31 23:59:09,111.84,28739,0,0.604336,1.105618,100.023736,0.702,26,False,False,False,False,False,False,True,False,False,False,False,False,False,True
2020-12-31 23:59:15,86.88,3684,0,-0.040486,0.508358,80.887812,0.299454,44,False,False,False,False,False,False,True,False,False,False,False,False,False,False
2020-12-31 23:59:24,7.99,129,0,-0.634221,0.732834,53.060882,-0.750497,60,False,False,False,False,False,False,False,False,False,False,False,False,True,True
2020-12-31 23:59:34,38.13,116001,0,-0.170473,-0.411246,72.38099,-0.110232,32,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [62]:
X_train = train.drop('is_fraud',axis=1)
X_test = test.drop('is_fraud',axis = 1)
y_train = train['is_fraud']
y_test = test['is_fraud']

In [63]:
import lightgbm as lgb

In [64]:
model1 = lgb.LGBMClassifier(random_state=100)
model1.fit(X_train,y_train)
pred1 = model1.predict(X_test)

[LightGBM] [Info] Number of positive: 1593, number of negative: 341953
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 343546, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004637 -> initscore=-5.369054
[LightGBM] [Info] Start training from score -5.369054


In [66]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

In [68]:
accuracy_score(y_test, pred1)

0.9966395347193092

In [69]:
confusion_matrix(y_test,pred1)

array([[211165,    456],
       [   257,    295]])

In [71]:
print(classification_report(y_test,pred1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    211621
           1       0.39      0.53      0.45       552

    accuracy                           1.00    212173
   macro avg       0.70      0.77      0.73    212173
weighted avg       1.00      1.00      1.00    212173



#사기 거래를 잘 발견하기..
#실제 이상 거래를 얼마나 예측하는지 : 재현율 이 중요

In [79]:
proba1 = model1.predict_proba(X_test)

In [80]:
proba1

array([[9.99997202e-01, 2.79804846e-06],
       [9.99998455e-01, 1.54471421e-06],
       [1.00000000e+00, 0.00000000e+00],
       ...,
       [9.99924807e-01, 7.51933012e-05],
       [9.99968752e-01, 3.12480760e-05],
       [9.99923666e-01, 7.63341414e-05]])

In [81]:
proba1 = proba1[:,1]#1 - proba1[1] == proba1[0]

In [82]:
proba_int1 = (proba1 > 0.2).astype('int')#기준을 0.5에서 0.2로 바꾸기
proba_int2 = (proba1 > 0.8).astype('int')

In [88]:
print(confusion_matrix(y_test,proba_int1))

[[210788    833]
 [   214    338]]


In [89]:
print(classification_report(y_test,proba_int1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    211621
           1       0.29      0.61      0.39       552

    accuracy                           1.00    212173
   macro avg       0.64      0.80      0.69    212173
weighted avg       1.00      1.00      1.00    212173



In [90]:
print(confusion_matrix(y_test,proba_int2))
#기준을 0.8로 올렸더니 참 양성이 236건으로 많이 줄었음
#하지만 거짓 양성과 거짓 음성이 늘었음
#정상 거래를 이상 거래로 잘못 의심하는 경우는 줄었지만
#이상 거래를 놓치는 경우가 많아짐

[[211261    360]
 [   316    236]]


In [91]:
print(classification_report(y_test,proba_int2))#재현율이 떨어짐

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    211621
           1       0.40      0.43      0.41       552

    accuracy                           1.00    212173
   macro avg       0.70      0.71      0.70    212173
weighted avg       1.00      1.00      1.00    212173



#분석의 목적에 따라서 분석의 기준을 바꿀 수 있다

In [92]:
roc_auc_score(y_test, proba1)#보통 0.8 이상이면 높은 수치
#하지만 이상 거래 샘플 자체가 적기 때문에 모든 거래에 대해 정상 거래라고 찍어도 높은 수치가 나온다
#따라서 위의 수치를 그대로 신뢰하긴 어렵다

np.float64(0.8920377309750293)

#L1, L2 정규화
##- 둘 다 매개변수에 패널티를 가해서 그 영향력을 감소시키는 방법으로 오버피팅을 방지하는 목적으로 쓰임
##- L1 정규화는 일부 매개변수가 0이 되도록 패널티를 가할 수 있기 때문에 피처 셀렉션 효과도 있음

#ROC 곡선과 AUC
##- 이진분류 모델을 평가하는 방법으로, 기준점에 영향을 받지 않기 때문에 여러 모델을 비교할 때 요긴하게 사용됨
##- AUC는 ROC 곡선의 아래 면적을 의미하고, 0.5~1사이의 값을 지니며 높을수록 좋은 모델임

#혼동 행렬
##print(confusion_matrix(y_test,proba_int1))
##[[210788    833] 833 : 거짓 양성
##[   214    338]] 214 : 거짓 음성 , 1182 : 참 양성

#분류 리포트 확인
##print(classification_report(y_test,proba_int1))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    211621
           1       0.29      0.61      0.39       552

    accuracy                           1.00    212173
    macro avg       0.64      0.80      0.69    212173
    weighted avg       1.00      1.00      1.00    212173


#AUC(area under the curve)
##- ROC 곡선의 ROC커프의 아래쪽 면적

#ROC 곡선
##- 민감도와 특이도의 개념을 통해 만들어짐
##- 민감도 : TPR = TP(참 양성) / (TP(참 양성) + FN(거짓 음성))
##- 특이도 : FPR = FP(거짓 양성) / (FP(거짓 양성) + TN(참 음성))

##- 곡선의 기울기가 클 수록 완벽하게 분류된 정상거래라 볼 수 있음
##- 곡선의 기울기가 1에 가까울 수록(직선) 모델이 전혀 분로를 하지 못하는 경우라 볼 수 있음

#AUC
##- 0과 1을 잘 분리하는지 기준점과 상관없이 보여줌
##- 여러 모델 비교 : 적합한 객관적인 지표
##기준선 0.5
## 0.5를 넘으면 1로 간주, 못 넘으면 0으로 간주

#중첩(AUC = 0.7)
## 참 음성(TN) 거짓음성(FN) 0.5 거짓양성(FP) 참 양성(TP)

#완벽한 예측 모델(AUC  = 1)
## 참 음성(TN) 0.5 참 양성(TP)
## 이상 거래는 모두 0.5이상, 정상 거래는 모두 0.5이하, 모두 완벽하게 예측

#아무런 예측도 하지 못하는 모델(AUC=1)
## 정상 거래와 이상 거래 모두 곡선이 비슷함







#1단계 : 문제정의
##- 카드 거래 내역 데이터셋을 LightGBM으로 학습하여 이상거래를 예측
#2단계 : 라이브러리 및 데이터 불러오기
##- 판다스, 넘파이, 맷플롯립, 시본 라이브러리를 임포트
##- 프로젝트에 쓸 예제 데이터셋 불러옴
#3단계 : 전처리
##ㄴ데이터 클리닝
##- 불필요한 변수를 삭제하고, 변수의 속성을 적절하게 변경
##ㄴ피처 엔지니어링
##- 결제금액, 범주, 거리, 나이 등을 기반으로 새로운 변수 만듦
##4단계 : 모델링 및 평가하기
##- LightGBM으로 모델링하여 0.89라는 AUC르 얻음
##- 일반적으로 좋은 수치