## Importing important libraries

In [1]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import  lightgbm as lgb


## Reading the dataset

In [2]:
data = pd.read_csv('fraudTrain.csv')
print(data.head())
print(data.info())
print(data.describe())

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

### Unnessary columns 

In [3]:
data = data.drop(columns=['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'trans_num', 'zip'])

### date time format

In [4]:
data['trans_date_trans_time']=pd.to_datetime(data['trans_date_trans_time'])

In [5]:
data['hour']=data['trans_date_trans_time'].dt.hour

In [6]:
data['day']=data['trans_date_trans_time'].dt.day

In [7]:
data['month']=data['trans_date_trans_time'].dt.month

In [8]:
data['weekday'] = data['trans_date_trans_time'].dt.weekday

In [9]:
data['dob']=pd.to_datetime(data['dob'])

In [10]:
data['age']=data['trans_date_trans_time'].dt.year-data['dob'].dt.year

In [11]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 21 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   trans_date_trans_time  1296675 non-null  datetime64[ns]
 1   merchant               1296675 non-null  object        
 2   category               1296675 non-null  object        
 3   amt                    1296675 non-null  float64       
 4   gender                 1296675 non-null  object        
 5   city                   1296675 non-null  object        
 6   state                  1296675 non-null  object        
 7   lat                    1296675 non-null  float64       
 8   long                   1296675 non-null  float64       
 9   city_pop               1296675 non-null  int64         
 10  job                    1296675 non-null  object        
 11  dob                    1296675 non-null  datetime64[ns]
 12  unix_time              12966

In [12]:
data

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,city,state,lat,long,city_pop,...,dob,unix_time,merch_lat,merch_long,is_fraud,hour,day,month,weekday,age
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,36.0788,-81.1781,3495,...,1988-03-09,1325376018,36.011293,-82.048315,0,0,1,1,1,31
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,48.8878,-118.2105,149,...,1978-06-21,1325376044,49.159047,-118.186462,0,0,1,1,1,41
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,42.1808,-112.2620,4154,...,1962-01-19,1325376051,43.150704,-112.154481,0,0,1,1,1,57
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,M,Boulder,MT,46.2306,-112.1138,1939,...,1967-01-12,1325376076,47.034331,-112.561071,0,0,1,1,1,52
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,38.4207,-79.4629,99,...,1986-03-28,1325376186,38.674999,-78.632459,0,0,1,1,1,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,fraud_Reichel Inc,entertainment,15.56,M,Hatch,UT,37.7175,-112.4777,258,...,1961-11-24,1371816728,36.841266,-111.690765,0,12,21,6,6,59
1296671,2020-06-21 12:12:19,fraud_Abernathy and Sons,food_dining,51.70,M,Tuscarora,MD,39.2667,-77.5101,100,...,1979-12-11,1371816739,38.906881,-78.246528,0,12,21,6,6,41
1296672,2020-06-21 12:12:32,fraud_Stiedemann Ltd,food_dining,105.93,M,High Rolls Mountain Park,NM,32.9396,-105.8189,899,...,1967-08-30,1371816752,33.619513,-105.130529,0,12,21,6,6,53
1296673,2020-06-21 12:13:36,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,M,Manderson,SD,43.3526,-102.5411,1126,...,1980-08-18,1371816816,42.788940,-103.241160,0,12,21,6,6,40


In [13]:
data=data.drop(columns=['trans_date_trans_time','dob'])

### checking for nonnmurical data

In [14]:
categorical_columns = data.select_dtypes(include=['object']).columns

In [15]:
counts = data[categorical_columns].nunique()

In [16]:
print(counts)

merchant    693
category     14
gender        2
city        894
state        51
job         494
dtype: int64


In [17]:
data

Unnamed: 0,merchant,category,amt,gender,city,state,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,hour,day,month,weekday,age
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0,0,1,1,1,31
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0,0,1,1,1,41
2,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,42.1808,-112.2620,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0,0,1,1,1,57
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,M,Boulder,MT,46.2306,-112.1138,1939,Patent attorney,1325376076,47.034331,-112.561071,0,0,1,1,1,52
4,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,38.674999,-78.632459,0,0,1,1,1,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,fraud_Reichel Inc,entertainment,15.56,M,Hatch,UT,37.7175,-112.4777,258,Geoscientist,1371816728,36.841266,-111.690765,0,12,21,6,6,59
1296671,fraud_Abernathy and Sons,food_dining,51.70,M,Tuscarora,MD,39.2667,-77.5101,100,"Production assistant, television",1371816739,38.906881,-78.246528,0,12,21,6,6,41
1296672,fraud_Stiedemann Ltd,food_dining,105.93,M,High Rolls Mountain Park,NM,32.9396,-105.8189,899,Naval architect,1371816752,33.619513,-105.130529,0,12,21,6,6,53
1296673,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,M,Manderson,SD,43.3526,-102.5411,1126,Volunteer coordinator,1371816816,42.788940,-103.241160,0,12,21,6,6,40


In [18]:
test=pd.read_csv('fraudTest.csv')

In [19]:
test = test.drop(columns=['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'trans_num', 'zip'])

In [20]:
test['trans_date_trans_time']=pd.to_datetime(test['trans_date_trans_time'])

In [21]:
test['hour']=test['trans_date_trans_time'].dt.hour

In [22]:
test['day']=test['trans_date_trans_time'].dt.day
test['month']=test['trans_date_trans_time'].dt.month
test['weekday'] = test['trans_date_trans_time'].dt.weekday


In [23]:
test['dob']=pd.to_datetime(test['dob'])

In [24]:
test['age']=test['trans_date_trans_time'].dt.year-test['dob'].dt.year

In [25]:
test=test.drop(columns=['trans_date_trans_time','dob'])

In [26]:
Columns=['merchant','city','job']

for c in Columns:
    freq=data[c].value_counts()
    data[f'{c}Encoded']=data[c].map(freq)
    test[f'{c}Encoded']=test[c].map(freq)
    test[f'{c}Encoded'].fillna(0,inplace=True)

In [27]:
data

Unnamed: 0,merchant,category,amt,gender,city,state,lat,long,city_pop,job,...,merch_long,is_fraud,hour,day,month,weekday,age,merchantEncoded,cityEncoded,jobEncoded
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,36.0788,-81.1781,3495,"Psychologist, counselling",...,-82.048315,0,0,1,1,1,31,1267,2028,3545
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,48.8878,-118.2105,149,Special educational needs teacher,...,-118.186462,0,0,1,1,1,41,2503,3545,5099
2,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,42.1808,-112.2620,4154,Nature conservation officer,...,-112.154481,0,0,1,1,1,57,1895,503,511
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,M,Boulder,MT,46.2306,-112.1138,1939,Patent attorney,...,-112.561071,0,0,1,1,1,52,2613,493,2530
4,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,38.4207,-79.4629,99,Dance movement psychotherapist,...,-78.632459,0,0,1,1,1,33,1592,2017,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,fraud_Reichel Inc,entertainment,15.56,M,Hatch,UT,37.7175,-112.4777,258,Geoscientist,...,-111.690765,0,12,21,6,6,59,1907,1513,5620
1296671,fraud_Abernathy and Sons,food_dining,51.70,M,Tuscarora,MD,39.2667,-77.5101,100,"Production assistant, television",...,-78.246528,0,12,21,6,6,41,1751,531,531
1296672,fraud_Stiedemann Ltd,food_dining,105.93,M,High Rolls Mountain Park,NM,32.9396,-105.8189,899,Naval architect,...,-105.130529,0,12,21,6,6,53,1853,2070,8684
1296673,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,M,Manderson,SD,43.3526,-102.5411,1126,Volunteer coordinator,...,-103.241160,0,12,21,6,6,40,1910,2024,2024


In [28]:
test

Unnamed: 0,merchant,category,amt,gender,city,state,lat,long,city_pop,job,...,merch_long,is_fraud,hour,day,month,weekday,age,merchantEncoded,cityEncoded,jobEncoded
0,fraud_Kirlin and Sons,personal_care,2.86,M,Columbia,SC,33.9659,-80.9355,333497,Mechanical engineer,...,-81.200714,0,12,21,6,6,52,1816,1561.0,5632.0
1,fraud_Sporer-Keebler,personal_care,29.84,F,Altonah,UT,40.3207,-110.4360,302,"Sales professional, IT",...,-109.960431,0,12,21,6,6,30,1825,2089.0,5517.0
2,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,F,Bellmore,NY,40.6729,-73.5365,34496,"Librarian, public",...,-74.196111,0,12,21,6,6,50,1716,2577.0,6186.0
3,fraud_Haley Group,misc_pos,60.05,M,Titusville,FL,28.5697,-80.8191,54767,Set designer,...,-80.883061,0,12,21,6,6,33,1629,1526.0,1545.0
4,fraud_Johnston-Casper,travel,3.19,M,Falmouth,MI,44.2529,-85.0170,1126,Furniture designer,...,-85.884734,0,12,21,6,6,65,831,2034.0,2043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,fraud_Reilly and Sons,health_fitness,43.77,M,Luray,MO,40.4931,-91.8912,519,Town planner,...,-91.333331,0,23,31,12,3,54,1682,1567.0,4151.0
555715,fraud_Hoppe-Parisian,kids_pets,111.84,M,Lake Jackson,TX,29.0393,-95.4401,28739,Futures trader,...,-96.186633,0,23,31,12,3,21,2282,2556.0,4136.0
555716,fraud_Rau-Robel,kids_pets,86.88,F,Burbank,WA,46.1966,-118.9017,3684,Musician,...,-119.715054,0,23,31,12,3,39,2249,4159.0,6178.0
555717,fraud_Breitenberg LLC,travel,7.99,M,Mesa,ID,44.6255,-116.4493,129,Cartographer,...,-117.080888,0,23,31,12,3,55,800,2047.0,2038.0


In [29]:
data

Unnamed: 0,merchant,category,amt,gender,city,state,lat,long,city_pop,job,...,merch_long,is_fraud,hour,day,month,weekday,age,merchantEncoded,cityEncoded,jobEncoded
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,36.0788,-81.1781,3495,"Psychologist, counselling",...,-82.048315,0,0,1,1,1,31,1267,2028,3545
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,48.8878,-118.2105,149,Special educational needs teacher,...,-118.186462,0,0,1,1,1,41,2503,3545,5099
2,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,42.1808,-112.2620,4154,Nature conservation officer,...,-112.154481,0,0,1,1,1,57,1895,503,511
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,M,Boulder,MT,46.2306,-112.1138,1939,Patent attorney,...,-112.561071,0,0,1,1,1,52,2613,493,2530
4,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,38.4207,-79.4629,99,Dance movement psychotherapist,...,-78.632459,0,0,1,1,1,33,1592,2017,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,fraud_Reichel Inc,entertainment,15.56,M,Hatch,UT,37.7175,-112.4777,258,Geoscientist,...,-111.690765,0,12,21,6,6,59,1907,1513,5620
1296671,fraud_Abernathy and Sons,food_dining,51.70,M,Tuscarora,MD,39.2667,-77.5101,100,"Production assistant, television",...,-78.246528,0,12,21,6,6,41,1751,531,531
1296672,fraud_Stiedemann Ltd,food_dining,105.93,M,High Rolls Mountain Park,NM,32.9396,-105.8189,899,Naval architect,...,-105.130529,0,12,21,6,6,53,1853,2070,8684
1296673,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,M,Manderson,SD,43.3526,-102.5411,1126,Volunteer coordinator,...,-103.241160,0,12,21,6,6,40,1910,2024,2024


### Encoding

In [30]:
data=pd.get_dummies(data,columns=['gender'],drop_first=True)

In [31]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   merchant         1296675 non-null  object 
 1   category         1296675 non-null  object 
 2   amt              1296675 non-null  float64
 3   city             1296675 non-null  object 
 4   state            1296675 non-null  object 
 5   lat              1296675 non-null  float64
 6   long             1296675 non-null  float64
 7   city_pop         1296675 non-null  int64  
 8   job              1296675 non-null  object 
 9   unix_time        1296675 non-null  int64  
 10  merch_lat        1296675 non-null  float64
 11  merch_long       1296675 non-null  float64
 12  is_fraud         1296675 non-null  int64  
 13  hour             1296675 non-null  int32  
 14  day              1296675 non-null  int32  
 15  month            1296675 non-null  int32  
 16  weekday          1

In [32]:
data['gender_M']=data['gender_M'].astype(int)

In [33]:
test=pd.get_dummies(test,columns=['gender'],drop_first=True)

In [34]:
test

Unnamed: 0,merchant,category,amt,city,state,lat,long,city_pop,job,unix_time,...,is_fraud,hour,day,month,weekday,age,merchantEncoded,cityEncoded,jobEncoded,gender_M
0,fraud_Kirlin and Sons,personal_care,2.86,Columbia,SC,33.9659,-80.9355,333497,Mechanical engineer,1371816865,...,0,12,21,6,6,52,1816,1561.0,5632.0,True
1,fraud_Sporer-Keebler,personal_care,29.84,Altonah,UT,40.3207,-110.4360,302,"Sales professional, IT",1371816873,...,0,12,21,6,6,30,1825,2089.0,5517.0,False
2,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Bellmore,NY,40.6729,-73.5365,34496,"Librarian, public",1371816893,...,0,12,21,6,6,50,1716,2577.0,6186.0,False
3,fraud_Haley Group,misc_pos,60.05,Titusville,FL,28.5697,-80.8191,54767,Set designer,1371816915,...,0,12,21,6,6,33,1629,1526.0,1545.0,True
4,fraud_Johnston-Casper,travel,3.19,Falmouth,MI,44.2529,-85.0170,1126,Furniture designer,1371816917,...,0,12,21,6,6,65,831,2034.0,2043.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,fraud_Reilly and Sons,health_fitness,43.77,Luray,MO,40.4931,-91.8912,519,Town planner,1388534347,...,0,23,31,12,3,54,1682,1567.0,4151.0,True
555715,fraud_Hoppe-Parisian,kids_pets,111.84,Lake Jackson,TX,29.0393,-95.4401,28739,Futures trader,1388534349,...,0,23,31,12,3,21,2282,2556.0,4136.0,True
555716,fraud_Rau-Robel,kids_pets,86.88,Burbank,WA,46.1966,-118.9017,3684,Musician,1388534355,...,0,23,31,12,3,39,2249,4159.0,6178.0,False
555717,fraud_Breitenberg LLC,travel,7.99,Mesa,ID,44.6255,-116.4493,129,Cartographer,1388534364,...,0,23,31,12,3,55,800,2047.0,2038.0,True


In [35]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   merchant         555719 non-null  object 
 1   category         555719 non-null  object 
 2   amt              555719 non-null  float64
 3   city             555719 non-null  object 
 4   state            555719 non-null  object 
 5   lat              555719 non-null  float64
 6   long             555719 non-null  float64
 7   city_pop         555719 non-null  int64  
 8   job              555719 non-null  object 
 9   unix_time        555719 non-null  int64  
 10  merch_lat        555719 non-null  float64
 11  merch_long       555719 non-null  float64
 12  is_fraud         555719 non-null  int64  
 13  hour             555719 non-null  int32  
 14  day              555719 non-null  int32  
 15  month            555719 non-null  int32  
 16  weekday          555719 non-null  int3

In [36]:
test['gender_M']=test['gender_M'].astype(int)

In [37]:
test=test.drop(columns=['merchant','city','job'])

In [38]:
data=data.drop(columns=['merchant','city','job'])

In [39]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   category         555719 non-null  object 
 1   amt              555719 non-null  float64
 2   state            555719 non-null  object 
 3   lat              555719 non-null  float64
 4   long             555719 non-null  float64
 5   city_pop         555719 non-null  int64  
 6   unix_time        555719 non-null  int64  
 7   merch_lat        555719 non-null  float64
 8   merch_long       555719 non-null  float64
 9   is_fraud         555719 non-null  int64  
 10  hour             555719 non-null  int32  
 11  day              555719 non-null  int32  
 12  month            555719 non-null  int32  
 13  weekday          555719 non-null  int32  
 14  age              555719 non-null  int32  
 15  merchantEncoded  555719 non-null  int64  
 16  cityEncoded      555719 non-null  floa

In [40]:
Columns=['category','state']

for c in Columns:
    freq=data[c].value_counts()
    data[f'{c}Encoded']=data[c].map(freq)
    test[f'{c}Encoded']=test[c].map(freq)
    test[f'{c}Encoded'].fillna(0,inplace=True)

In [41]:
data

Unnamed: 0,category,amt,state,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,...,day,month,weekday,age,merchantEncoded,cityEncoded,jobEncoded,gender_M,categoryEncoded,stateEncoded
0,misc_net,4.97,NC,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,...,1,1,1,31,1267,2028,3545,0,63287,30266
1,grocery_pos,107.23,WA,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,...,1,1,1,41,2503,3545,5099,0,123638,18924
2,entertainment,220.11,ID,42.1808,-112.2620,4154,1325376051,43.150704,-112.154481,0,...,1,1,1,57,1895,503,511,1,94014,5545
3,gas_transport,45.00,MT,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,...,1,1,1,52,2613,493,2530,1,131659,11754
4,misc_pos,41.96,VA,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,...,1,1,1,33,1592,2017,2017,1,79655,29250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,entertainment,15.56,UT,37.7175,-112.4777,258,1371816728,36.841266,-111.690765,0,...,21,6,6,59,1907,1513,5620,1,94014,10699
1296671,food_dining,51.70,MD,39.2667,-77.5101,100,1371816739,38.906881,-78.246528,0,...,21,6,6,41,1751,531,531,1,91461,26193
1296672,food_dining,105.93,NM,32.9396,-105.8189,899,1371816752,33.619513,-105.130529,0,...,21,6,6,53,1853,2070,8684,1,91461,16407
1296673,food_dining,74.90,SD,43.3526,-102.5411,1126,1371816816,42.788940,-103.241160,0,...,21,6,6,40,1910,2024,2024,1,91461,12324


In [42]:
test=test.drop(columns=['category','state'])

In [43]:
data=data.drop(columns=['category','state'])

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 19 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   amt              1296675 non-null  float64
 1   lat              1296675 non-null  float64
 2   long             1296675 non-null  float64
 3   city_pop         1296675 non-null  int64  
 4   unix_time        1296675 non-null  int64  
 5   merch_lat        1296675 non-null  float64
 6   merch_long       1296675 non-null  float64
 7   is_fraud         1296675 non-null  int64  
 8   hour             1296675 non-null  int32  
 9   day              1296675 non-null  int32  
 10  month            1296675 non-null  int32  
 11  weekday          1296675 non-null  int32  
 12  age              1296675 non-null  int32  
 13  merchantEncoded  1296675 non-null  int64  
 14  cityEncoded      1296675 non-null  int64  
 15  jobEncoded       1296675 non-null  int64  
 16  gender_M         1

In [45]:
x=data.drop('is_fraud',axis=1)
y=data['is_fraud']

In [47]:
from imblearn.under_sampling import RandomUnderSampler
# Initialize the sampler
sampler = RandomUnderSampler(random_state=42)



# Perform random undersampling
xr, yr = sampler.fit_resample(x, y)

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
!pip install lightgbm



In [50]:
model = lgb.LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, is_unbalance=True, n_jobs=-1, random_state=42)

model.fit(xr, yr)

[LightGBM] [Info] Number of positive: 7506, number of negative: 7506
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000585 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2733
[LightGBM] [Info] Number of data points in the train set: 15012, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


 ## Evaluation

In [51]:
xt=test.drop('is_fraud',axis=1)
yt=test['is_fraud']

In [52]:


#Calculating Prediction
y_pred = model.predict(xr)
y_pred_prob = model.predict_proba(xt)
print('Predicted Value for RandomForestClassifierModel is : ' , y_pred[:10])
print('Prediction Probabilities Value for RandomForestClassifierModel is : ' , y_pred_prob[:10])

Predicted Value for RandomForestClassifierModel is :  [0 0 0 0 0 0 0 0 0 0]
Prediction Probabilities Value for RandomForestClassifierModel is :  [[0.99706913 0.00293087]
 [0.99268903 0.00731097]
 [0.99703765 0.00296235]
 [0.99845337 0.00154663]
 [0.98145678 0.01854322]
 [0.20823764 0.79176236]
 [0.99530321 0.00469679]
 [0.89779221 0.10220779]
 [0.99832718 0.00167282]
 [0.98761886 0.01238114]]


## **Our focus on Recall**

In [53]:
y_pred = model.predict(xt)


print(confusion_matrix(yt, y_pred))
print(classification_report(yt, y_pred))

[[540016  13558]
 [    61   2084]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.13      0.97      0.23      2145

    accuracy                           0.98    555719
   macro avg       0.57      0.97      0.61    555719
weighted avg       1.00      0.98      0.98    555719

