In [1]:
import numpy as np
import pandas as pd
from lazypredict import LazyClassifier
import pickle
from sklearn.model_selection import train_test_split
import pathlib
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder



In [2]:
# Project root is taken to be the parent of the current working directory;
# PROJECT_PATH stays a plain string, the derived locations are pathlib paths.
PROJECT_PATH = os.path.join(os.getcwd(), "..")

# Trained models live under store/models, raw datasets under data/.
MODELS_DIR = pathlib.Path(PROJECT_PATH, "store", "models")
DATASET_DIR = pathlib.Path(PROJECT_PATH, "data")

# Loan Approval

In [3]:
# NOTE(review): "load_approval" looks like a typo for "loan_approval"; it is
# left unchanged because it has to match the directory name on disk - confirm.
DATASET_NAME = "load_approval"

# CSV file for this dataset inside the shared data directory.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME, "dataset.csv")


In [4]:
# Load the loan-approval table and show a preview.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.00,female,Master,71948.00,0,RENT,35000.00,PERSONAL,16.02,0.49,3.00,561,No,1
1,21.00,female,High School,12282.00,0,OWN,1000.00,EDUCATION,11.14,0.08,2.00,504,Yes,0
2,25.00,female,High School,12438.00,3,MORTGAGE,5500.00,MEDICAL,12.87,0.44,3.00,635,No,1
3,23.00,female,Bachelor,79753.00,0,RENT,35000.00,MEDICAL,15.23,0.44,2.00,675,No,1
4,24.00,male,Master,66135.00,1,RENT,35000.00,MEDICAL,14.27,0.53,4.00,586,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.00,male,Associate,47971.00,6,RENT,15000.00,MEDICAL,15.66,0.31,3.00,645,No,1
44996,37.00,female,Associate,65800.00,17,RENT,9000.00,HOMEIMPROVEMENT,14.07,0.14,11.00,621,No,1
44997,33.00,male,Associate,56942.00,7,RENT,2771.00,DEBTCONSOLIDATION,10.02,0.05,10.00,668,No,1
44998,29.00,male,Bachelor,33164.00,4,RENT,12000.00,EDUCATION,13.23,0.36,6.00,604,No,1


In [6]:
# Total number of missing cells across the whole frame.
dataset.isnull().sum(axis=0).sum()

0

# Secondary Mushroom

In [3]:
DATASET_NAME = "secondary_mushroom"

# CSV file for this dataset inside the shared data directory.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME, "dataset.csv")


In [4]:

# Load the secondary-mushroom table and show a preview.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,...,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,class
0,6.76,b,t,w,f,e,c,w,12.59,12.37,...,y,w,u,w,t,l,,d,u,p
1,6.83,b,t,w,f,e,c,w,13.84,13.54,...,y,w,u,w,t,l,,d,u,p
2,9.10,b,t,w,f,e,c,w,13.79,13.68,...,y,w,u,w,t,e,,d,a,p
3,8.27,b,t,w,f,e,c,w,12.63,12.46,...,y,w,u,w,t,l,,d,a,p
4,7.95,b,t,w,f,e,c,w,13.10,13.78,...,y,w,u,w,t,e,,d,a,p
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23293,1.18,s,s,y,f,f,f,f,3.93,6.22,...,,y,,,f,f,,d,a,p
23294,1.27,f,s,y,f,f,f,f,3.18,5.43,...,,y,,,f,f,,d,a,p
23295,1.27,s,s,y,f,f,f,f,3.86,6.37,...,,y,,,f,f,,d,u,p
23296,1.24,f,s,y,f,f,f,f,3.56,5.44,...,,y,,,f,f,,d,u,p


In [5]:
# Missing-value count for each column.
dataset.isnull().sum(axis=0)

cap-diameter                0
cap-shape                   0
cap-surface                 0
cap-color                   0
does-bruise-or-bleed        0
gill-attachment             0
gill-spacing                0
gill-color                  0
stem-height                 0
stem-width                  0
stem-root               20827
stem-surface            12002
stem-color                  0
veil-type               22945
veil-color              21886
has-ring                    0
ring-type                1059
spore-print-color       21180
habitat                     0
season                      0
class                       0
dtype: int64

In [13]:
# (rows, columns) of the loaded frame.
# NOTE(review): the saved output of this cell reports 61069 rows while the saved
# table preview ends at index 23296 - the cells were apparently executed against
# different versions of the file; re-run top to bottom to confirm.
tuple(dataset.shape)

(61069, 21)

In [21]:
# Class balance of the target column.
dataset.loc[:, "class"].value_counts()

class
p    33888
e    27181
Name: count, dtype: int64

In [26]:
# Keep only rows where gill-spacing, gill-attachment and cap-surface are all
# present, then write the cleaned table back to DATASET_PATH.
# The write replaces the very file this notebook reads from, so first store a
# one-time snapshot of the frame as loaded; without it the dropped rows cannot
# be recovered by re-running the notebook.
# NOTE(review): the saved isna() output already shows 0 missing values in these
# three columns, so the file on disk may have been overwritten by an earlier
# run - confirm against the original download.
ORIGINAL_SNAPSHOT_PATH = DATASET_PATH.with_name("dataset_original.csv")
if not ORIGINAL_SNAPSHOT_PATH.exists():
    dataset.to_csv(ORIGINAL_SNAPSHOT_PATH, index=False)

dataset.dropna(subset=['gill-spacing', 'gill-attachment', 'cap-surface']).to_csv(DATASET_PATH, index=False)

# Dry Bean

In [27]:
DATASET_NAME = "dry_bean"

# In this section DATASET_PATH is the dataset *directory* (other sections point
# it at the CSV file itself); make sure it exists before anything is written.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME)
DATASET_PATH.mkdir(exist_ok=True, parents=True)

In [28]:
from ucimlrepo import fetch_ucirepo

# Load the Dry Bean dataset (UCI id 602). The CSV under DATASET_PATH acts as a
# local cache so the download only happens when the file is missing.
path = DATASET_PATH / "dataset.csv"

if path.exists():
    dataset = pd.read_csv(path)
    # The cache is written as [features..., target], so the last column is the
    # label. Slice with -1: (not -1) so y is a one-column DataFrame, the same
    # type the fetch branch yields; copy() so later label encoding does not
    # write into a view of `dataset`.
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1:].copy()
else:
    # fetch dataset
    dry_bean = fetch_ucirepo(id=602)

    # data (as pandas dataframes)
    X = dry_bean.data.features
    y = dry_bean.data.targets

    # Keep `dataset` in sync with X/y in this branch too, and reuse `path`
    # instead of rebuilding the same location a second time.
    dataset = pd.concat([X, y], axis=1)
    dataset.to_csv(path, index=False)

y.value_counts()

Class   
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
Name: count, dtype: int64

In [29]:
# Encode the bean varieties as integer class ids. The ids follow the class
# frequencies shown by the previous value_counts() output (0 = most frequent).
# Rebinding y (instead of replace(..., inplace=True)) leaves the object y was
# taken from untouched, and astype(int) makes the resulting dtype explicit, in
# line with the label encoding used for the other datasets in this notebook.
y = y.replace({"DERMASON": 0, "SIRA": 1, "SEKER": 2, "HOROZ": 3, "CALI": 4, "BARBUNYA": 5, "BOMBAY": 6}).astype(int)
y.value_counts()

Class
0        3546
1        2636
2        2027
3        1928
4        1630
5        1322
6         522
Name: count, dtype: int64

In [30]:
# Prepare the Dry Bean features/labels for training. Note the return order:
# train features, train labels, test features, test labels.
# NOTE(review): `preprocess` is neither defined nor imported in the visible part
# of this notebook; it has to be brought into scope before this cell for a
# Restart-and-Run-All to succeed - confirm where it comes from. Judging by the
# printed output it scales numerical columns and also builds a sample subset.
X_train, y_train, X_test, y_test = preprocess(X, y, DATASET_PATH)


Scaling numerical columns...
X_train shape: (12249, 16)
X_test shape: (1362, 16)
y_train shape: (12249, 1)
y_test shape: (1362, 1)
X_sample shape: (2450, 16)
y_sample shape: (2450, 1)


In [31]:
# Fit the candidate classifiers on the train split and evaluate on the test split.
# NOTE(review): `train_and_save_models` is neither defined nor imported in the
# visible part of this notebook - confirm where it comes from; the name suggests
# it also persists the fitted models, which cannot be verified from here.
train_and_save_models(X_train, y_train, X_test, y_test)


 97%|█████████▋| 28/29 [00:41<00:01,  1.02s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000658 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 12249, number of used features: 16
[LightGBM] [Info] Start training from score -1.345110
[LightGBM] [Info] Start training from score -1.641711
[LightGBM] [Info] Start training from score -1.904412
[LightGBM] [Info] Start training from score -1.954437
[LightGBM] [Info] Start training from score -2.122225
[LightGBM] [Info] Start training from score -2.331491
[LightGBM] [Info] Start training from score -3.260467


100%|██████████| 29/29 [00:42<00:00,  1.48s/it]






Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.93,0.94,,0.93,1.0
KNeighborsClassifier,0.92,0.93,,0.92,0.09
RandomForestClassifier,0.93,0.93,,0.93,9.57
LGBMClassifier,0.93,0.93,,0.93,1.54
ExtraTreesClassifier,0.93,0.93,,0.93,1.15
XGBClassifier,0.92,0.93,,0.92,1.53
LogisticRegression,0.92,0.93,,0.92,0.31
LabelSpreading,0.91,0.93,,0.91,13.89
LabelPropagation,0.91,0.92,,0.91,4.29
QuadraticDiscriminantAnalysis,0.91,0.92,,0.91,0.05


# Bank Marketing

In [29]:
DATASET_NAME = "bank_marketing"

# CSV file for this dataset inside the shared data directory.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME, "dataset.csv")


In [30]:
# Load the bank-marketing table and show a preview.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,,no


In [31]:
# Missing-value count for each column.
dataset.isnull().sum(axis=0)

age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
y                  0
dtype: int64

In [35]:
# Keep only rows where contact, education and job are all present, then write
# the cleaned table back to DATASET_PATH.
# The write replaces the very file this notebook reads from, so first store a
# one-time snapshot of the frame as loaded; without it the dropped rows cannot
# be recovered by re-running the notebook.
ORIGINAL_SNAPSHOT_PATH = DATASET_PATH.with_name("dataset_original.csv")
if not ORIGINAL_SNAPSHOT_PATH.exists():
    dataset.to_csv(ORIGINAL_SNAPSHOT_PATH, index=False)

dataset.dropna(subset=['contact', 'education', 'job']).to_csv(DATASET_PATH, index=False)

# Adult

In [37]:
DATASET_NAME = "adult"

# CSV file for this dataset inside the shared data directory.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME, "dataset.csv")

In [39]:
# Load the adult (census income) table and show a preview.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [40]:
# Missing-value count for each column.
dataset.isnull().sum(axis=0)

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64

In [42]:
# Keep only rows where occupation and workclass are both present, then write
# the cleaned table back to DATASET_PATH.
# The write replaces the very file this notebook reads from, so first store a
# one-time snapshot of the frame as loaded; without it the dropped rows cannot
# be recovered by re-running the notebook.
ORIGINAL_SNAPSHOT_PATH = DATASET_PATH.with_name("dataset_original.csv")
if not ORIGINAL_SNAPSHOT_PATH.exists():
    dataset.to_csv(ORIGINAL_SNAPSHOT_PATH, index=False)

dataset.dropna(subset=['occupation', 'workclass']).to_csv(DATASET_PATH, index=False)

# HELOC

In [43]:
DATASET_NAME = "heloc"

# CSV file for this dataset inside the shared data directory.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME, "dataset.csv")


In [44]:
# Load the HELOC table and show a preview.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,Good,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,Bad,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,Bad,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,Bad,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [45]:
# Missing-value count for each column.
dataset.isnull().sum(axis=0)

RiskPerformance                       0
ExternalRiskEstimate                  0
MSinceOldestTradeOpen                 0
MSinceMostRecentTradeOpen             0
AverageMInFile                        0
NumSatisfactoryTrades                 0
NumTrades60Ever2DerogPubRec           0
NumTrades90Ever2DerogPubRec           0
PercentTradesNeverDelq                0
MSinceMostRecentDelq                  0
MaxDelq2PublicRecLast12M              0
MaxDelqEver                           0
NumTotalTrades                        0
NumTradesOpeninLast12M                0
PercentInstallTrades                  0
MSinceMostRecentInqexcl7days          0
NumInqLast6M                          0
NumInqLast6Mexcl7days                 0
NetFractionRevolvingBurden            0
NetFractionInstallBurden              0
NumRevolvingTradesWBalance            0
NumInstallTradesWBalance              0
NumBank2NatlTradesWHighUtilization    0
PercentTradesWBalance                 0
dtype: int64

In [22]:
# Build the binary target from the HELOC frame: "Bad" becomes 0, "Good" becomes 1.
# The column is selected from `dataset` explicitly because no visible cell in
# this section assigns y; relying on a y left in the kernel by an earlier
# section would encode the wrong labels after a Restart-and-Run-All.
y = dataset["RiskPerformance"].replace({"Bad": 0, "Good": 1}).astype(int)
y.value_counts()

RiskPerformance
0    5459
1    5000
Name: count, dtype: int64

# Gesture Phase

In [37]:
DATASET_NAME = "gesture_phase"

# CSV file for this dataset inside the shared data directory.
DATASET_PATH = DATASET_DIR.joinpath(DATASET_NAME, "dataset.csv")

In [46]:
# Load the gesture-phase table and show a preview.
# NOTE(review): the output saved under this cell lists HELOC columns
# (RiskPerformance, ExternalRiskEstimate, ...), so it appears to have been run
# while DATASET_PATH still pointed at the HELOC file - re-run to refresh.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)

dataset

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,Good,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,Bad,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,Bad,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,Bad,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [47]:
# Missing-value count for each column.
dataset.isnull().sum(axis=0)

RiskPerformance                       0
ExternalRiskEstimate                  0
MSinceOldestTradeOpen                 0
MSinceMostRecentTradeOpen             0
AverageMInFile                        0
NumSatisfactoryTrades                 0
NumTrades60Ever2DerogPubRec           0
NumTrades90Ever2DerogPubRec           0
PercentTradesNeverDelq                0
MSinceMostRecentDelq                  0
MaxDelq2PublicRecLast12M              0
MaxDelqEver                           0
NumTotalTrades                        0
NumTradesOpeninLast12M                0
PercentInstallTrades                  0
MSinceMostRecentInqexcl7days          0
NumInqLast6M                          0
NumInqLast6Mexcl7days                 0
NetFractionRevolvingBurden            0
NetFractionInstallBurden              0
NumRevolvingTradesWBalance            0
NumInstallTradesWBalance              0
NumBank2NatlTradesWHighUtilization    0
PercentTradesWBalance                 0
dtype: int64

In [40]:
# Encode the gesture phase letters (S, D, P, R, H) as integer class ids 0-4.
# The column is selected from `dataset` explicitly because no visible cell in
# this section assigns y; relying on a y left in the kernel by an earlier
# section would encode the wrong labels after a Restart-and-Run-All.
# NOTE(review): the column name "Phase" is taken from the Series name in this
# cell's saved output - confirm it matches the CSV header.
y = dataset["Phase"].replace({"S": 0, "D": 1, "P": 2, "R": 3, "H": 4}).astype(int)
y.value_counts()

Phase
0    2950
1    2741
2    2097
3    1087
4     998
Name: count, dtype: int64

# TUANDROMD

In [10]:
DATASET_NAME = "tunadromd"

# Build the location from DATASET_DIR instead of a user-specific absolute path
# so the notebook runs on any checkout. The file name is spelled
# "TUANDROMD.csv", matching the file the previous absolute path pointed at (the
# earlier DATASET_PATH used "TUNADROMD.csv" and was never actually read).
# NOTE(review): assumes DATASET_DIR resolves to the project's data/ directory,
# i.e. the notebook is started one level below the project root - confirm.
DATASET_PATH = DATASET_DIR / DATASET_NAME / "TUANDROMD.csv"

dataset = pd.read_csv(DATASET_PATH)

dataset

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,1.00,1.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00,1.00
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,1.00,0.00,0.00,1.00,0.00,0.00,1.00
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,1.00,1.00,1.00,0.00,1.00,0.00,0.00,1.00
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4460,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00,1.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
4461,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,1.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4462,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4463,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
count,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,...,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0
mean,0.0,0.0,0.0,0.08,0.01,0.09,0.02,0.02,0.0,0.6,...,0.17,0.25,0.22,0.09,0.09,0.06,0.13,0.01,0.15,0.8
std,0.03,0.03,0.07,0.28,0.08,0.28,0.15,0.15,0.01,0.49,...,0.38,0.43,0.41,0.29,0.28,0.23,0.34,0.12,0.36,0.4
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Total number of missing cells across the whole frame.
dataset.isnull().sum(axis=0).sum()

242

In [13]:
# Remove every row that contains at least one missing value, then confirm that
# no NaNs remain and show the resulting shape.
dataset = dataset.dropna(axis=0, how="any")
dataset.isnull().sum().sum(), dataset.shape

(0, (4464, 242))

# Phishing


In [15]:
from ucimlrepo import fetch_ucirepo

# Download the Phishing Websites dataset (UCI id 327) and unpack it into the
# feature table X and the target table y (both pandas objects).
phishing_websites = fetch_ucirepo(id=327)
X, y = phishing_websites.data.features, phishing_websites.data.targets
  



{'uci_id': 327, 'name': 'Phishing Websites', 'repository_url': 'https://archive.ics.uci.edu/dataset/327/phishing+websites', 'data_url': 'https://archive.ics.uci.edu/static/public/327/data.csv', 'abstract': 'This dataset collected mainly from: PhishTank archive, MillerSmiles archive, Googleâ€™s searching operators.', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 11055, 'num_features': 30, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['result'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2012, 'last_updated': 'Tue Mar 05 2024', 'dataset_doi': '10.24432/C51W2X', 'creators': ['Rami Mohammad', 'Lee McCluskey'], 'intro_paper': {'ID': 396, 'type': 'NATIVE', 'title': 'An assessment of features related to phishing websites using an automated technique', 'authors': 'R. Mohammad, F. Thabtah, L. Mccluskey', 'venue': 'International Conference for Internet Tec

In [20]:
# Map the -1 label to 0 so the target takes the values 0 and 1.
y = y.replace(to_replace=-1, value=0)
y.value_counts()

result
1         6157
0         4898
Name: count, dtype: int64

In [21]:
DATASET_NAME = "phishing"
DATASET_PATH = DATASET_DIR / DATASET_NAME / "dataset.csv"

# Assemble the table to save from the phishing features X and labels y.
# Previously the labels were attached to whatever `dataset` was left in the
# kernel (the frame loaded in the preceding section), so the file written here
# did not contain the phishing features at all.
# X.copy() keeps the fetched feature table unmodified; to_numpy().ravel() turns
# y into a flat array whether it is a Series or a one-column DataFrame.
dataset = X.copy()
dataset['label'] = y.to_numpy().ravel()

# Create the dataset directory if needed so the write cannot fail on a fresh checkout.
DATASET_PATH.parent.mkdir(parents=True, exist_ok=True)
dataset.to_csv(DATASET_PATH, index=False)