# Checking the new libraries and how they work

# Sweetviz

### Installing sweetviz library

In [12]:
pip install sweetviz

Note: you may need to restart the kernel to use updated packages.


### Importing Libraries

In [2]:
import sweetviz as sv
import pandas as pd

### Load the dataset

In [3]:
df = pd.read_csv('gender_classification_v7.csv')
df

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female
...,...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0,Female
4997,1,11.9,5.4,0,0,0,0,Female
4998,1,12.9,5.7,0,0,0,0,Female
4999,1,13.2,6.2,0,0,0,0,Female


### Generate and display the report

In [4]:
my_report = sv.analyze(df)
my_report.show_html('sweetviz_report.html')

                                             |                                             | [  0%]   00:00 ->…

Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# LazyPredict

### Installing lazypredict

In [5]:
pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
     ---------------------------------------- 99.8/99.8 MB 5.0 MB/s eta 0:00:00
Collecting lightgbm
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl (1.3 MB)
     ---------------------------------------- 1.3/1.3 MB 9.5 MB/s eta 0:00:00
Installing collected packages: xgboost, lightgbm, lazypredict
Successfully installed lazypredict-0.2.12 lightgbm-4.3.0 xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.


### Importing lazypredict library

In [6]:
from lazypredict.Supervised import LazyClassifier, LazyRegressor

### Importing libararies

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Load the dataset 

In [None]:
df = pd.read_csv('gender_classification_v7.csv')

### Assume the target variable is in the 'nose_wide' column

In [None]:
X = df.drop('nose_wide', axis=1)
y = df['nose_wide']

### Split the data into training and testing sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from lazypredict.Supervised import LazyClassifier

# Create a LazyClassifier
clf = LazyClassifier()

# Fit the LazyClassifier on your training data
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:06<00:00,  4.38it/s]

[LightGBM] [Info] Number of positive: 1976, number of negative: 2024
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494000 -> initscore=-0.024001
[LightGBM] [Info] Start training from score -0.024001





In [10]:
from lazypredict.Supervised import LazyRegressor

# Create a LazyRegressor
reg = LazyRegressor()

# Fit the LazyRegressor on your training data
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [24:35<00:00, 35.14s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 8
[LightGBM] [Info] Start training from score 0.494000





### Print model and predictions 

In [11]:
print(models)
print(predictions)

                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
LassoLarsIC                                  0.57       0.57  0.33        0.03
OrthogonalMatchingPursuitCV                  0.57       0.57  0.33        0.04
LassoCV                                      0.56       0.57  0.33        0.13
LassoLarsCV                                  0.56       0.57  0.33        0.05
ElasticNetCV                                 0.56       0.57  0.33        0.16
TransformedTargetRegressor                   0.56       0.57  0.33        0.03
LinearRegression                             0.56       0.57  0.33        0.03
RidgeCV                                      0.56       0.57  0.33        0.03
Ridge                                        0.56       0.57  0.33        0.06
KernelRidge                                  0.56       0.57  0.33        1.17
BayesianRidge                                0.56   

# Autoviz

### Installing autoviz library

In [17]:
pip install autoviz

Collecting autoviz
  Using cached autoviz-0.1.804-py3-none-any.whl (68 kB)
Collecting pyamg
  Using cached pyamg-5.0.1-cp310-cp310-win_amd64.whl (1.6 MB)
Collecting wordcloud
  Using cached wordcloud-1.9.3-cp310-cp310-win_amd64.whl (299 kB)
Collecting holoviews~=1.14.9
  Using cached holoviews-1.14.9-py2.py3-none-any.whl (4.3 MB)
Collecting hvplot~=0.7.3
  Using cached hvplot-0.7.3-py2.py3-none-any.whl (3.1 MB)
Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Collecting pandas-dq>=1.29
  Using cached pandas_dq-1.29-py3-none-any.whl (29 kB)
Installing collected packages: pyamg, wordcloud, textblob, pandas-dq, holoviews, hvplot, autoviz
  Attempting uninstall: holoviews
    Found existing installation: holoviews 1.15.4
    Uninstalling holoviews-1.15.4:
      Successfully uninstalled holoviews-1.15.4
  Attempting uninstall: hvplot
    Found existing installation: hvplot 0.8.2
    Uninstalling hvplot-0.8.2:
      Successfully uninstalled hvplot-0.8.2
Succes

### Import autoviz library

In [18]:
from autoviz.AutoViz_Class import AutoViz_Class

Imported v0.1.804. After importing autoviz, you must run '%matplotlib inline' to display charts inline.
    AV = AutoViz_Class()
    dfte = AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=1, lowess=False,
               chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30, save_plot_dir=None)


### Load the dataset

In [19]:
import pandas as pd
df = pd.read_csv('gender_classification_v7.csv')

### Create an autoviz object

In [20]:
AV = AutoViz_Class()
report = AV.AutoViz('gender_classification_v7.csv')

Shape of your Data Set loaded: (5001, 8)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  2
    Number of Integer-Categorical Columns =  0
    Number of String-Categorical Columns =  0
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  1
    Number of Numeric-Boolean Columns =  5
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  0
    Number of Columns to Delete =  0
    8 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
To fix these data quality issues in the dataset, import FixDQ from autoviz...
There are 1768 du

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
long_hair,int64,0.0,0.0,0.0,1.0,No issue
forehead_width_cm,float64,0.0,,11.4,15.5,No issue
forehead_height_cm,float64,0.0,,5.1,7.1,No issue
nose_wide,int64,0.0,0.0,0.0,1.0,No issue
nose_long,int64,0.0,0.0,0.0,1.0,No issue
lips_thin,int64,0.0,0.0,0.0,1.0,No issue
distance_nose_to_lip_long,int64,0.0,0.0,0.0,1.0,No issue
gender,object,0.0,0.0,,,No issue


Number of All Scatter Plots = 3
All Plots done
Time to run AutoViz = 3 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


# H2O

### Installing h2o libraray

In [1]:
pip install h2o

Collecting h2o
  Downloading h2o-3.44.0.3.tar.gz (265.2 MB)
     -------------------------------------- 265.2/265.2 MB 3.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py): started
  Building wheel for h2o (setup.py): finished with status 'done'
  Created wheel for h2o: filename=h2o-3.44.0.3-py2.py3-none-any.whl size=265293979 sha256=a4cb488489f93e968cab1bc3155163a0015752cd7b7267c752cdca6133ffcd24
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\55\13\0c\7b9b5c614302b9b7d227a81d14a7ce1a4b8997a0ea1651de00
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.3
Note: you may need to restart the kernel to use updated packages.


### Importing Libraries

In [2]:
import h2o
from h2o.automl import H2OAutoML
import pandas as pd

### Initialize H2O cluster

In [5]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.401-b10, mixed mode)
  Starting server from C:\Users\admin\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\admin\AppData\Local\Temp\tmpo5d73zi_
  JVM stdout: C:\Users\admin\AppData\Local\Temp\tmpo5d73zi_\h2o_admin_started_from_python.out
  JVM stderr: C:\Users\admin\AppData\Local\Temp\tmpo5d73zi_\h2o_admin_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,1 month and 11 days
H2O_cluster_name:,H2O_from_python_admin_tn5xxc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,880 Mb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


### Load the dataset

In [7]:
df=pd.read_csv('titanic_train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [8]:
#Assuming your DataFrame is named 'df'
# Drop rows with missing target values (if any)
#df = df.dropna(subset=['Survived'])

### Convert Pandas DataFrame to H2O Frame

In [9]:
h2o_df = h2o.H2OFrame(df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


### Identify predictors and response

In [11]:
x = h2o_df.columns
y = 'Parch'
x.remove(y)

### Run AutoML

In [13]:
aml = H2OAutoML(max_runtime_secs=600)
# You can adjust the max runtime as needed
aml.train(x=x, y=y, training_frame=h2o_df)

AutoML progress: |
11:49:08.354: AutoML: XGBoost is not available; skipping it.
11:49:08.460: _train param, Dropping bad and constant columns: [Name]


11:49:09.333: _train param, Dropping bad and constant columns: [Name]

█
11:49:10.801: _train param, Dropping unused columns: [Name]
11:49:10.961: _train param, Dropping bad and constant columns: [Name]

██
11:49:12.231: _train param, Dropping bad and constant columns: [Name]

█
11:49:13.218: _train param, Dropping bad and constant columns: [Name]

█
11:49:14.133: _train param, Dropping bad and constant columns: [Name]
11:49:14.778: _train param, Dropping unused columns: [Name]

██
11:49:14.962: _train param, Dropping unused columns: [Name]
11:49:15.107: _train param, Dropping bad and constant columns: [Name]
11:49:16.111: _train param, Dropping bad and constant columns: [Name]

████
11:49:16.533: _train param, Dropping bad and constant columns: [Name]
11:49:17.358: _train param, Dropping unused columns: [Name]
11:49:17.489: _train para

key,value
Stacking strategy,cross_validation
Number of base models (used / total),164/188
# GBM base models (used / total),143/167
# DRF base models (used / total),2/2
# DeepLearning base models (used / total),18/18
# GLM base models (used / total),1/1
Metalearner algorithm,GBM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.2507482,0.0187411,0.2544082,0.224722,0.2648464,0.2703936,0.2393707
mean_residual_deviance,0.2332754,0.0458127,0.2507043,0.2122398,0.300267,0.2255858,0.1775802
mse,0.2332754,0.0458127,0.2507043,0.2122398,0.300267,0.2255858,0.1775802
r2,0.6160717,0.1195264,0.6926796,0.4999025,0.4724835,0.6936335,0.7216595
residual_deviance,0.2332754,0.0458127,0.2507043,0.2122398,0.300267,0.2255858,0.1775802
rmse,0.4811453,0.047099,0.5007038,0.4606948,0.5479663,0.4749587,0.4214027
rmsle,0.2644425,0.0161273,0.2592798,0.2583736,0.2715624,0.2879193,0.2450772


### View the AutoML leaderboard

In [14]:
lb = aml.leaderboard
print(lb)

model_id                                                     rmse       mse       mae     rmsle    mean_residual_deviance
StackedEnsemble_AllModels_5_AutoML_1_20240201_114908     0.480467  0.230849  0.25044   0.264092                  0.230849
StackedEnsemble_BestOfFamily_4_AutoML_1_20240201_114908  0.486308  0.236495  0.256423  0.264496                  0.236495
GBM_grid_1_AutoML_1_20240201_114908_model_70             0.486334  0.236521  0.24651   0.263506                  0.236521
StackedEnsemble_AllModels_6_AutoML_1_20240201_114908     0.487513  0.237669  0.253085  0.26178                   0.237669
StackedEnsemble_BestOfFamily_5_AutoML_1_20240201_114908  0.487842  0.23799   0.254699  0.26705                   0.23799
StackedEnsemble_AllModels_2_AutoML_1_20240201_114908     0.488428  0.238562  0.254932  0.267334                  0.238562
GBM_grid_1_AutoML_1_20240201_114908_model_124            0.489254  0.23937   0.241601  0.264335                  0.23937
GBM_grid_1_AutoML_1_202402

### Get the best model

In [15]:
best_model = aml.leader
print(best_model)

Model Details
H2OStackedEnsembleEstimator : Stacked Ensemble
Model Key: StackedEnsemble_AllModels_5_AutoML_1_20240201_114908


Model Summary for Stacked Ensemble: 
key                                        value
-----------------------------------------  ----------------
Stacking strategy                          cross_validation
Number of base models (used / total)       164/188
# GBM base models (used / total)           143/167
# DRF base models (used / total)           2/2
# DeepLearning base models (used / total)  18/18
# GLM base models (used / total)           1/1
Metalearner algorithm                      GBM
Metalearner fold assignment scheme         Random
Metalearner nfolds                         5
Metalearner fold_column
Custom metalearner hyperparameters         None

ModelMetricsRegression: stackedensemble
** Reported on train data. **

MSE: 0.034210973991665124
RMSE: 0.1849620879847141
MAE: 0.09974420307994346
RMSLE: 0.1000479555565082
Mean Residual Deviance: 0.03421097