# Machine Learning  

## Decision Tree Algorithm

In [169]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [170]:
# Pull the "penguins" example dataset through seaborn and preview its first five rows.
df = sns.load_dataset("penguins")
print(df.head(n=5))

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [171]:
# Summarise every column's dtype and non-null count to spot where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [172]:
# Count the missing entries in each column and list the worst offenders first.
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False))

sex                  11
bill_depth_mm         2
bill_length_mm        2
flipper_length_mm     2
body_mass_g           2
island                0
species               0
dtype: int64


In [173]:
# Categorical column: fill the gaps with the most frequent value (the mode).
df["sex"] = df["sex"].fillna(df["sex"].mode()[0])

# Numeric measurements: fill the gaps with each column's own mean.
# NOTE(review): the means are computed on the full frame, before the train/test
# split further down, so test rows contribute to the fill values.
for numeric_col in ["bill_depth_mm", "bill_length_mm", "flipper_length_mm", "body_mass_g"]:
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())



In [174]:
# Repeat the per-column missing-value count to confirm the imputation left no gaps.
remaining_missing = df.isna().sum().sort_values(ascending=False)
print(remaining_missing)

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [175]:
# Encode every object (string) column as integer codes with LabelEncoder.
# A separate encoder is fitted per column and kept in `label_encoders`, so each
# string <-> integer mapping stays available afterwards, e.g.
# `label_encoders["island"].inverse_transform(y_pred)` turns predictions back
# into island names. Re-fitting one shared encoder would keep only the mapping
# of the last column processed.
label_encoders = {}

# Loop through the categorical columns and encode them in place.
for column in df.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder


In [176]:
# The island is the prediction target; every remaining column is a feature.
y = df["island"]
X = df.drop(columns="island")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [177]:
%%time
# train the Decision Tree model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# predict the test data
y_pred_dt = dt.predict(X_test)

print('Confusion Matrix:', confusion_matrix(y_test, y_pred_dt))
print('Accuracy score: ', accuracy_score(y_test, y_pred_dt))
print('Precision score: ', precision_score(y_test, y_pred_dt, average='micro'))
print('Recall score: ', recall_score(y_test, y_pred_dt, average='micro'))
print('F1 score: ', f1_score(y_test, y_pred_dt, average='micro'))

Confusion Matrix: [[24  2  5]
 [ 2 20  3]
 [ 1  9  3]]
Accuracy score:  0.6811594202898551
Precision score:  0.6811594202898551
Recall score:  0.6811594202898551
F1 score:  0.6811594202898551
CPU times: total: 46.9 ms
Wall time: 116 ms


## Random Forest Algorithm

In [178]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [179]:
%%time
# Train the Random Forest model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# predict the test data
y_pred_rf = rf.predict(X_test)

print('Confusion Matrix:', confusion_matrix(y_test, y_pred_rf))
print('Accuracy score: ', accuracy_score(y_test, y_pred_rf))
print('Precision score: ', precision_score(y_test, y_pred_rf, average='micro'))
print('Recall score: ', recall_score(y_test, y_pred_rf, average='micro'))
print('F1 score: ', f1_score(y_test, y_pred_rf, average='micro'))

Confusion Matrix: [[23  5  3]
 [ 3 20  2]
 [ 2 10  1]]
Accuracy score:  0.6376811594202898
Precision score:  0.6376811594202898
Recall score:  0.6376811594202898
F1 score:  0.6376811594202898
CPU times: total: 297 ms
Wall time: 475 ms


## XGBoost Algorithm

In [180]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [181]:
%%time
# Train XG Boost model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train)

# predict the test data
y_pred_xgb = xgb.predict(X_test)

print('Confusion Matrix:', confusion_matrix(y_test, y_pred_xgb))
print('Accuracy score: ', accuracy_score(y_test, y_pred_xgb))
print('Precision score: ', precision_score(y_test, y_pred_xgb, average='micro'))
print('Recall score: ', recall_score(y_test, y_pred_xgb, average='micro'))
print('F1 score: ', f1_score(y_test, y_pred_xgb, average='micro'))


Parameters: { "use_label_encoder" } are not used.




Confusion Matrix: [[24  4  3]
 [ 3 21  1]
 [ 3  9  1]]
Accuracy score:  0.6666666666666666
Precision score:  0.6666666666666666
Recall score:  0.6666666666666666
F1 score:  0.6666666666666666
CPU times: total: 766 ms
Wall time: 831 ms


## Let's Compare Decision Tree, Random Forest, and XGBoost in an Interactive Bar Plot

In [168]:
import plotly.graph_objects as go

# Test-set predictions from the three models trained in the cells above.
predictions = {
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
    "XGBoost": y_pred_xgb,
}

# Score every model with the same four metrics: {model name: {metric name: score}}.
results = {
    model_name: {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='micro'),
        "Recall": recall_score(y_test, y_pred, average='micro'),
        "F1 Score": f1_score(y_test, y_pred, average='micro'),
    }
    for model_name, y_pred in predictions.items()
}

model_names = list(results)
metric_names = list(results[model_names[0]])

# Grouped bar chart: one trace per metric, one bar per model within each trace.
fig = go.Figure()

for metric in metric_names:
    scores = [results[name][metric] for name in model_names]
    fig.add_trace(go.Bar(x=list(model_names), y=scores, name=metric))

# Titles, axis labels and grouping mode.
fig.update_layout(
    barmode='group',
    template='plotly',
    title='Model Comparison Metrics',
    xaxis_title='Models',
    yaxis_title='Scores',
    legend_title='Metrics',
)

fig.show()