# **Machine Learning**
# Decoding Categorical Features

In [36]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [37]:
# Lift pandas' row and column display limits so printed frames are never truncated
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [38]:
# Fetch the "penguins" example dataset through seaborn
df = sns.load_dataset(name="penguins")
# Preview the first five rows
print(df.head(n=5))

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [39]:
# Count the missing values in every column and list the most incomplete columns first
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False))

sex                  11
bill_depth_mm         2
bill_length_mm        2
flipper_length_mm     2
body_mass_g           2
island                0
species               0
dtype: int64


In [40]:
# Categorical column: impute with the most frequent value (the mode)
most_common_sex = df["sex"].mode()[0]
df["sex"] = df["sex"].fillna(most_common_sex)
# Numeric measurement columns: impute each one with its own column mean
for numeric_col in ("bill_depth_mm", "bill_length_mm", "body_mass_g", "flipper_length_mm"):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())
# Re-run the missing-value report to confirm nothing is left to fill
print(df.isna().sum().sort_values(ascending=False))

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [41]:
# One LabelEncoder per column: each keeps its own fitted mapping,
# which the decoding cell needs for inverse_transform
le_sex, le_species, le_island = LabelEncoder(), LabelEncoder(), LabelEncoder()
# Replace every categorical column with the integer codes from its encoder
for col_name, label_enc in (("sex", le_sex), ("species", le_species), ("island", le_island)):
    df[col_name] = label_enc.fit_transform(df[col_name])
# Preview the encoded dataset
print(df.head())
print("Categorical Features Encoded Successfully!")

   species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0        0       2        39.10000       18.70000         181.000000   
1        0       2        39.50000       17.40000         186.000000   
2        0       2        40.30000       18.00000         195.000000   
3        0       2        43.92193       17.15117         200.915205   
4        0       2        36.70000       19.30000         193.000000   

   body_mass_g  sex  
0  3750.000000    1  
1  3800.000000    0  
2  3250.000000    0  
3  4201.754386    1  
4  3450.000000    0  
Categorical Features Encoded Successfully!


In [42]:
# Map the integer codes back to their original labels using the fitted encoders
for col_name, label_enc in (("sex", le_sex), ("species", le_species), ("island", le_island)):
    df[col_name] = label_enc.inverse_transform(df[col_name])
# Preview the dataset after decoding
print(df.head())
print("Categorical Features Decoded Successfully!")


  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen        39.10000       18.70000         181.000000   
1  Adelie  Torgersen        39.50000       17.40000         186.000000   
2  Adelie  Torgersen        40.30000       18.00000         195.000000   
3  Adelie  Torgersen        43.92193       17.15117         200.915205   
4  Adelie  Torgersen        36.70000       19.30000         193.000000   

   body_mass_g     sex  
0  3750.000000    Male  
1  3800.000000  Female  
2  3250.000000  Female  
3  4201.754386    Male  
4  3450.000000  Female  
Categorical Features Decoded Successfully!


## Ordinal Encoder

In [43]:
# Start again from a fresh copy of the penguins dataset
df = sns.load_dataset("penguins")
print(df.head())
# Report the missing values per column before imputation
print(df.isna().sum().sort_values(ascending=False))
# Categorical column: impute with the most frequent value (the mode)
most_common_sex = df["sex"].mode()[0]
df["sex"] = df["sex"].fillna(most_common_sex)
# Numeric measurement columns: impute each one with its own column mean
for numeric_col in ("bill_depth_mm", "bill_length_mm", "body_mass_g", "flipper_length_mm"):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())
# Report again to confirm nothing is left to fill
print("After filling missing values:")
print(df.isna().sum().sort_values(ascending=False))

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  
sex                  11
bill_depth_mm         2
bill_length_mm        2
flipper_length_mm     2
body_mass_g           2
island                0
species               0
dtype: int64
After filling missing values:
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [45]:
# One OrdinalEncoder per column: each keeps its own fitted mapping,
# which the decoding cell needs for inverse_transform
ole_sex, ole_species, ole_island = OrdinalEncoder(), OrdinalEncoder(), OrdinalEncoder()
# OrdinalEncoder expects 2-D input, hence the double brackets (a one-column DataFrame)
for col_name, ordinal_enc in (("sex", ole_sex), ("species", ole_species), ("island", ole_island)):
    df[col_name] = ordinal_enc.fit_transform(df[[col_name]])
# Preview the encoded dataset
print(df.head())
print("Categorical Features Encoded Successfully!")

   species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0      0.0     2.0        39.10000       18.70000         181.000000   
1      0.0     2.0        39.50000       17.40000         186.000000   
2      0.0     2.0        40.30000       18.00000         195.000000   
3      0.0     2.0        43.92193       17.15117         200.915205   
4      0.0     2.0        36.70000       19.30000         193.000000   

   body_mass_g  sex  
0  3750.000000  1.0  
1  3800.000000  0.0  
2  3250.000000  0.0  
3  4201.754386  1.0  
4  3450.000000  0.0  
Categorical Features Encoded Successfully!


In [50]:
# Map the ordinal codes back to their original labels using the fitted encoders.
# inverse_transform returns a 2-D array, so ravel() flattens it into a single column.
for col_name, ordinal_enc in (("sex", ole_sex), ("species", ole_species), ("island", ole_island)):
    df[col_name] = ordinal_enc.inverse_transform(df[[col_name]]).ravel()
# Preview the dataset after decoding
print(df.head())
print("Categorical Features Decoded Successfully!")

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen        39.10000       18.70000         181.000000   
1  Adelie  Torgersen        39.50000       17.40000         186.000000   
2  Adelie  Torgersen        40.30000       18.00000         195.000000   
3  Adelie  Torgersen        43.92193       17.15117         200.915205   
4  Adelie  Torgersen        36.70000       19.30000         193.000000   

   body_mass_g     sex  
0  3750.000000    Male  
1  3800.000000  Female  
2  3250.000000  Female  
3  4201.754386    Male  
4  3450.000000  Female  
Categorical Features Decoded Successfully!


## One Hot Encoder

In [54]:
# Load the Titanic dataset using the Seaborn library
df = sns.load_dataset('titanic')

# Categorical columns to one-hot encode
cat_columns = ['sex', 'embarked']

# sparse_output=False returns a dense array that can go straight into a DataFrame
encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder and transform the categorical columns.
# Missing values in 'embarked' are encoded as a category of their own,
# which is why an 'embarked_nan' column appears in the result.
encoded_data = encoder.fit_transform(df[cat_columns])

# Map each original column to the categories the encoder learned for it
original_categories = dict(zip(cat_columns, encoder.categories_))

# Let the encoder produce the output column names ("<column>_<category>",
# e.g. "sex_male") so they always match the order of the encoded columns
feature_names = list(encoder.get_feature_names_out(cat_columns))

# Reuse the original index: pd.concat(axis=1) aligns rows on index labels, so a
# fresh 0..n-1 index would misalign rows whenever df's index is not the default
encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=df.index)

# Append the encoded columns to the original DataFrame.
# The original categorical columns are kept; drop `cat_columns` afterwards
# if only the encoded representation is wanted.
df = pd.concat([df, encoded_df], axis=1)

# Display the first few rows of the updated DataFrame
print(df.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  sex_female  sex_male  \
0    man        True  NaN  Southampton    no  False         0.0       1.0   
1  woman       False    C    Cherbourg   yes  False         1.0       0.0   
2  woman       False  NaN  Southampton   yes   True         1.0       0.0   
3  woman       False    C  Southampton   yes  False         1.0       0.0   
4    man        True  NaN  Southampton    no   True         0.0       1.0   

   embarked_C  embarked_Q  embarked_S  embarked_nan  
0         0.0         0.0 