In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the exact dataset path is visible before loading it.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

/kaggle/input/gender-classification-dataset/gender_classification_v7.csv


# Introduction
In this notebook, I explore a classification problem using several supervised machine learning models.
The goal is to predict the gender of individuals from facial features such as nose shape, lip thickness, and forehead dimensions.

After performing data cleaning and preprocessing, I applied the following models:

- Random Forest Classifier

- K-Nearest Neighbors (KNN)

For each model, I include a short definition, the training process, and an evaluation of its accuracy on the held-out test set.


# 1. Importing Libraries & Loading Data

In [2]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


In [3]:
# Load the gender-classification CSV attached to this notebook into a DataFrame.
dataset_path = '/kaggle/input/gender-classification-dataset/gender_classification_v7.csv'
gender_df = pd.read_csv(dataset_path)


# 2. EDA (Exploratory Data Analysis)

In [4]:
# Preview the first five rows to see the feature columns and the raw target labels.
gender_df.head(n=5)

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


In [5]:
# Print a summary of the frame: row count, column names, non-null counts and dtypes.
gender_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


**- There are no null values in any column, so no missing-value handling is needed.**

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
gender_df.describe()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
count,5001.0,5001.0,5001.0,5001.0,5001.0,5001.0,5001.0
mean,0.869626,13.181484,5.946311,0.493901,0.507898,0.493101,0.4989
std,0.336748,1.107128,0.541268,0.500013,0.499988,0.500002,0.500049
min,0.0,11.4,5.1,0.0,0.0,0.0,0.0
25%,1.0,12.2,5.5,0.0,0.0,0.0,0.0
50%,1.0,13.1,5.9,0.0,1.0,0.0,0.0
75%,1.0,14.0,6.4,1.0,1.0,1.0,1.0
max,1.0,15.5,7.1,1.0,1.0,1.0,1.0


In [7]:
# Show the dtype of every column; the target 'gender' is still a string (object) here.
gender_df.dtypes

long_hair                      int64
forehead_width_cm            float64
forehead_height_cm           float64
nose_wide                      int64
nose_long                      int64
lips_thin                      int64
distance_nose_to_lip_long      int64
gender                        object
dtype: object

- The target column "gender" has data type "object" so we need to encode it.

# 3. Preprocessing

- Encoding the target column as:
  - 0 : Male
  - 1 : Female

In [8]:
# List the distinct labels present in the target column before encoding them.
gender_labels = gender_df['gender'].unique()
print(gender_labels)


['Male' 'Female']


In [9]:
# Encode the target column as integers: Male -> 0, Female -> 1.
GENDER_CODES = {'Male': 0, 'Female': 1}

# Guard against re-running this cell: once the column already holds only 0/1,
# mapping it again would find no matching keys and turn every label into NaN.
if not gender_df['gender'].isin(list(GENDER_CODES.values())).all():
    encoded_gender = gender_df['gender'].map(GENDER_CODES)
    # .map() yields NaN for any label missing from the dict; fail loudly instead
    # of silently handing NaN targets to the models.
    unmapped_labels = gender_df.loc[encoded_gender.isna(), 'gender'].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(
            f"Unexpected labels in 'gender' column: {list(unmapped_labels)!r}"
        )
    gender_df['gender'] = encoded_gender.astype('int64')

In [10]:
# Confirm that the encoded target column is now an integer dtype.
gender_df.dtypes['gender']


dtype('int64')

- Feature Selection: 

In [11]:
# Pairwise Pearson correlations between all columns, including the encoded target,
# used to judge how strongly each feature relates to 'gender'.
gender_df.corr(method='pearson')

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
long_hair,1.0,-0.00653,-0.017233,0.001216,0.014432,0.011287,-0.025794,0.010767
forehead_width_cm,-0.00653,1.0,0.088596,0.251648,0.257368,0.258564,0.251328,-0.334125
forehead_height_cm,-0.017233,0.088596,1.0,0.211655,0.19412,0.205441,0.215292,-0.27719
nose_wide,0.001216,0.251648,0.211655,1.0,0.565192,0.557615,0.569303,-0.758502
nose_long,0.014432,0.257368,0.19412,0.565192,1.0,0.561229,0.559794,-0.744147
lips_thin,0.011287,0.258564,0.205441,0.557615,0.561229,1.0,0.565312,-0.743319
distance_nose_to_lip_long,-0.025794,0.251328,0.215292,0.569303,0.559794,0.565312,1.0,-0.75485
gender,0.010767,-0.334125,-0.27719,-0.758502,-0.744147,-0.743319,-0.75485,1.0


**- Train Test Split:**

In [12]:
# Predictor columns fed to the models; the target 'gender' is kept separate.
features = [
    'nose_wide',
    'nose_long',
    'lips_thin',
    'distance_nose_to_lip_long',
    'long_hair',
    'forehead_width_cm',
    'forehead_height_cm',
]

X = gender_df.loc[:, features]
y = gender_df.loc[:, 'gender']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Modeling

# 4.1. Random Forest
Before understanding Random Forest, we start with a **Decision Tree**:
a model that splits data into branches based on feature values, leading to a prediction at the leaves.

**Random Forest** is an ensemble of many decision trees.
Each tree votes, and the final prediction is based on the majority vote.

In [13]:
# 1) Training: fit() returns the fitted estimator, so construction and fitting
#    are chained. The fixed random_state gives the same forest on every run.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# 2) Predicting: label the held-out test rows.
y_pred = rfc.predict(X_test)

In [14]:
# 3) Evaluation: share of test rows whose predicted label matches the true label.
rfc_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", rfc_accuracy)

Accuracy: 0.961038961038961


# 4.2.K-Nearest Neighbors (KNN)
A simple model where we choose a number k (usually odd, to avoid tied votes between two classes), representing the **nearest neighbors** to the instance we want to predict.
The model then takes a **vote** among those neighbors, and the majority class becomes the predicted label.

**Scaling**: 
Since KNN is distance-based and therefore sensitive to feature scale, we standardize the data first.

In [15]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits so the test data does not influence the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# 1) Training: a 7-neighbour classifier fitted on the scaled training features.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled, y_train)

# 2) Prediction: label the scaled test rows (overwrites the earlier y_pred).
y_pred = knn.predict(X_test_scaled)

In [17]:
# 3) Evaluation: share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9630369630369631
