# Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Loading the Dataset

In [2]:
data = pd.read_csv('gender_classification_v7.csv')

## Display the first few rows of the dataset

In [3]:
data.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


## Summary Statistics

In [4]:
data.describe()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
count,5001.0,5001.0,5001.0,5001.0,5001.0,5001.0,5001.0
mean,0.869626,13.181484,5.946311,0.493901,0.507898,0.493101,0.4989
std,0.336748,1.107128,0.541268,0.500013,0.499988,0.500002,0.500049
min,0.0,11.4,5.1,0.0,0.0,0.0,0.0
25%,1.0,12.2,5.5,0.0,0.0,0.0,0.0
50%,1.0,13.1,5.9,0.0,1.0,0.0,0.0
75%,1.0,14.0,6.4,1.0,1.0,1.0,1.0
max,1.0,15.5,7.1,1.0,1.0,1.0,1.0


## Data types and Missing values

In [5]:
print(data.info())
print(data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB
None
long_hair                    0
forehead_width_cm            0
forehead_height_cm           0
nose_wide                    0
nose_long                    0
lips_thin                    0
distance_nose_to_lip_long    0
gender                       0
dtype: int64


In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
le = LabelEncoder()

data['gender'] = le.fit_transform(data['gender'])

print(data)

      long_hair  forehead_width_cm  forehead_height_cm  nose_wide  nose_long  \
0             1               11.8                 6.1          1          0   
1             0               14.0                 5.4          0          0   
2             0               11.8                 6.3          1          1   
3             0               14.4                 6.1          0          1   
4             1               13.5                 5.9          0          0   
...         ...                ...                 ...        ...        ...   
4996          1               13.6                 5.1          0          0   
4997          1               11.9                 5.4          0          0   
4998          1               12.9                 5.7          0          0   
4999          1               13.2                 6.2          0          0   
5000          1               15.4                 5.4          1          1   

      lips_thin  distance_nose_to_lip_l

## Handling Missing Values

In [8]:
data.fillna(data.median())

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,1
1,0,14.0,5.4,0,0,1,0,0
2,0,11.8,6.3,1,1,1,1,1
3,0,14.4,6.1,0,1,1,1,1
4,1,13.5,5.9,0,0,0,0,0
...,...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0,0
4997,1,11.9,5.4,0,0,0,0,0
4998,1,12.9,5.7,0,0,0,0,0
4999,1,13.2,6.2,0,0,0,0,0


## Splitting the Data

In [9]:
x = data.drop(columns = ['gender']) #input

y = data['gender'] #output

## Standardize the Feature

In [10]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Training the Data

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Training the Model

## Training the Logistic Model

In [12]:
model = LogisticRegression()

model.fit(x_train, y_train)

## Making the Prediction

In [13]:
y_pred = model.predict(x_test)

# Evaluating the Performance Metrics

In [14]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.96
Precision: 0.97
Recall: 0.95
F1 Score: 0.96


# Model Implementation

In [18]:
new_data_point = pd.DataFrame([[1, 13.5, 5.9, 0, 0, 0, 0]], columns=['long_hair', 'forehead_width_cm', 'forehead_height_cm', 'nose_wide', 'nose_long', 'lips_thin', 'distance_nose_to_lip_long'])
new_data_point_scaled = scaler.transform(new_data_point)
new_prediction = model.predict(new_data_point_scaled)
if new_prediction[0] == 1:
    print("Predicted Gender: Male")
else:
    print("Predicted Gender: Female")

Predicted Gender: Female


