In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset from the CSV file that sits next to this notebook.
csv_path = 'Gender.csv'
data = pd.read_csv(csv_path)

# Exploratory Data Analysis

In [3]:
# Preview the first five rows to check that the file parsed as expected.
data.head(n=5)

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(5001, 8)

In [5]:
# Column labels of the loaded frame.
data.columns

Index(['long_hair', 'forehead_width_cm', 'forehead_height_cm', 'nose_wide',
       'nose_long', 'lips_thin', 'distance_nose_to_lip_long', 'gender'],
      dtype='object')

In [6]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


In [7]:
# Summary statistics of the numeric columns, transposed so that each feature occupies one row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
long_hair,5001.0,0.869626,0.336748,0.0,1.0,1.0,1.0,1.0
forehead_width_cm,5001.0,13.181484,1.107128,11.4,12.2,13.1,14.0,15.5
forehead_height_cm,5001.0,5.946311,0.541268,5.1,5.5,5.9,6.4,7.1
nose_wide,5001.0,0.493901,0.500013,0.0,0.0,0.0,1.0,1.0
nose_long,5001.0,0.507898,0.499988,0.0,0.0,1.0,1.0,1.0
lips_thin,5001.0,0.493101,0.500002,0.0,0.0,0.0,1.0,1.0
distance_nose_to_lip_long,5001.0,0.4989,0.500049,0.0,0.0,0.0,1.0,1.0


In [8]:
# How many rows fall into each gender class.
data['gender'].value_counts()

Female    2501
Male      2500
Name: gender, dtype: int64

# Data Preprocessing

### Categorical Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encoder instance; it is fitted in a later cell.
lbn = LabelEncoder()

In [10]:
# Replace the string labels in `gender` with integer codes.
#
# The guard keeps this cell idempotent: once the column is numeric, fitting the
# encoder again would overwrite its learned class names with the integer codes
# themselves, so the fit is skipped when the cell is re-run.
if not pd.api.types.is_numeric_dtype(data['gender']):
    data['gender'] = lbn.fit_transform(data['gender'])

In [11]:
# Class counts of the `gender` column after encoding.
data['gender'].value_counts()

0    2501
1    2500
Name: gender, dtype: int64

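`LabelEncoder` numbers the classes in sorted order, so the encoded `gender` values map back to the original labels as follows:

0 --> Female

1 --> Male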

In [12]:
# True if at least one cell anywhere in the frame is missing.
data.isna().to_numpy().any()

False

### Train & Test Split

In [14]:
# Features are every column except `gender`; `gender` itself is the target.
x, y = data.drop(columns='gender'), data['gender']

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

# Model Building

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with every hyperparameter left at the library default.
model = LogisticRegression()

In [16]:
# Fit the classifier on the training portion only.
model.fit(X=x_train, y=y_train)

In [17]:
# Mean accuracy of the fitted classifier on the held-out rows.
model.score(X=x_test, y=y_test)

0.9626915389740173

In [19]:
# Predicted class for every held-out row; echoed below as a quick sanity check.
y_pred = model.predict(X=x_test)
y_pred

array([0, 1, 0, ..., 1, 0, 0])

In [20]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Evaluate the held-out predictions with each metric in turn. Every scorer takes
# (true labels, predicted labels), so the four results can be unpacked in one pass.
accuracy, precision, recall, f1score = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [21]:
# Counts of true class (rows) against predicted class (columns) on the held-out rows.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[722,  29],
       [ 27, 723]], dtype=int64)

In [25]:
# Report each metric as a percentage with two decimals, one metric per line.
print(
    'Accuracy : {:.2f}%'.format(accuracy*100),
    'Precision : {:.2f}%'.format(precision*100),
    'Recall : {:.2f}%'.format(recall*100),
    'F1 Score : {:.2f}%'.format(f1score*100),
    sep='\n',
)

Accuracy : 96.27%
Precision : 96.14%
Recall : 96.40%
F1 Score : 96.27%


In [29]:
# Predictive System
# Classify a single hand-entered sample with the fitted model.

# Feature values, listed in the same order as the columns of `x`.
inputs = [1, 13.5, 5.9, 0, 0, 0, 0]

# Wrap the sample in a one-row DataFrame that carries the training column names.
# The model was fitted on a DataFrame, so a bare ndarray would make scikit-learn
# warn that the input has no valid feature names.
input_data = pd.DataFrame([inputs], columns=x.columns)

prediction = model.predict(input_data)
print(prediction)

# Test the single predicted code rather than the whole array: the truth value of
# an array comparison is only defined when the array has exactly one element.
# Codes: 0 is Female, 1 is Male.
if prediction[0] == 0:
    print('Female')
else:
    print('Male')

[0]
Female


