In [178]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

## Introduction

We are embarking on a project to ascertain the gender of individuals with calcified names, utilizing data entries from customer service at El-Araby Group. To safeguard privacy, personal information such as phone numbers has been expunged from the dataset.

In this project, the aim is to locate a suitable dataset and implement a machine learning model capable of accurately classifying the gender associated with each name. The selected algorithm for this classification task is the *Random Forest Algorithm*. Following this, the developed model will be applied to the anonymized dataset to predict the gender for each entry.

**Objectives:**
1. Clean the dataset by eliminating any potentially corrupting elements.
2. Develop a machine learning algorithm capable of accurately predicting gender based on the provided names.

We will proceed with these objectives to construct a reliable model for gender classification while ensuring the integrity and privacy of the data.


### Import  all the Dataframes I will use for this project

In [179]:
# This is the dataset in which I want to predict
freezed_df = pd.read_csv("FreezedNames_final - FreezedNames.csv")
# These two are different dataset that I will merge together and clean them both and finally train the model on
train_df = pd.read_csv("train_data.csv")
train_df2 = pd.read_csv("arabic_names - arabic_names (1).csv")


### Analyze Train_df2

In [180]:
train_df.sample(10)

Unnamed: 0,Names,Gender
1033,ايلين,F
1256,حفيظ,M
615,كتشنر,M
1722,عبدرياض,M
5916,عبوده,M
1358,فيلوباتير,M
2494,معبدصابر,M
4346,جوفانا,F
4275,قرواني,M
3371,نسان,M


I already cleaned the data on Excel and SQL 

In [181]:
train_df.describe()

Unnamed: 0,Names,Gender
count,6294,6296
unique,6283,2
top,اكرم,M
freq,2,4776


In [182]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6296 entries, 0 to 6295
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Names   6294 non-null   object
 1   Gender  6296 non-null   object
dtypes: object(2)
memory usage: 98.5+ KB


In [183]:
print(train_df["Names"].isna().sum())
print(train_df["Gender"].isna().sum())

2
0


### Remove the Null values

In [184]:
train_df = train_df[(~train_df["Names"].isna())].reset_index()

train_df["Names"].isna().sum()

0

Do the same cleaning Process on the train_df2

In [185]:
train_df2.sample(10)

Unnamed: 0,arabic_name,gender
1314,فارع,M
386,رابعة,F
289,إنعام,F
399,رجاء,F
238,أثيل,F
694,نداء,F
1221,عبدو,M
405,ردينة,F
1212,عبدالمنان,M
1439,مرشدي,M


In [186]:
train_df2.describe()

Unnamed: 0,arabic_name,gender
count,1611,1611
unique,1482,2
top,فرح,M
freq,3,980


In [187]:
train_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   arabic_name  1611 non-null   object
 1   gender       1611 non-null   object
dtypes: object(2)
memory usage: 25.3+ KB


In [188]:
print(train_df2["arabic_name"].isna().sum())
print(train_df2["gender"].isna().sum())

0
0


### Merge the Two Datasets into one 

In [189]:
train_df2 = train_df2.rename(columns = {"arabic_name" : "Names" , 'gender': 'Gender'})

In [190]:
merged_df = pd.concat([train_df, train_df2], ignore_index=True)

# Print the result
merged_df.tail(10)

Unnamed: 0,index,Names,Gender
7895,,ولاء الدين,M
7896,,وليد,M
7897,,وليف,M
7898,,وهاب,M
7899,,وهب,M
7900,,وهبة,M
7901,,ياسر,M
7902,,يحيى,M
7903,,يزيد,M
7904,,يعقوب,M


In [191]:
merged_df.isna().sum()

index     1611
Names        0
Gender       0
dtype: int64

In [192]:
train_df = merged_df

### Some names are first and last name and we only need the first name to know the gender 
### So. we need to extract the first name only.

In [193]:
def Split(x):
    if(type(x) == float):
        print(x)
    return x.split("\xa0")[-1]

test = train_df["Names"].apply(Split)
train_df["Names"] = pd.DataFrame(test)

### Find the longest Name

- Apply the len function on the Names column 
- Sort the Data Frame by length
- Display the longest names

In [194]:

length = train_df["Names"].apply(len)

train_df['Length'] = length

sorted_train_df = train_df.sort_values(by = 'Length' , ascending= True)

In [195]:
display(sorted_train_df[sorted_train_df['Length'] <= 2])

Unnamed: 0,index,Names,Gender,Length
3820,3821.0,ه,M,1
2330,2331.0,ث,M,1
2344,2345.0,ع,M,1
315,315.0,ل,M,1
2667,2668.0,ح,M,1
...,...,...,...,...
737,737.0,نم,M,2
3129,3130.0,مل,M,2
5437,5439.0,ود,F,2
665,665.0,هي,F,2


There are many enteries where the name consists of one or two letter which cannot be true for the one letter and very rare for two letters so we need to remove them

In [196]:
#Remove the enteries where the length the is 1 letter
sorted_train_df = sorted_train_df[sorted_train_df['Length'] > 2]
sorted_train_df.head(10)

Unnamed: 0,index,Names,Gender,Length
7080,,أسد,M,3
2110,2111.0,ديك,M,3
2332,2333.0,تقا,F,3
2111,2112.0,سهر,F,3
2338,2339.0,شدا,F,3
2361,2362.0,هوي,M,3
7059,,آدم,M,3
2157,2158.0,حست,M,3
2142,2143.0,هيم,M,3
2345,2346.0,همس,F,3


In [197]:
train_df = sorted_train_df[["Names" , "Gender"]].reset_index(drop=True)

In [198]:
#view the train_df
train_df.head()

Unnamed: 0,Names,Gender
0,أسد,M
1,ديك,M
2,تقا,F
3,سهر,F
4,شدا,F


# Prerpocess the freezed_df

In [199]:
#first upload the dataset for the freezed names and show a smaple of it
freezed_df.head()

Unnamed: 0,Name
0,​انور ناجى شامخ
1,​وجدي عزت متري
2,• محمود عطيه حسانين
3,1000553724
4,1001146286


there is a problem with the data where there are english and arabic names as well as errors entering the data and finally there are numbers in the name field 

since the data we need to find it all about strings we need to drop the data that has only numbers

In [200]:
#find the number of numeric cells
numeric_mask = pd.to_numeric(freezed_df['Name'], errors='coerce').notnull()
print(len(freezed_df[numeric_mask]))
freezed_df[numeric_mask].head()

1450


Unnamed: 0,Name
3,1000553724
4,1001146286
5,1001602047
6,1003100560
7,1003252205


In [201]:
#remove all the numeric values from the data set
freezed_df = freezed_df[~numeric_mask].reset_index(drop = True)
freezed_df.sample(5, random_state = 42)


Unnamed: 0,Name
92257,سامه نبيل محمد
196530,ولبيد ربيع رضوان
110790,روفيدا سليمان علي
45255,ايميل اديب تكله
170057,قمر فرغلي حسين


### 1- remove all the special charaters

In [202]:
import re

test = freezed_df.copy()
test["Name"] = test["Name"].str.replace(r"[@#$%^&*()_+{}<>?/|•٠1234567890.`]", "")
freezed_df = test


In [203]:
#check for null values
freezed_df.isna().sum()

Name    3
dtype: int64

In [204]:
freezed_df.dropna(inplace=True)
freezed_df.isna().sum()

Name    0
dtype: int64

### 2 -  here is the data where there are english and arabic names 

In [205]:
display(freezed_df.iloc[145:165])

Unnamed: 0,Name
145,LYNN MARIE
146,lمحمد ابراهيم حافظ سالم
147,lمحمد جمال الدين حسين
148,lمحمد حلمى المهدى
149,lمحمد صابر مصطفي
150,lمحمد عبد القادر
151,lمحمد عبداله
152,lمحمد فهمى
153,lمحمد يحيي ابراهيم
154,Lمحمد يحيى دسوقى


Remove the English letters if the name has arabic letters in it

In [206]:
import re
pattern = r'[A-Za-z]'
test = freezed_df.copy()
# test = test.to_dict()
pattern = r'[A-Za-z]'

def remove_english_letters(text):
  if re.search(r'[\u0600-\u06FF]', text):
    return re.sub(pattern, '', text)
  else:
    return text


test["Name"] = test["Name"].apply(remove_english_letters)

    
freezed_names = test
display(test.iloc[145:165])

Unnamed: 0,Name
145,LYNN MARIE
146,محمد ابراهيم حافظ سالم
147,محمد جمال الدين حسين
148,محمد حلمى المهدى
149,محمد صابر مصطفي
150,محمد عبد القادر
151,محمد عبداله
152,محمد فهمى
153,محمد يحيي ابراهيم
154,محمد يحيى دسوقى


next I am going to make a new column called language where it is going to assign 1 when it has english and 0 for arabic 
to be able to group the data depending on the being written in english or arabic

0: arabic <br>
1: english


In [207]:
def which_lan(text):
  if re.search(r'[\u0600-\u06FF]', text):
    return 0
  else:
    return 1


lang = []
for row in freezed_df["Name"]:
    lang.append(which_lan(row))
freezed_df = test
freezed_df = freezed_df.assign(lang = lang)
freezed_df.iloc[145:165]

Unnamed: 0,Name,lang
145,LYNN MARIE,1
146,محمد ابراهيم حافظ سالم,0
147,محمد جمال الدين حسين,0
148,محمد حلمى المهدى,0
149,محمد صابر مصطفي,0
150,محمد عبد القادر,0
151,محمد عبداله,0
152,محمد فهمى,0
153,محمد يحيي ابراهيم,0
154,محمد يحيى دسوقى,0


In [208]:
def remove_leading_spaces(name):
  return name.lstrip("\s")

freezed_df["Name"] = freezed_df["Name"].apply(remove_leading_spaces)

display(freezed_df.head())

Unnamed: 0,Name,lang
0,​انور ناجى شامخ,0
1,​وجدي عزت متري,0
2,• محمود عطيه حسانين,0
3,01150455008 سماح محمد عبد الحافظ,0
4,01212277209 صباح عبد الشافى محمد ح,0


In [209]:
#there are many entries where it consists of only a space

freezed_df = freezed_df[~(test["Name"].str.isspace())]

In [210]:
display(freezed_df.iloc[1510 :1520])

Unnamed: 0,Name,lang
1510,nagui nosseir,1
1512,​انطونيو وهبه حكيم,0
1513,0 0,1
1514,01092657150 الحسينى احمد ابو العزم,0
1515,01099244265محمود حسن عبد المجيد,0
1516,01125404015ابو الحسن عبد الرحيم الصاوى,0
1517,01285431382 هشام على فتحى,0
1518,0سيد عبد التواب عيد,0
1519,1) محمد عبدالله,0
1520,1أيهاب احمد,0


In [211]:
def get_first_name(name):
  filtered_array = list(filter(bool, name.split(" ")))
  try:
        return filtered_array[0]
  except IndexError :
    print(name , freezed_df.loc[name])
  


first_names =  freezed_df["Name"].apply(get_first_name)

test = freezed_df.copy()
test  = test.assign(first_names = first_names)
freezed_df = test
freezed_df.head()

Unnamed: 0,Name,lang,first_names
0,​انور ناجى شامخ,0,​انور
1,​وجدي عزت متري,0,​وجدي
2,• محمود عطيه حسانين,0,•
3,01150455008 سماح محمد عبد الحافظ,0,01150455008
4,01212277209 صباح عبد الشافى محمد ح,0,01212277209


In [212]:
inconsistency_mapper = {
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ة": "ه",
    "ى": "ي",
    "ـ": "",
    " الدين": "الدين",
    "عبد ": "عبد",
    "ابو ": "ابو",
    "ام ": "ام",
    '.' : '',
    'ؤ' : "و"
}



In [213]:
freezed_df['length'] = freezed_df['first_names'].apply(len)
freezed_df.head()

Unnamed: 0,Name,lang,first_names,length
0,​انور ناجى شامخ,0,​انور,5
1,​وجدي عزت متري,0,​وجدي,5
2,• محمود عطيه حسانين,0,•,1
3,01150455008 سماح محمد عبد الحافظ,0,01150455008,11
4,01212277209 صباح عبد الشافى محمد ح,0,01212277209,11


In [214]:
freezed_df['first_names'] = freezed_df['first_names'].replace(inconsistency_mapper)

In [215]:
freezed_df 

Unnamed: 0,Name,lang,first_names,length
0,​انور ناجى شامخ,0,​انور,5
1,​وجدي عزت متري,0,​وجدي,5
2,• محمود عطيه حسانين,0,•,1
3,01150455008 سماح محمد عبد الحافظ,0,01150455008,11
4,01212277209 صباح عبد الشافى محمد ح,0,01212277209,11
...,...,...,...,...
205835,هنام مصطفي عبد العزيز,0,هنام,4
205836,هناى عزب,0,هناى,4
205837,هناى محمد الجيار,0,هناى,4
205838,يسرب علي محمد,0,يسرب,4


### Make a table for english and arabic written names

In [216]:
arabic_names = freezed_df[freezed_df["lang"] ==0]
english_names = freezed_df[freezed_df["lang"] ==1]


In [217]:
gourped_arabic = arabic_names.groupby("first_names").size().to_frame(name="count").reset_index()
gourped_arabic = gourped_arabic.sort_values(by=["count"], ascending=False)
gourped_arabic.head(10)

Unnamed: 0,first_names,count
2067,اسلام,2629
34105,نادى,1944
34110,نادي,1304
30182,محمد,1058
34656,نجاة,1053
27288,قمر,967
11354,جمال,932
5997,المهندس,731
22546,ع,635
1282,احمد,614


In [218]:
Freezed_arabic_first = arabic_names[["first_names" , 'length']]
Freezed_english_first = english_names[["first_names" , 'length']]

In [219]:
Freezed_arabic_first.iloc[0][0]

'\u200bانور'

There are some enteries which has irregular expressions or zero-width space 

In [220]:
import re

Freezed_arabic_first['first_names'] = Freezed_arabic_first['first_names'].str.replace(r"[@#$%^&*()_+{}<>?/|•٠1234567890.` ِ\u200bc ]", "")

Freezed_arabic_first['first_names'] = Freezed_arabic_first['first_names'].replace("" , np.nan).dropna()

any(element == "" for element in Freezed_arabic_first)

False

There are names which are very long and not suitable to be made into the model because no spaces between them.

In [221]:
Freezed_arabic_first

Unnamed: 0,first_names,length
0,​انور,5
1,​وجدي,5
2,•,1
3,01150455008,11
4,01212277209,11
...,...,...
205835,هنام,4
205836,هناى,4
205837,هناى,4
205838,يسرب,4


In [222]:
sorted_Freezed_arabic_first = Freezed_arabic_first.sort_values(by = 'length', ascending= False)
sorted_Freezed_arabic_first.head()

Unnamed: 0,first_names,length
54279,المصريةالعربيةللاستثماروالتنميةالعق,35
58294,المصريةالاوربيةللاستثمارالانمائ-واي,35
51502,العالميةلتجارةالجملةوالتجزئةللاجهزة,35
146640,مؤسسةالجادلتجارةالأجهزةالكهربائية,33
154750,نادرلصيانةوتجارةالاجهزة-نادرجلال,32


In [223]:
mask = ((Freezed_arabic_first['length'] <=14 ) & (Freezed_arabic_first['length'] >2 ))
Freezed_arabic_first = Freezed_arabic_first[mask]

In [224]:
sorted_Freezed_arabic_first = Freezed_arabic_first.sort_values(by = 'length', ascending= False)
sorted_Freezed_arabic_first.head()

Unnamed: 0,first_names,length
33530,اسلام على محمد,14
99102,رأفت.نسيم.هاشم,14
163470,محمودعبدالدايم,14
163469,محمودعبدالحميد,14
121635,صابرمحمدالعسوى,14


# Train the machine learning algorithm for arabic names

### Encode the train_df to fit inside the machine learning models

In [225]:
train_data = train_df
train_data

Unnamed: 0,Names,Gender
0,أسد,M
1,ديك,M
2,تقا,F
3,سهر,F
4,شدا,F
...,...,...
7828,السيدمحمدمكين,M
7829,محمدعبدالباقي,M
7830,محمدعبدالمقصود,M
7831,ناصرعبدالموجود,M


### Encoding the values to be numrical instead of string for the model 

In [226]:
inconsistency_mapper = {
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ة": "ه",
    "ى": "ي",
    "ـ": "",
    " الدين": "الدين",
    "عبد ": "عبد",
    "ابو ": "ابو",
    "ام ": "ام",
    '.' : '',
    'ؤ' : "و",
    '-': '',
    'ئ': 'ي',
    "'ُ" : '' 
}

In [227]:

# Make a function the removes inconsistencies in a string

def remove_inconsistencies(input_string):
    # Iterate through the inconsistency_mapper and replace each key with its corresponding value
    for key, value in inconsistency_mapper.items():
        input_string = input_string.replace(key, value)
    
    return input_string

def remove_non_arabic(input_string):
    # Use regex to remove anything that is not an Arabic letter
    arabic_letters = re.sub(r'[^اأإءآبتثجحخدذرزسشصضطظعغفقكلمنهوي]', '', input_string)
    return arabic_letters



In [228]:
train_data['Names'] = train_data['Names'].apply(remove_inconsistencies)
train_data['Names'] = train_data['Names'].apply(remove_non_arabic)
train_data.head()

Unnamed: 0,Names,Gender
0,اسد,M
1,ديك,M
2,تقا,F
3,سهر,F
4,شدا,F


In [229]:
df = train_data

# Split the 'word' column into individual letters and create new columns
df = df['Names'].apply(lambda x: pd.Series(list(x)))

# Rename the columns to have meaningful names if needed
df.columns = [f'letter_{i+1}' for i in range(df.shape[1])]

# Display the resulting DataFrame
display(df)

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,letter_6,letter_7,letter_8,letter_9,letter_10,letter_11,letter_12,letter_13,letter_14,letter_15,letter_16,letter_17
0,ا,س,د,,,,,,,,,,,,,,
1,د,ي,ك,,,,,,,,,,,,,,
2,ت,ق,ا,,,,,,,,,,,,,,
3,س,ه,ر,,,,,,,,,,,,,,
4,ش,د,ا,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7828,ا,ل,س,ي,د,م,ح,م,د,م,ك,ي,ن,,,,
7829,م,ح,م,د,ع,ب,د,ا,ل,ب,ا,ق,ي,,,,
7830,م,ح,م,د,ع,ب,د,ا,ل,م,ق,ص,و,د,,,
7831,ن,ا,ص,ر,ع,ب,د,ا,ل,م,و,ج,و,د,,,


In [230]:
# The dictionray for the mapping 
arabic_alphabet_mapping = {
    'ا': 1, 'ب': 2, 'ت': 3, 'ث': 4, 'ج': 5, 'ح': 6, 'خ': 7, 'د': 8, 'ذ': 9, 'ر': 10,
    'ز': 11, 'س': 12, 'ش': 13, 'ص': 14, 'ض': 15, 'ط': 16, 'ظ': 17, 'ع': 18, 'غ': 19,
    'ف': 20, 'ق': 21, 'ك': 22, 'ل': 23, 'م': 24, 'ن': 25, 'ه': 26, 'و': 27, 'ي': 28,
    'ء' : 29
}

for col in list(df.columns):
    df[col] = df[col].replace(arabic_alphabet_mapping)
df

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,letter_6,letter_7,letter_8,letter_9,letter_10,letter_11,letter_12,letter_13,letter_14,letter_15,letter_16,letter_17
0,1.0,12.0,8.0,,,,,,,,,,,,,,
1,8.0,28.0,22.0,,,,,,,,,,,,,,
2,3.0,21.0,1.0,,,,,,,,,,,,,,
3,12.0,26.0,10.0,,,,,,,,,,,,,,
4,13.0,8.0,1.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7828,1.0,23.0,12.0,28.0,8.0,24.0,6.0,24.0,8.0,24.0,22.0,28.0,25.0,,,,
7829,24.0,6.0,24.0,8.0,18.0,2.0,8.0,1.0,23.0,2.0,1.0,21.0,28.0,,,,
7830,24.0,6.0,24.0,8.0,18.0,2.0,8.0,1.0,23.0,24.0,21.0,14.0,27.0,8.0,,,
7831,25.0,1.0,14.0,10.0,18.0,2.0,8.0,1.0,23.0,24.0,27.0,5.0,27.0,8.0,,,


In [231]:
def remove_spaces(x):
    if(x == ''):
        return np.nan
    else:
        return x

def process_and_encode(x):
    # Remove any special characters and convert to string
    if(type(x) == str):
        x = x.replace('[^a-zA-Z0-9]', '')
    return x

# Assuming your DataFrame is called 'x'
for column in df.columns:
    df[column] = df[column].apply(process_and_encode)
    df[column] = df[column].apply(remove_spaces)


In [232]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
y = labelencoder.fit_transform(train_data.Gender)
x = df

In [233]:
labelencoder.classes_

array(['F', 'M'], dtype=object)

In [234]:
x.astype(float)

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,letter_6,letter_7,letter_8,letter_9,letter_10,letter_11,letter_12,letter_13,letter_14,letter_15,letter_16,letter_17
0,1.0,12.0,8.0,,,,,,,,,,,,,,
1,8.0,28.0,22.0,,,,,,,,,,,,,,
2,3.0,21.0,1.0,,,,,,,,,,,,,,
3,12.0,26.0,10.0,,,,,,,,,,,,,,
4,13.0,8.0,1.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7828,1.0,23.0,12.0,28.0,8.0,24.0,6.0,24.0,8.0,24.0,22.0,28.0,25.0,,,,
7829,24.0,6.0,24.0,8.0,18.0,2.0,8.0,1.0,23.0,2.0,1.0,21.0,28.0,,,,
7830,24.0,6.0,24.0,8.0,18.0,2.0,8.0,1.0,23.0,24.0,21.0,14.0,27.0,8.0,,,
7831,25.0,1.0,14.0,10.0,18.0,2.0,8.0,1.0,23.0,24.0,27.0,5.0,27.0,8.0,,,


In [235]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [236]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')

x_imputed = pd.DataFrame(imputer.fit_transform(x), columns=x.columns)


In [237]:
x_train, x_test, y_train, y_test = train_test_split(x_imputed, y, test_size=0.2, random_state=42)

In [238]:
y_pred_list =  []

In [239]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

import pandas as pd   # only for the leaderboard table

# 1️⃣  Define models in a dictionary
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, solver="lbfgs"),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, max_depth=None, random_state=42, n_jobs=-1
    ),
    "HistGradientBoosting": HistGradientBoostingClassifier()
}

# 2️⃣  Train, predict, evaluate in a loop

y_pred_dict = {}   # model name ➜ prediction array
scores = []        # to build a leaderboard

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # store predictions
    y_pred_dict[name] = y_pred

    # compute metrics
    acc = accuracy_score(y_test, y_pred)
    scores.append({"Model": name, "Accuracy": acc})

    # pretty print per-model results
    print(f"\n\033[1m=== {name} ===\033[0m")               # bold header
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred,
                                target_names=["Female", "Male"]))

# ------------------------------------------------------------------ #
# 3️⃣  Show a leaderboard sorted by accuracy
# ------------------------------------------------------------------ #
leaderboard = (pd.DataFrame(scores)
                 .sort_values("Accuracy", ascending=False)
                 .reset_index(drop=True))

print("\n\033[1mAccuracy leaderboard\033[0m")
print(leaderboard.to_string(index=False))



[1m=== Logistic Regression ===[0m
Accuracy: 0.7301
              precision    recall  f1-score   support

      Female       0.60      0.01      0.03       425
        Male       0.73      1.00      0.84      1142

    accuracy                           0.73      1567
   macro avg       0.67      0.51      0.44      1567
weighted avg       0.70      0.73      0.62      1567


[1m=== Random Forest ===[0m
Accuracy: 0.8851
              precision    recall  f1-score   support

      Female       0.81      0.75      0.78       425
        Male       0.91      0.94      0.92      1142

    accuracy                           0.89      1567
   macro avg       0.86      0.84      0.85      1567
weighted avg       0.88      0.89      0.88      1567


[1m=== HistGradientBoosting ===[0m
Accuracy: 0.8583
              precision    recall  f1-score   support

      Female       0.80      0.64      0.71       425
        Male       0.88      0.94      0.91      1142

    accuracy             

In [240]:
# Make an explicit mapping so you control the codes
gender_map = {0: 'Female',1: 'Male'}

for name, y_pred in y_pred_dict.items():
    y_pred_dict[name] = pd.Series(y_pred).replace(gender_map).to_numpy()
    
    
y_test = pd.Series(y_test).replace(gender_map).to_numpy()

In [241]:
y_pred_dict

{'Logistic Regression': array(['Male', 'Male', 'Male', ..., 'Male', 'Male', 'Male'], dtype=object),
 'Random Forest': array(['Female', 'Male', 'Male', ..., 'Male', 'Female', 'Male'],
       dtype=object),
 'HistGradientBoosting': array(['Female', 'Male', 'Male', ..., 'Male', 'Male', 'Male'],
       dtype=object)}

In [242]:
from fairlearn.metrics import (
    MetricFrame, selection_rate, false_positive_rate, true_positive_rate
)
from sklearn.metrics import accuracy_score, precision_score, recall_score
from functools import partial
import pandas as pd

# --------------------------------------------------------------
# Encode nothing; keep labels as 'Female'/'Male'
# --------------------------------------------------------------
sensitive_attr = y_test              # still strings
y_pred_dict_str = y_pred_dict        # already strings

# All metrics that need a positive class get pos_label='Male'
metrics_dict = {
    "accuracy":        accuracy_score,
    "precision":       partial(precision_score,    pos_label="Male"),
    "recall":          partial(recall_score,       pos_label="Male"),
    "FPR":             partial(false_positive_rate,pos_label="Male"),
    "selection_rate":  partial(selection_rate,    pos_label="Male")
}

gap_rows = []

for name, y_pred in y_pred_dict_str.items():
    mf = MetricFrame(
        metrics=metrics_dict,
        y_true=y_test,
        y_pred=y_pred,
        sensitive_features=sensitive_attr
    )
    
    print(f"\n\033[1m=== {name} – per-gender metrics ===\033[0m")
    display(mf.by_group)                     # rows: 'Female', 'Male'
    print("Overall accuracy:", mf.overall["accuracy"])

    # Fairness gaps (absolute difference Female ↔ Male)
    gap_rows.append({
        "Model":           name,
        "Recall gap":      abs(mf.by_group.loc["Female","recall"]
                              - mf.by_group.loc["Male","recall"]),
        "FPR gap":         abs(mf.by_group.loc["Female","FPR"]
                              - mf.by_group.loc["Male","FPR"]),
        "Sel-rate gap":    abs(mf.by_group.loc["Female","selection_rate"]
                              - mf.by_group.loc["Male","selection_rate"])
    })

# --------------------------------------------------------------
# Leaderboard sorted by smallest recall gap
# --------------------------------------------------------------
gap_df = (pd.DataFrame(gap_rows)
            .sort_values("Recall gap")
            .reset_index(drop=True))

print("\n\033[1mFairness-gap leaderboard (smaller = better)\033[0m")
display(gap_df)


[1m=== Logistic Regression – per-gender metrics ===[0m


Unnamed: 0_level_0,accuracy,precision,recall,FPR,selection_rate
sensitive_feature_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0.014118,0.0,0.0,0.985882,0.985882
Male,0.996497,1.0,0.996497,0.0,0.996497


Overall accuracy: 0.7300574345883855

[1m=== Random Forest – per-gender metrics ===[0m


Unnamed: 0_level_0,accuracy,precision,recall,FPR,selection_rate
sensitive_feature_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0.745882,0.0,0.0,0.254118,0.254118
Male,0.936953,1.0,0.936953,0.0,0.936953


Overall accuracy: 0.8851308232291002

[1m=== HistGradientBoosting – per-gender metrics ===[0m


Unnamed: 0_level_0,accuracy,precision,recall,FPR,selection_rate
sensitive_feature_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0.642353,0.0,0.0,0.357647,0.357647
Male,0.938704,1.0,0.938704,0.0,0.938704


Overall accuracy: 0.8583280153158902

[1mFairness-gap leaderboard (smaller = better)[0m


Unnamed: 0,Model,Recall gap,FPR gap,Sel-rate gap
0,Random Forest,0.936953,0.254118,0.682835
1,HistGradientBoosting,0.938704,0.357647,0.581057
2,Logistic Regression,0.996497,0.985882,0.010615


All three models nail male names but almost never identify female ones.
Female recall sits at 0 % across the board, while male recall hovers around 94–100 %, so overall accuracy masks a huge gender gap.
The fairness table shows recall gaps ≈ 0.94–0.99 and false-positive gaps up to 0.36, confirming severe bias toward the male class.
In short: the classifiers default to predicting “Male,” making them unusable until the class imbalance and decision threshold are fixed.

In [243]:
# Make an explicit mapping so you control the codes
gender_map = {'Female': 0, 'Male':1}

y_test = pd.Series(y_test).replace(gender_map).to_numpy()

In [244]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.utils import class_weight
import numpy as np, pandas as pd

# ------------------------------------------------------------
# 0️⃣  Compute class weights  (Female = 0, Male = 1)
# ------------------------------------------------------------
cw = class_weight.compute_class_weight(
        class_weight="balanced",
        classes=np.unique(y_train),
        y=y_train
     )
weight_dict = {cls: w for cls, w in zip(np.unique(y_train), cw)}
print("Class-weight dict:", weight_dict)

# ------------------------------------------------------------
# 1️⃣  Define models with bias-aware settings
# ------------------------------------------------------------
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=500, solver="lbfgs", class_weight=weight_dict
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, random_state=42, n_jobs=-1, class_weight="balanced"
    ),
    "HistGradientBoosting": HistGradientBoostingClassifier(
        learning_rate=0.1, max_depth=None, class_weight=weight_dict
    )
}

# ------------------------------------------------------------
# 2️⃣  Train, predict (with tuned threshold), evaluate
# ------------------------------------------------------------
y_pred_dict, scores = {}, []

def tune_threshold(proba, y_true, grid=np.linspace(0.3,0.7,9)):
    """Return threshold that minimises |recall_female - recall_male|."""
    best_thr, best_gap = 0.5, 1e9
    for t in grid:
        y_hat = (proba[:,1] >= t).astype(int)
        rec_f = recall_score(y_true, y_hat, pos_label=0)
        rec_m = recall_score(y_true, y_hat, pos_label=1)
        gap   = abs(rec_f - rec_m)
        if gap < best_gap:
            best_gap, best_thr = gap, t
    return best_thr

for name, model in models.items():
    model.fit(x_train, y_train)

    # Probabilities → tune threshold for fairness
    proba = model.predict_proba(x_test)
    thr   = tune_threshold(proba, y_test)      # fairness-driven threshold
    y_pred = (proba[:,1] >= thr).astype(int)

    y_pred_dict[name] = y_pred
    acc  = accuracy_score(y_test, y_pred)
    recF = recall_score(y_test, y_pred, pos_label=0)
    recM = recall_score(y_test, y_pred, pos_label=1)

    scores.append({"Model": name, "Accuracy": acc,
                   "Recall_F": recF, "Recall_M": recM, "Gap": abs(recF-recM)})

    print(f"\n\033[1m=== {name} (thr={thr:.2f}) ===\033[0m")
    print(f"Accuracy: {acc:.4f} | Recall gap: {abs(recF-recM):.4f}")
    print(classification_report(y_test, y_pred, target_names=["Female","Male"]))

# ------------------------------------------------------------
# 3️⃣  Leaderboard (sorted by fairness gap then accuracy)
# ------------------------------------------------------------
df = pd.DataFrame(scores).sort_values(["Gap","Accuracy"]).reset_index(drop=True)
print("\n\033[1mBias-aware leaderboard\033[0m")
print(df[['Model','Accuracy','Recall_F','Recall_M','Gap']].to_string(index=False))


Class-weight dict: {0: 1.831092928112215, 1: 0.6878155872667399}

[1m=== Logistic Regression (thr=0.50) ===[0m
Accuracy: 0.5788 | Recall gap: 0.0097
              precision    recall  f1-score   support

      Female       0.34      0.57      0.42       425
        Male       0.78      0.58      0.67      1142

    accuracy                           0.58      1567
   macro avg       0.56      0.58      0.55      1567
weighted avg       0.66      0.58      0.60      1567


[1m=== Random Forest (thr=0.70) ===[0m
Accuracy: 0.8736 | Recall gap: 0.0010
              precision    recall  f1-score   support

      Female       0.72      0.87      0.79       425
        Male       0.95      0.87      0.91      1142

    accuracy                           0.87      1567
   macro avg       0.83      0.87      0.85      1567
weighted avg       0.89      0.87      0.88      1567


[1m=== HistGradientBoosting (thr=0.55) ===[0m
Accuracy: 0.8232 | Recall gap: 0.0101
              precision    r

In [245]:
# Make an explicit mapping so you control the codes
gender_map = {0: 'Female',1: 'Male'}

for name, y_pred in y_pred_dict.items():
    y_pred_dict[name] = pd.Series(y_pred).replace(gender_map).to_numpy()
    
    
y_test = pd.Series(y_test).replace(gender_map).to_numpy()

In [246]:
from fairlearn.metrics import (
    MetricFrame, selection_rate, false_positive_rate, true_positive_rate
)
from sklearn.metrics import accuracy_score, precision_score, recall_score
from functools import partial
import pandas as pd

# --------------------------------------------------------------
# Encode nothing; keep labels as 'Female'/'Male'
# --------------------------------------------------------------
sensitive_attr = y_test              # still strings
y_pred_dict_str = y_pred_dict        # already strings

# All metrics that need a positive class get pos_label='Male'
metrics_dict = {
    "accuracy":        accuracy_score,
    "precision":       partial(precision_score,    pos_label="Male"),
    "recall":          partial(recall_score,       pos_label="Male"),
    "FPR":             partial(false_positive_rate,pos_label="Male"),
    "selection_rate":  partial(selection_rate,    pos_label="Male")
}

gap_rows = []

for name, y_pred in y_pred_dict_str.items():
    mf = MetricFrame(
        metrics=metrics_dict,
        y_true=y_test,
        y_pred=y_pred,
        sensitive_features=sensitive_attr
    )
    
    print(f"\n\033[1m=== {name} – per-gender metrics ===\033[0m")
    display(mf.by_group)                     # rows: 'Female', 'Male'
    print("Overall accuracy:", mf.overall["accuracy"])

    # Fairness gaps (absolute difference Female ↔ Male)
    gap_rows.append({
        "Model":           name,
        "Recall gap":      abs(mf.by_group.loc["Female","recall"]
                              - mf.by_group.loc["Male","recall"]),
        "FPR gap":         abs(mf.by_group.loc["Female","FPR"]
                              - mf.by_group.loc["Male","FPR"]),
        "Sel-rate gap":    abs(mf.by_group.loc["Female","selection_rate"]
                              - mf.by_group.loc["Male","selection_rate"])
    })

# --------------------------------------------------------------
# Leaderboard sorted by smallest recall gap
# --------------------------------------------------------------
gap_df = (pd.DataFrame(gap_rows)
            .sort_values("Recall gap")
            .reset_index(drop=True))

print("\n\033[1mFairness-gap leaderboard (smaller = better)\033[0m")
display(gap_df)


[1m=== Logistic Regression – per-gender metrics ===[0m


Unnamed: 0_level_0,accuracy,precision,recall,FPR,selection_rate
sensitive_feature_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0.571765,0.0,0.0,0.428235,0.428235
Male,0.581436,1.0,0.581436,0.0,0.581436


Overall accuracy: 0.5788130185067007

[1m=== Random Forest – per-gender metrics ===[0m


Unnamed: 0_level_0,accuracy,precision,recall,FPR,selection_rate
sensitive_feature_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0.872941,0.0,0.0,0.127059,0.127059
Male,0.873905,1.0,0.873905,0.0,0.873905


Overall accuracy: 0.8736439055520102

[1m=== HistGradientBoosting – per-gender metrics ===[0m


Unnamed: 0_level_0,accuracy,precision,recall,FPR,selection_rate
sensitive_feature_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0.830588,0.0,0.0,0.169412,0.169412
Male,0.82049,1.0,0.82049,0.0,0.82049


Overall accuracy: 0.8232291001914487

[1mFairness-gap leaderboard (smaller = better)[0m


Unnamed: 0,Model,Recall gap,FPR gap,Sel-rate gap
0,Logistic Regression,0.581436,0.428235,0.153201
1,HistGradientBoosting,0.82049,0.169412,0.651079
2,Random Forest,0.873905,0.127059,0.746847


## Conclusion 

The model that has the most accuracy is random forest tree with ambutation and also the HistGradientBoostingClassifier 
With accuracy 89%