In [1]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

## Introduction

We are embarking on a project to ascertain the gender of individuals with calcified names, utilizing data entries from customer service at El-Araby Group. To safeguard privacy, personal information such as phone numbers has been expunged from the dataset.

In this project, the aim is to locate a suitable dataset and implement a machine learning model capable of accurately classifying the gender associated with each name. The selected algorithm for this classification task is the *Random Forest Algorithm*. Following this, the developed model will be applied to the anonymized dataset to predict the gender for each entry.

**Objectives:**
1. Clean the dataset by eliminating any potentially corrupting elements.
2. Develop a machine learning algorithm capable of accurately predicting gender based on the provided names.

We will proceed with these objectives to construct a reliable model for gender classification while ensuring the integrity and privacy of the data.


### Import  all the Dataframes I will use for this project

In [2]:
# This is the dataset in which I want to predict
freezed_df = pd.read_csv("FreezedNames_final - FreezedNames.csv")
# These two are different dataset that I will merge together and clean them both and finally train the model on
train_df = pd.read_csv("train_data.csv")
train_df2 = pd.read_csv("arabic_names - arabic_names (1).csv")


### Analyze Train_df2

In [3]:
train_df.sample(10)

Unnamed: 0,Names,Gender
46,وجيه,M
4504,مارسلينو,M
4100,مزاد,M
3498,مورينا,F
2911,فورينا,F
1210,المشتاوي,M
217,ظريف,M
2398,شمشوم,M
3270,زعيم,M
4799,محمدثابت,M


I already cleaned the data on Excel and SQL 

In [4]:
train_df.describe()

Unnamed: 0,Names,Gender
count,6294,6296
unique,6283,2
top,اكرم,M
freq,2,4776


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6296 entries, 0 to 6295
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Names   6294 non-null   object
 1   Gender  6296 non-null   object
dtypes: object(2)
memory usage: 98.5+ KB


In [6]:
print(train_df["Names"].isna().sum())
print(train_df["Gender"].isna().sum())

2
0


### Remove the Null values

In [7]:
train_df = train_df[(~train_df["Names"].isna())].reset_index()

train_df["Names"].isna().sum()

0

Do the same cleaning Process on the train_df2

In [8]:
train_df2.sample(10)

Unnamed: 0,arabic_name,gender
510,شهلاء,F
1533,نامي,M
358,حنيفة,F
37,عائدة,F
1222,عبدي,M
1023,رماح,M
1559,نصوح,M
1360,قدري,M
331,تيماء,F
535,عاطفة,F


In [9]:
train_df2.describe()

Unnamed: 0,arabic_name,gender
count,1611,1611
unique,1482,2
top,فرح,M
freq,3,980


In [10]:
train_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   arabic_name  1611 non-null   object
 1   gender       1611 non-null   object
dtypes: object(2)
memory usage: 25.3+ KB


In [11]:
print(train_df2["arabic_name"].isna().sum())
print(train_df2["gender"].isna().sum())

0
0


### Merge the Two Datasets into one 

In [12]:
train_df2 = train_df2.rename(columns = {"arabic_name" : "Names" , 'gender': 'Gender'})

In [13]:
merged_df = pd.concat([train_df, train_df2], ignore_index=True)

# Print the result
merged_df.tail(10)

Unnamed: 0,index,Names,Gender
7895,,ولاء الدين,M
7896,,وليد,M
7897,,وليف,M
7898,,وهاب,M
7899,,وهب,M
7900,,وهبة,M
7901,,ياسر,M
7902,,يحيى,M
7903,,يزيد,M
7904,,يعقوب,M


In [14]:
merged_df.isna().sum()

index     1611
Names        0
Gender       0
dtype: int64

In [15]:
train_df = merged_df

### Some names are first and last name and we only need the first name to know the gender 
### So. we need to extract the first name only.

In [16]:
def Split(x):
    if(type(x) == float):
        print(x)
    return x.split("\xa0")[-1]

test = train_df["Names"].apply(Split)
train_df["Names"] = pd.DataFrame(test)

### Find the longest Name

- Apply the len function on the Names column 
- Sort the Data Frame by length
- Display the longest names

In [17]:

length = train_df["Names"].apply(len)

train_df['Length'] = length

sorted_train_df = train_df.sort_values(by = 'Length' , ascending= True)

In [18]:
display(sorted_train_df[sorted_train_df['Length'] <= 2])

Unnamed: 0,index,Names,Gender,Length
3820,3821.0,ه,M,1
2330,2331.0,ث,M,1
2344,2345.0,ع,M,1
315,315.0,ل,M,1
2667,2668.0,ح,M,1
...,...,...,...,...
737,737.0,نم,M,2
3129,3130.0,مل,M,2
5437,5439.0,ود,F,2
665,665.0,هي,F,2


There are many enteries where the name consists of one or two letter which cannot be true for the one letter and very rare for two letters so we need to remove them

In [19]:
#Remove the enteries where the length the is 1 letter
sorted_train_df = sorted_train_df[sorted_train_df['Length'] > 2]
sorted_train_df.head(10)

Unnamed: 0,index,Names,Gender,Length
7080,,أسد,M,3
2110,2111.0,ديك,M,3
2332,2333.0,تقا,F,3
2111,2112.0,سهر,F,3
2338,2339.0,شدا,F,3
2361,2362.0,هوي,M,3
7059,,آدم,M,3
2157,2158.0,حست,M,3
2142,2143.0,هيم,M,3
2345,2346.0,همس,F,3


In [20]:
train_df = sorted_train_df[["Names" , "Gender"]].reset_index(drop=True)

In [21]:
#view the train_df
train_df.head()

Unnamed: 0,Names,Gender
0,أسد,M
1,ديك,M
2,تقا,F
3,سهر,F
4,شدا,F


# Prerpocess the freezed_df

In [22]:
#first upload the dataset for the freezed names and show a smaple of it
freezed_df.head()

Unnamed: 0,Name
0,​انور ناجى شامخ
1,​وجدي عزت متري
2,• محمود عطيه حسانين
3,1000553724
4,1001146286


there is a problem with the data where there are english and arabic names as well as errors entering the data and finally there are numbers in the name field 

since the data we need to find it all about strings we need to drop the data that has only numbers

In [23]:
#find the number of numeric cells
numeric_mask = pd.to_numeric(freezed_df['Name'], errors='coerce').notnull()
print(len(freezed_df[numeric_mask]))
freezed_df[numeric_mask].head()

1450


Unnamed: 0,Name
3,1000553724
4,1001146286
5,1001602047
6,1003100560
7,1003252205


In [24]:
#remove all the numeric values from the data set
freezed_df = freezed_df[~numeric_mask].reset_index(drop = True)
freezed_df.sample(5, random_state = 42)


Unnamed: 0,Name
92257,سامه نبيل محمد
196530,ولبيد ربيع رضوان
110790,روفيدا سليمان علي
45255,ايميل اديب تكله
170057,قمر فرغلي حسين


### 1- remove all the special charaters

In [25]:
import re

test = freezed_df.copy()
test["Name"] = test["Name"].str.replace(r"[@#$%^&*()_+{}<>?/|•٠1234567890.`]", "")
freezed_df = test


In [26]:
#check for null values
freezed_df.isna().sum()

Name    3
dtype: int64

In [27]:
freezed_df.dropna(inplace=True)
freezed_df.isna().sum()

Name    0
dtype: int64

### 2 -  here is the data where there are english and arabic names 

In [28]:
display(freezed_df.iloc[145:165])

Unnamed: 0,Name
145,LYNN MARIE
146,lمحمد ابراهيم حافظ سالم
147,lمحمد جمال الدين حسين
148,lمحمد حلمى المهدى
149,lمحمد صابر مصطفي
150,lمحمد عبد القادر
151,lمحمد عبداله
152,lمحمد فهمى
153,lمحمد يحيي ابراهيم
154,Lمحمد يحيى دسوقى


Remove the English letters if the name has arabic letters in it

In [29]:
import re
pattern = r'[A-Za-z]'
test = freezed_df.copy()
# test = test.to_dict()
pattern = r'[A-Za-z]'

def remove_english_letters(text):
  if re.search(r'[\u0600-\u06FF]', text):
    return re.sub(pattern, '', text)
  else:
    return text


test["Name"] = test["Name"].apply(remove_english_letters)

    
freezed_names = test
display(test.iloc[145:165])

Unnamed: 0,Name
145,LYNN MARIE
146,محمد ابراهيم حافظ سالم
147,محمد جمال الدين حسين
148,محمد حلمى المهدى
149,محمد صابر مصطفي
150,محمد عبد القادر
151,محمد عبداله
152,محمد فهمى
153,محمد يحيي ابراهيم
154,محمد يحيى دسوقى


next I am going to make a new column called language where it is going to assign 1 when it has english and 0 for arabic 
to be able to group the data depending on the being written in english or arabic

0: arabic <br>
1: english


In [30]:
def which_lan(text):
  if re.search(r'[\u0600-\u06FF]', text):
    return 0
  else:
    return 1


lang = []
for row in freezed_df["Name"]:
    lang.append(which_lan(row))
freezed_df = test
freezed_df = freezed_df.assign(lang = lang)
freezed_df.iloc[145:165]

Unnamed: 0,Name,lang
145,LYNN MARIE,1
146,محمد ابراهيم حافظ سالم,0
147,محمد جمال الدين حسين,0
148,محمد حلمى المهدى,0
149,محمد صابر مصطفي,0
150,محمد عبد القادر,0
151,محمد عبداله,0
152,محمد فهمى,0
153,محمد يحيي ابراهيم,0
154,محمد يحيى دسوقى,0


In [31]:
def remove_leading_spaces(name):
  return name.lstrip("\s")

freezed_df["Name"] = freezed_df["Name"].apply(remove_leading_spaces)

display(freezed_df.head())

Unnamed: 0,Name,lang
0,​انور ناجى شامخ,0
1,​وجدي عزت متري,0
2,محمود عطيه حسانين,0
3,سماح محمد عبد الحافظ,0
4,صباح عبد الشافى محمد ح,0


In [32]:
#there are many entries where it consists of only a space

freezed_df = freezed_df[~(test["Name"].str.isspace())]

In [33]:
display(freezed_df.iloc[1510 :1520])

Unnamed: 0,Name,lang
1510,nagui nosseir,1
1512,​انطونيو وهبه حكيم,0
1514,الحسينى احمد ابو العزم,0
1515,محمود حسن عبد المجيد,0
1516,ابو الحسن عبد الرحيم الصاوى,0
1517,هشام على فتحى,0
1518,سيد عبد التواب عيد,0
1519,محمد عبدالله,0
1520,أيهاب احمد,0
1521,يهاب علي موافي,0


In [34]:
def get_first_name(name):
  filtered_array = list(filter(bool, name.split(" ")))
  try:
        return filtered_array[0]
  except IndexError :
    print(name , freezed_df.loc[name])
  


first_names =  freezed_df["Name"].apply(get_first_name)

test = freezed_df.copy()
test  = test.assign(first_names = first_names)
freezed_df = test
freezed_df.head()

Unnamed: 0,Name,lang,first_names
0,​انور ناجى شامخ,0,​انور
1,​وجدي عزت متري,0,​وجدي
2,محمود عطيه حسانين,0,محمود
3,سماح محمد عبد الحافظ,0,سماح
4,صباح عبد الشافى محمد ح,0,صباح


In [35]:
inconsistency_mapper = {
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ة": "ه",
    "ى": "ي",
    "ـ": "",
    " الدين": "الدين",
    "عبد ": "عبد",
    "ابو ": "ابو",
    "ام ": "ام",
    '.' : '',
    'ؤ' : "و"
}



In [36]:
freezed_df['length'] = freezed_df['first_names'].apply(len)
freezed_df.head()

Unnamed: 0,Name,lang,first_names,length
0,​انور ناجى شامخ,0,​انور,5
1,​وجدي عزت متري,0,​وجدي,5
2,محمود عطيه حسانين,0,محمود,5
3,سماح محمد عبد الحافظ,0,سماح,4
4,صباح عبد الشافى محمد ح,0,صباح,4


In [37]:
freezed_df['first_names'] = freezed_df['first_names'].replace(inconsistency_mapper)

In [38]:
freezed_df 

Unnamed: 0,Name,lang,first_names,length
0,​انور ناجى شامخ,0,​انور,5
1,​وجدي عزت متري,0,​وجدي,5
2,محمود عطيه حسانين,0,محمود,5
3,سماح محمد عبد الحافظ,0,سماح,4
4,صباح عبد الشافى محمد ح,0,صباح,4
...,...,...,...,...
205835,هنام مصطفي عبد العزيز,0,هنام,4
205836,هناى عزب,0,هناى,4
205837,هناى محمد الجيار,0,هناى,4
205838,يسرب علي محمد,0,يسرب,4


### Make a table for english and arabic written names

In [39]:
arabic_names = freezed_df[freezed_df["lang"] ==0]
english_names = freezed_df[freezed_df["lang"] ==1]


In [40]:
gourped_arabic = arabic_names.groupby("first_names").size().to_frame(name="count").reset_index()
gourped_arabic = gourped_arabic.sort_values(by=["count"], ascending=False)
gourped_arabic.head(10)

Unnamed: 0,first_names,count
1953,اسلام,2633
33801,نادى,1944
33806,نادي,1304
29889,محمد,1077
34350,نجاة,1053
27032,قمر,967
11192,جمال,933
5849,المهندس,810
22310,ع,637
1172,احمد,622


In [41]:
Freezed_arabic_first = arabic_names[["first_names" , 'length']]
Freezed_english_first = english_names[["first_names" , 'length']]

In [42]:
Freezed_arabic_first.iloc[0][0]

'\u200bانور'

There are some enteries which has irregular expressions or zero-width space 

In [43]:
import re

Freezed_arabic_first['first_names'] = Freezed_arabic_first['first_names'].str.replace(r"[@#$%^&*()_+{}<>?/|•٠1234567890.` ِ\u200bc ]", "")

Freezed_arabic_first['first_names'] = Freezed_arabic_first['first_names'].replace("" , np.nan).dropna()

any(element == "" for element in Freezed_arabic_first)

False

There are names which are very long and not suitable to be made into the model because no spaces between them.

In [44]:
Freezed_arabic_first

Unnamed: 0,first_names,length
0,انور,5
1,وجدي,5
2,محمود,5
3,سماح,4
4,صباح,4
...,...,...
205835,هنام,4
205836,هناى,4
205837,هناى,4
205838,يسرب,4


In [45]:
sorted_Freezed_arabic_first = Freezed_arabic_first.sort_values(by = 'length', ascending= False)
sorted_Freezed_arabic_first.head()

Unnamed: 0,first_names,length
51502,العالميةلتجارةالجملةوالتجزئةللاجهزة,35
58294,المصريةالاوربيةللاستثمارالانمائ-واي,35
54279,المصريةالعربيةللاستثماروالتنميةالعق,35
146640,مؤسسةالجادلتجارةالأجهزةالكهربائية,33
154750,نادرلصيانةوتجارةالاجهزة-نادرجلال,32


In [46]:
mask = ((Freezed_arabic_first['length'] <=14 ) & (Freezed_arabic_first['length'] >2 ))
Freezed_arabic_first = Freezed_arabic_first[mask]

In [47]:
sorted_Freezed_arabic_first = Freezed_arabic_first.sort_values(by = 'length', ascending= False)
sorted_Freezed_arabic_first.head()

Unnamed: 0,first_names,length
164120,محمودعبدالعظيم,14
161644,نورامحمودمحمود,14
101861,سعيدمحمدالنجار,14
86736,جلال محمد جمعة,14
101863,سعيدمحمودرمضان,14


# Train the machine learning algorithm for arabic names

### Encode the train_df to fit inside the machine learning models

In [49]:
train_data = train_df
train_data

Unnamed: 0,Names,Gender
0,أسد,M
1,ديك,M
2,تقا,F
3,سهر,F
4,شدا,F
...,...,...
7828,السيدمحمدمكين,M
7829,محمدعبدالباقي,M
7830,محمدعبدالمقصود,M
7831,ناصرعبدالموجود,M


### Encoding the values to be numrical instead of string for the model 

In [50]:
inconsistency_mapper = {
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ة": "ه",
    "ى": "ي",
    "ـ": "",
    " الدين": "الدين",
    "عبد ": "عبد",
    "ابو ": "ابو",
    "ام ": "ام",
    '.' : '',
    'ؤ' : "و",
    '-': '',
    'ئ': 'ي',
    "'ُ" : '' 
}

In [55]:

# Make a function the removes inconsistencies in a string

def remove_inconsistencies(input_string):
    # Iterate through the inconsistency_mapper and replace each key with its corresponding value
    for key, value in inconsistency_mapper.items():
        input_string = input_string.replace(key, value)
    
    return input_string

def remove_non_arabic(input_string):
    # Use regex to remove anything that is not an Arabic letter
    arabic_letters = re.sub(r'[^اأإءآبتثجحخدذرزسشصضطظعغفقكلمنهوي]', '', input_string)
    return arabic_letters



In [56]:
train_data['Names'] = train_data['Names'].apply(remove_inconsistencies)
train_data['Names'] = train_data['Names'].apply(remove_non_arabic)
train_data.head()

Unnamed: 0,Names,Gender
0,اسد,M
1,ديك,M
2,تقا,F
3,سهر,F
4,شدا,F


In [57]:
df = train_data

# Split the 'word' column into individual letters and create new columns
df = df['Names'].apply(lambda x: pd.Series(list(x)))

# Rename the columns to have meaningful names if needed
df.columns = [f'letter_{i+1}' for i in range(df.shape[1])]

# Display the resulting DataFrame
display(df)

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,letter_6,letter_7,letter_8,letter_9,letter_10,letter_11,letter_12,letter_13,letter_14,letter_15,letter_16,letter_17
0,ا,س,د,,,,,,,,,,,,,,
1,د,ي,ك,,,,,,,,,,,,,,
2,ت,ق,ا,,,,,,,,,,,,,,
3,س,ه,ر,,,,,,,,,,,,,,
4,ش,د,ا,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7828,ا,ل,س,ي,د,م,ح,م,د,م,ك,ي,ن,,,,
7829,م,ح,م,د,ع,ب,د,ا,ل,ب,ا,ق,ي,,,,
7830,م,ح,م,د,ع,ب,د,ا,ل,م,ق,ص,و,د,,,
7831,ن,ا,ص,ر,ع,ب,د,ا,ل,م,و,ج,و,د,,,


In [61]:
# The dictionray for the mapping 
arabic_alphabet_mapping = {
    'ا': 1, 'ب': 2, 'ت': 3, 'ث': 4, 'ج': 5, 'ح': 6, 'خ': 7, 'د': 8, 'ذ': 9, 'ر': 10,
    'ز': 11, 'س': 12, 'ش': 13, 'ص': 14, 'ض': 15, 'ط': 16, 'ظ': 17, 'ع': 18, 'غ': 19,
    'ف': 20, 'ق': 21, 'ك': 22, 'ل': 23, 'م': 24, 'ن': 25, 'ه': 26, 'و': 27, 'ي': 28,
    'ء' : 29
}

for col in list(df.columns):
    df[col] = df[col].replace(arabic_alphabet_mapping)
df

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,letter_6,letter_7,letter_8,letter_9,letter_10,letter_11,letter_12,letter_13,letter_14,letter_15,letter_16,letter_17
0,1.0,12.0,8.0,,,,,,,,,,,,,,
1,8.0,28.0,22.0,,,,,,,,,,,,,,
2,3.0,21.0,1.0,,,,,,,,,,,,,,
3,12.0,26.0,10.0,,,,,,,,,,,,,,
4,13.0,8.0,1.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7828,1.0,23.0,12.0,28.0,8.0,24.0,6.0,24.0,8.0,24.0,22.0,28.0,25.0,,,,
7829,24.0,6.0,24.0,8.0,18.0,2.0,8.0,1.0,23.0,2.0,1.0,21.0,28.0,,,,
7830,24.0,6.0,24.0,8.0,18.0,2.0,8.0,1.0,23.0,24.0,21.0,14.0,27.0,8.0,,,
7831,25.0,1.0,14.0,10.0,18.0,2.0,8.0,1.0,23.0,24.0,27.0,5.0,27.0,8.0,,,


In [None]:
def remove_spaces(x):
    if(x == ''):
        return np.nan
    else:
        return x

def process_and_encode(x):
    # Remove any special characters and convert to string
    if(type(x) == str):
        x = x.replace('[^a-zA-Z0-9]', '')
    return x

# Assuming your DataFrame is called 'x'
for column in df.columns:
    df[column] = df[column].apply(process_and_encode)
    df[column] = df[column].apply(remove_spaces)

# # Convert the DataFrame to float
# for col in list(x.columns):
#     try:
#         x[col] = x[col].astype(float)
#     except ValueError as e:
#         print(col , type(x[col].dtypes))


In [62]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
y = labelencoder.fit_transform(train_data.Gender)
x = df

In [63]:
x.astype(float)

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,letter_6,letter_7,letter_8,letter_9,letter_10,letter_11,letter_12,letter_13,letter_14,letter_15,letter_16,letter_17
0,1.0,12.0,8.0,,,,,,,,,,,,,,
1,8.0,28.0,22.0,,,,,,,,,,,,,,
2,3.0,21.0,1.0,,,,,,,,,,,,,,
3,12.0,26.0,10.0,,,,,,,,,,,,,,
4,13.0,8.0,1.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7828,1.0,23.0,12.0,28.0,8.0,24.0,6.0,24.0,8.0,24.0,22.0,28.0,25.0,,,,
7829,24.0,6.0,24.0,8.0,18.0,2.0,8.0,1.0,23.0,2.0,1.0,21.0,28.0,,,,
7830,24.0,6.0,24.0,8.0,18.0,2.0,8.0,1.0,23.0,24.0,21.0,14.0,27.0,8.0,,,
7831,25.0,1.0,14.0,10.0,18.0,2.0,8.0,1.0,23.0,24.0,27.0,5.0,27.0,8.0,,,


In [72]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [78]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')

x_imputed = pd.DataFrame(imputer.fit_transform(x), columns=x.columns)


In [79]:
x_train, x_test, y_train, y_test = train_test_split(x_imputed, y, test_size=0.2, random_state=42)

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Initialize and train the RandomForestClassifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(x_train, y_train)

# Predict on the test data
y_pred = rf_classifier.predict(x_test)

# Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.8876834716017868


## Conclusion 

The model that has the most accuracy is random forest tree with ambutation and also the HistGradientBoostingClassifier 
With accuracy 89%