In [1]:
import pandas as pd

In [2]:
# Loading the datasets
arabic_names = pd.read_csv('data/pre_merge_data/Arabic_names.csv')
arabic_names_with_gender = pd.read_csv('data/pre_merge_data/arabic_names_with_gender.csv')
females_ar = pd.read_csv('data/pre_merge_data/females_ar.csv')
males_ar = pd.read_csv('data/pre_merge_data/males_ar.csv')

In [3]:
# Displaying the first few rows of each dataset to understand their structure
datasets = [arabic_names, arabic_names_with_gender, females_ar, males_ar]
dataset_names = ['Arabic Names', 'Arabic Names with Gender', 'Females Arabic', 'Males Arabic']

for name, dataset in zip(dataset_names, datasets):
    print(f"Dataset: {name}")
    print(dataset.head())
    print("\n")

Dataset: Arabic Names
    names sex
0  ابتسام   F
1  ابتهاج   F
2  ابتهال   F
3  اجتهاد   F
4  ازدهار   F


Dataset: Arabic Names with Gender
  gender    name
0    ذكر  ابانوب
1    ذكر   ميلاد
2    ذكر    حنين
3    ذكر    ناشد
4    ذكر    احمد


Dataset: Females Arabic
     Name  Gender
0    آداب  Female
1    آسية  Female
2    آلاء  Female
3    آمال  Female
4  الآيات  Female


Dataset: Males Arabic
   Name Gender
0   آدم   Male
1   آسر   Male
2   أبد   Male
3  أبدع   Male
4  أبره   Male




In [4]:
# Standardizing column names and values

# For Arabic Names
arabic_names = arabic_names.rename(columns={'names': 'name'})

# For Arabic Names with Gender
arabic_names_with_gender = arabic_names_with_gender.rename(columns={'gender': 'sex'})
arabic_names_with_gender['sex'] = arabic_names_with_gender['sex'].map({'ذكر': 'M', 'أنثى': 'F'})

females_ar = females_ar.rename(columns={'Gender': 'sex','Name':'name'})
females_ar['sex'] = females_ar['sex'].map({'Male': 'M', 'Female': 'F'})

males_ar = males_ar.rename(columns={'Gender': 'sex','Name':'name'})
males_ar['sex'] = males_ar['sex'].map({'Male': 'M', 'Female': 'F'})

# Now we concatenate all the datasets
merged_dataset = pd.concat([arabic_names, arabic_names_with_gender, females_ar, males_ar], ignore_index=True)

# Display the first few rows of the merged dataset
merged_dataset.head()


Unnamed: 0,name,sex
0,ابتسام,F
1,ابتهاج,F
2,ابتهال,F
3,اجتهاد,F
4,ازدهار,F


In [5]:
# Display the first few rows of the merged dataset
merged_dataset.shape

(18151, 2)

In [6]:
merged_dataset.sample(n=10)

Unnamed: 0,name,sex
1372,هائل,M
2928,خاطر,M
10977,اخلاص,
8578,البرنس,M
7991,ابومرز,M
12931,فت,M
6627,ندين,F
18088,نسيم,M
13360,فينى,M
7874,هاشيما,M


In [7]:
# Removing duplicated values from the merged dataset
merged_dataset_no_duplicates = merged_dataset.drop_duplicates()

In [8]:
merged_dataset_no_duplicates.shape

(10684, 2)

In [9]:
# Saving the updated dataset without duplicates as a CSV file
no_duplicates_file_path = 'data/Arabic_names.csv'
merged_dataset_no_duplicates.to_csv(no_duplicates_file_path, index=False)