In [2]:
import pandas as pd

# **Data Loading**   *(just the IN.csv)*



In [3]:
df = pd.read_csv('/content/IN.csv')

# **Data Cleaning**


*   Renaming/Naming Coloumns
*   Removing Null Values
    - Names coloumn Nan chnaged to ""
    - Nan Genders changed to M or F by searching for corresponding Firstname's other occurance in df and replacing by it gender(if not Nan).
    - Dropped few remaining Nan Gender containing records safely.








In [4]:
# prompt: rename the coloumn names

df.rename(columns={
    'Male': 'First_Name',
    'Unnamed: 1': 'Mid_Name',
    "Unnamed: 2": "Gender",
    "IN": "Country"
}, inplace=True)
df


Unnamed: 0,First_Name,Mid_Name,Gender,Country
0,Kapil Kumar,Kapil Kumar,M,IN
1,Mitali,Aggarwal,F,IN
2,Vikas,Jangra,,IN
3,Ravi,Lungay,M,IN
4,Jagat Yadav,Yadav,M,IN
...,...,...,...,...
6161585,Vikas,Chakchanpur,M,IN
6161586,Dipu,Gupta,M,IN
6161587,Riya,Naharwal,F,IN
6161588,Jashandeep,Hanjra,M,IN


In [5]:
print(df.isnull().sum())

First_Name    30646
Mid_Name      50477
Gender        24966
Country           0
dtype: int64


In [6]:
df.fillna({'First_Name':""}, inplace=True)
df.fillna({'Mid_Name':""}, inplace=True)


In [7]:
print(df.isnull().sum())

First_Name        0
Mid_Name          0
Gender        24966
Country           0
dtype: int64


In [8]:
# Create a dictionary of First_Name to Gender (excluding NaNs)
name_gender_map = df[df['Gender'].notna()].groupby('First_Name')['Gender'].first().to_dict()

# Fill missing Gender values using the dictionary
df['Gender'] = df['Gender'].fillna(df['First_Name'].map(name_gender_map))


In [9]:
print(df.isnull().sum())

First_Name       0
Mid_Name         0
Gender        2358
Country          0
dtype: int64


In [10]:
# drop records with empty genders
df = df.dropna(subset=['Gender'])

# **Grouping by Names and Gender to find Frequency of Names in document**

In [11]:
df_grouped = df.groupby(['First_Name', 'Mid_Name', 'Gender']).size().reset_index(name='Frequency_of_Name')

In [12]:
df_grouped["Full_Name"] = df_grouped["First_Name"].str.strip() + " " + df_grouped["Mid_Name"].str.strip()

In [13]:
df_grouped["Full_Name"] = df_grouped["Full_Name"].str.strip()
df_grouped["Mid_Name"] = df_grouped["Mid_Name"].str.strip()

In [14]:
df_grouped

Unnamed: 0,First_Name,Mid_Name,Gender,Frequency_of_Name,Full_Name
0,,A Ali,M,1,A Ali
1,,A Cm,M,1,A Cm
2,,A Kitoliya,M,1,A Kitoliya
3,,A Patel,M,1,A Patel
4,,A Sangma,M,1,A Sangma
...,...,...,...,...,...
3493751,ﻏﻼﻡ,سبحانی,M,1,ﻏﻼﻡ سبحانی
3493752,ﻣﺆﻣﻦ,ﺧﺎﻥ,M,1,ﻣﺆﻣﻦ ﺧﺎﻥ
3493753,ﻣﺎﮔﺮﮮ,جعفر,M,1,ﻣﺎﮔﺮﮮ جعفر
3493754,ﻣﺤﻤﺪ,حمذة,M,1,ﻣﺤﻤﺪ حمذة


In [15]:
df_grouped["Country"] = "IN"

In [16]:
df_grouped["Full_Name"].str.strip()

Unnamed: 0,Full_Name
0,A Ali
1,A Cm
2,A Kitoliya
3,A Patel
4,A Sangma
...,...
3493751,ﻏﻼﻡ سبحانی
3493752,ﻣﺆﻣﻦ ﺧﺎﻥ
3493753,ﻣﺎﮔﺮﮮ جعفر
3493754,ﻣﺤﻤﺪ حمذة



# # **Ranker Function**
*This function calculates a weighted score based on the full name, first name, and mid name. It sums the exact matches, adds partial matches (0.1 weight for first and mid names), then normalizes the final score to be between 0 and 1, based on the frequency of names.*


In [31]:


def final_sum(full_name):
  names = full_name.split()
  print(names)
  sum = 0
  partial_sum_first =0
  partial_sum_mid = 0
  sum+=df_grouped[df_grouped["Full_Name"]==full_name]['Frequency_of_Name'].sum()
  for name in names:
    partial_sum_first += df_grouped[df_grouped["First_Name"].str.contains(name, case=False, na=False)]['Frequency_of_Name'].sum()
    partial_sum_first += df_grouped[df_grouped["Mid_Name"].str.contains(name, case=False, na=False)]['Frequency_of_Name'].sum()

  final_sum = int(sum *1 +(partial_sum_first*0.1)+(partial_sum_mid*0.1))
  normalized_value = (final_sum - df_grouped['Frequency_of_Name'].min()) / (df_grouped['Frequency_of_Name'].max() - df_grouped['Frequency_of_Name'].min())

  if normalized_value > 1:
    normalized_value = 1
  elif normalized_value < 0:
    normalized_value = 0
  print("partially used",partial_sum_first, "times as first name")
  print("partially used",partial_sum_mid, "times as mid name")
  print("used exactly",sum)

  return normalized_value




# ***#User Function***


In [32]:
print("keep on asking for input strings and get score, \nPress *!q* to end ")

while True:
  user_input = input("Enter a string: ")
  if user_input == "q!":
    break
  score = final_sum(user_input)
  print("Score:", score)
  print("*-----------------------------------------------------------------*")

keep on asking for input strings and get score, 
Press *!q* to end 
Enter a string: Yves Tintin
['Yves', 'Tintin']
partially used 17 times as first name
partially used 0 times as mid name
used exactly 0
Score: 0
*-----------------------------------------------------------------*
Enter a string: Raj Kumar
['Raj', 'Kumar']
partially used 1079500 times as first name
partially used 0 times as mid name
used exactly 4709
Score: 1
*-----------------------------------------------------------------*
Enter a string: Sachin Tendulkar
['Sachin', 'Tendulkar']
partially used 16491 times as first name
partially used 0 times as mid name
used exactly 2
Score: 0.03166378814047208
*-----------------------------------------------------------------*
Enter a string: Thomas
['Thomas']
partially used 1154 times as first name
partially used 0 times as mid name
used exactly 13
Score: 0.004413740165035502
*-----------------------------------------------------------------*
Enter a string: Joseph 
['Joseph']
parti