In [1]:
# Colab only: mount Google Drive at /content/drive so the project folder used
# by the later cells ("/content/drive/My Drive/...") is reachable.
# Comment this cell out when not running in Google Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Install requirements, then comment out, restart runtime and run all
%%capture
%cd /content/drive/My Drive/its_all_in_the_name_light_repo/code
!pip install -r requirements.txt
!pip install unidecode
!pip install multiprocessing

# Imports

In [10]:
import os
import pandas as pd
from unidecode import unidecode
from multiprocessing import Pool
import pickle
import warnings
warnings.filterwarnings("ignore")

# Constants

In [11]:
def _ensure_dir(path):
    """Create `path` (one directory level) if it is missing and return it."""
    if not os.path.isdir(path):
        os.mkdir(path)
    return path


# Project location. If running in Google Colab:
base_dir = _ensure_dir("/content/drive/My Drive/its_all_in_the_name_light_repo")

# base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) # comment if running on colab

# Sub-folders are created parent-first, because os.mkdir only makes one level.
data_dir = _ensure_dir(base_dir+"/data/")            # input CSV lives here
model_dir = _ensure_dir(base_dir+"/models/")         # pickled vectorizer / classifier
output_dir = _ensure_dir(base_dir+"/data/predictions/")  # cleaned data and predictions

# Name of data file (looked up inside data_dir)
dataname = "sample_data.csv"

# Check Data

In [12]:
# Sanity check: show the row count and the first rows of the raw input, then
# drop the frame again so it does not linger in the kernel namespace.
data = pd.read_csv(data_dir + dataname, encoding='utf-8')
print(len(data), data.head(), sep="\n")
del data

5
              name              parent
0    Shaurya Mehta         Samir Mehta
1  Meran Panigrahi     Manoj Panigrahi
2          Ashrith        Prahalad Rao
3  Pranay Saraswat  Sanjeevan Saraswat
4   Abhimanyu Khan          Arjun Khan


In [13]:
# Run configuration

# Column of the input CSV that holds the individual's name.
name = "name"
# Column that holds the name of an additional household member.
pname = "parent"

# True selects the concatenated-names models (name + household member name).
concat_model = False

# Prediction task; it is part of the model file names loaded further down.
# n_way = "multiclass"
n_way = "2class"

# Classifier family; also part of the model file names.
classifier = "svm"

In [14]:
def normalize(word):
  """Transliterate `word` with `unidecode` and return the result.

  Kept as a module-level function so it can be handed to `Pool.map`.
  """
  transliterated = unidecode(word)
  return transliterated

def _clean_name_series(names, processes=2):
  """Return a cleaned copy of the name column `names`.

  Steps: missing values become '', every value is transliterated with
  `normalize` in a worker pool, upper-cased, stripped of everything except
  A-Z, space, dot and hyphen, dots/hyphens are turned into spaces, runs of
  three or more identical letters collapse to one, and whitespace is squeezed
  and trimmed.

  Parameters
  ----------
  names : pandas.Series
      Raw name column.
  processes : int, default 2
      Number of worker processes used for the transliteration.

  Returns
  -------
  pandas.Series
      Cleaned names, aligned on the index of `names`.
  """
  # astype(str) keeps `normalize` from receiving non-string cells.
  filled = names.fillna('').astype(str)

  # The context manager releases the workers even when a task raises.
  with Pool(processes) as pool:
    normalized = pool.map(normalize, filled.tolist())

  # Every step is re-assigned instead of mutated in place: in-place calls on
  # a column selection do not reliably write back into the parent frame.
  cleaned = pd.Series(normalized, index=names.index, dtype=object).str.upper()
  cleaned = cleaned.replace(r"[^A-Z .\-]", " ", regex=True)      # keep letters, space, dot, hyphen
  cleaned = cleaned.replace(r"[.-]+", " ", regex=True)           # dots / hyphens act as separators
  cleaned = cleaned.replace(r"([A-Z])\1\1+", r"\1", regex=True)  # AAA... -> A
  cleaned = cleaned.replace(r"\s+", " ", regex=True)             # squeeze whitespace
  return cleaned.str.strip()


def clean_data():
  """Read the raw CSV, add cleaned name column(s) and write the result.

  Reads `data_dir + dataname`, adds `name_cleaned` (from the `name` column)
  and, when `concat_model` is set, `pname_cleaned` (from the `pname` column),
  then writes the frame to `output_dir + dataname`. Returns nothing.
  """
  data = pd.read_csv(data_dir+dataname, encoding='utf-8')

  data["name_cleaned"] = _clean_name_series(data[name])
  if concat_model:
    data["pname_cleaned"] = _clean_name_series(data[pname])

  data.to_csv(output_dir+dataname, encoding='utf-8', index=False)

def _wrap_name_parts(names):
  """Wrap every space-separated part of each name in braces.

  Missing values are treated as '' (an empty cleaned name is read back from
  CSV as NaN), so "A B" becomes "{A}{B}" and a missing name becomes "{}".
  """
  # Re-assigned rather than mutated in place: in-place calls on a column
  # selection do not reliably write back into the parent frame.
  parts = names.fillna('').astype(str).replace(" ", "}{", regex=True)
  return "{" + parts + "}"


def load_data():
  """Load the cleaned CSV and put `name_cleaned` into the model input format.

  Reads `output_dir + dataname`. `name_cleaned` is rewritten as "{PART}{PART}";
  when `concat_model` is set, `pname_cleaned` gets the same markup and
  `name_cleaned` becomes "#<name>#<pname>#".

  Returns
  -------
  pandas.DataFrame
      The loaded frame with the rewritten column(s).
  """
  data = pd.read_csv(output_dir+dataname, encoding='utf-8')
  data["name_cleaned"] = _wrap_name_parts(data["name_cleaned"])

  if concat_model:
    data["pname_cleaned"] = _wrap_name_parts(data["pname_cleaned"])
    data["name_cleaned"] = "#" + data["name_cleaned"] + "#" + data["pname_cleaned"] + "#"
  return data

# Write the cleaned CSV, then read it back in the model input format.
# NOTE: clean_data() has to run on every pass, not just once: the prediction
# cell further down writes its results to the same path (output_dir+dataname),
# replacing the cleaned file that load_data() reads.
clean_data()
data=load_data()

In [15]:
# SECURITY: pickle.load executes code embedded in the file; only load model
# files that come from a trusted source.

# The label mappings are only needed (and only loaded) for the multiclass task.
if n_way=="multiclass":
  with open(model_dir+"non_neural_label_encoding_multiclass.pkl", "rb") as f:
    (category_to_id,id_to_category)=pickle.load(f)

# Model files are named after the task, the classifier and the concat flag.
model_name = n_way+"_"+classifier+'_concat_'+str(concat_model)+'.sav'

# `with` closes each file handle as soon as the object is loaded.
with open(model_dir+"vectorizer_"+model_name,'rb') as f:
  vectorizer = pickle.load(f)
with open(model_dir+"model_"+model_name,'rb') as f:
  clf = pickle.load(f)

In [16]:
# Vectorize the prepared names and score them with the loaded classifier.
tfidf_matrix = vectorizer.transform(data.name_cleaned)
y_pred_prob = clf.decision_function(tfidf_matrix)
y_pred = clf.predict(tfidf_matrix)

# Input columns that are echoed into the output: the name and, for the
# concatenated model, the household member's name as well.
output_columns = {name: data[name]}
if concat_model:
  output_columns[pname] = data[pname]

if n_way == "2class":
  # Binary task: predicted label plus the raw decision-function value.
  output_columns["predicted_religion"] = pd.Series(y_pred)
  output_columns["muslim_score"] = pd.Series(y_pred_prob)
  df = pd.DataFrame(output_columns)
else:
  # Multiclass task: decode the label ids and append one score column per category.
  output_columns["predicted_religion"] = pd.Series(y_pred).map(id_to_category)
  df = pd.DataFrame(output_columns)
  # NOTE(review): assumes the key order of category_to_id matches the column
  # order of decision_function - confirm against the training code.
  df2 = pd.DataFrame(y_pred_prob, columns=list(category_to_id.keys()))
  df2.reset_index(inplace=True, drop=True)
  df.reset_index(inplace=True, drop=True)
  df = pd.concat([df, df2], axis=1)

# Written to the same path that clean_data() used for the cleaned CSV.
df.to_csv(output_dir+dataname, encoding='utf-8', index=False)