# Notebook Description  
In this notebook we load the raw files, combine them into one file, and then split it into three sub-samples: `training set`, `validation set` and `test set`.  
Before splitting, the combined data looks like the following.  

| Name | Nickname |
|------|----------|
|$Name^1$ | $Nickname^1_1$ |
|$Name^1$ | $Nickname^1_2$ |
|$Name^1$ | $Nickname^1_3$ |  
|$Name^2$ | $Nickname^2_1$ |
|... | ... |
|$Name^N$ | $Nickname^N_{N_k}$ |  

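Where $N$ is the number of distinct names and $N_k$ is the number of different nicknames associated with the $k$-th name, $Name^k$ (the last row shown corresponds to $k = N$).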

We split the data in two different ways. In the first scenario (the *mixed* samples), the three datasets may share the same names, but never the same name-nickname pair (to avoid data leakage). The motivation is that we will probably see the same names again in the future, together with new nicknames, and we want our model to generalize to such cases. In the second scenario (the *restricted* scenario), we constrain the datasets further by forbidding a name from appearing in more than one set. That is, the `training`, `validation` and `test` sets share neither names nor name-nickname pairs. The idea is that, by preventing the network from seeing the same names in both the `training` and `validation` sets, it cannot memorize name-specific rules and must instead learn the underlying logic of the joint distribution of letters in names and nicknames. Note that the cells below build the restricted split first and derive the mixed split from it.

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random

# Display option: render floats with three decimals in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Colab-specific setup: load the data_table extension and import the Drive helper.
# NOTE(review): this presumably only works inside a Google Colab runtime — confirm
# before running the notebook elsewhere.
%load_ext google.colab.data_table
from google.colab import data_table
from google.colab import drive

# Mount Drive, then switch the working directory to the project folder.
# "YOUR FOLDER HERE" is a placeholder and must be replaced before running:
# the two relative data paths below are resolved against the working directory.
drive.mount('/content/drive')
os.chdir("YOUR FOLDER HERE")
# Folder the raw CSV files are read from.
raw_data_folder = './data/raw/'
# Folder the train/validation/test CSV files are written to.
interim_data_folder = './data/interim/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Raw Data

## `nicknames.csv`

In [None]:
# Read the name/nickname pairs, using the file's first column as the index.
# NOTE(review): `nicknames` is not part of the merge performed further down in
# the cells visible here — confirm that leaving it out is intentional.
nicknames = pd.read_csv(f'{raw_data_folder}nicknames.csv', index_col=0)
nicknames.tail(10)

Unnamed: 0_level_0,name,nickname
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1425,Vic,Vicky
1426,Vicki,Vicky
1427,Vince,Vinnie
1428,Wally,Walt
1429,Will,Willie
1430,Bill,Willie
1431,Bill,Billy
1432,Will,Bill
1433,Nick,Nik
1434,Nik,Nick


## `name_to_nick.csv`

In [None]:
# skiprows=1 discards the file's first line, so the header is taken from the
# line that follows it.
name_to_nick = pd.read_csv(f'{raw_data_folder}name_to_nick.csv', skiprows=1)
name_to_nick.head(4)

Unnamed: 0,firstname,nicname1,nicname2,nicname3,nicname4,nicname5,nicname6,nicname7,nicname8,nicname9,nicname10,nicname11,nicname12,nicname13,nicname14,nicname15,nicname16
0,SOLOMON,SAL,SALMON,SAUL,SOL,SOLLY,ZOLLY,,,,,,,,,,
1,KRISTINE,CHRIS,CHRISTY,CRISSY,KRIS,KRISTY,TINA,,,,,,,,,,
2,CHRISTOPHER,CHRIS,KIT,,,,,,,,,,,,,,
3,PHILLIP,PHIL,PHIL,,,,,,,,,,,,,,


## Male and Female Diminutives Datasets

In [None]:
# Both diminutives files are read the same way: the first line is skipped and
# the next one supplies the column names.
male_diminutives = pd.read_csv(f'{raw_data_folder}male_diminutives.csv', skiprows=1)
female_diminutives = pd.read_csv(f'{raw_data_folder}female_diminutives.csv', skiprows=1)
male_diminutives.head(4)

Unnamed: 0,Name,col1,Nickname2,nickname1,Nickname2.1,nickname1.1,Nickname2.2,nickname1.2,Nickname2.3,nickname1.3,Nickname2.4,nickname1.4,Nickname2.5,nickname1.5
0,Abner,Ab,,,,,,,,,,,,
1,Abraham,Abe,Abey,Abie,,,,,,,,,,
2,Ace,Acey,Acie,,,,,,,,,,,
3,Alan,Al,Allie,,,,,,,,,,,


In [None]:
# Peek at the first four rows of the female diminutives table.
female_diminutives.head(4)

Unnamed: 0,Name,nickname1,Nickname2,nickname1.1,Nickname2.1,nickname1.2,Nickname2.2,nickname1.3,Nickname2.3,nickname1.4,Nickname2.4,nickname1.5,Nickname2.5,nickname1.6,Nickname2.6
0,Abigail,Abbey,Abbi,Abbie,Abby,Abi,Gail,Gayle,,,,,,,
1,Adelaide,Addie,,,,,,,,,,,,,
2,Adele,Addie,,,,,,,,,,,,,
3,Adeline,Addie,,,,,,,,,,,,,


# Convert Datasets to "Long" Format

In [None]:
def wide_to_long(wide):
    """Reshape a "wide" name table into one row per (name, nickname) pair.

    Parameters
    ----------
    wide : pandas.DataFrame
        Column 0 holds the name; each remaining column holds one optional
        nickname. Missing nicknames are expected to be NaN, which is what
        ``pd.read_csv`` produces for empty cells.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``name`` and ``nickname``, both lower-cased. Rows keep
        the row-major order of ``wide``: all nicknames of the first name,
        then all nicknames of the second name, and so on.
    """
    pairs = []
    rows = wide.itertuples(index=False, name=None)
    for row in tqdm(rows, total=len(wide)):
        # pd.isna is the explicit missing-value test; comparing str(cell) with
        # 'nan' only works because of how a float NaN happens to be printed.
        nicks = [cell for cell in row[1:] if not pd.isna(cell)]
        if nicks:
            # Lower-case the name only when it has at least one nickname, so a
            # row with no nicknames is skipped without touching its name cell.
            name = row[0].lower()
            pairs.extend((name, nick.lower()) for nick in nicks)
    return pd.DataFrame(pairs, columns=['name', 'nickname'])


# The three raw tables share the same layout, so one helper converts them all.
name_to_nick_long = wide_to_long(name_to_nick)
female_diminutives_long = wide_to_long(female_diminutives)
male_diminutives_long = wide_to_long(male_diminutives)
male_diminutives_long.head(10)


100%|██████████| 463/463 [00:00<00:00, 1365.88it/s]
100%|██████████| 258/258 [00:00<00:00, 1692.75it/s]
100%|██████████| 287/287 [00:00<00:00, 1927.24it/s]


Unnamed: 0,name,nickname
0,abner,ab
1,abraham,abe
2,abraham,abey
3,abraham,abie
4,ace,acey
5,ace,acie
6,alan,al
7,alan,allie
8,alban,alby
9,albert,al


# Merge Datasets

In [None]:
# Stack the three long-format tables into a single frame with a fresh index.
df = pd.concat(
    [name_to_nick_long, female_diminutives_long, male_diminutives_long],
    axis=0,
    ignore_index=True,
)
print(df.shape)  # size before de-duplication

# Remove exact duplicate (name, nickname) rows and renumber the index.
df = df.drop_duplicates().reset_index(drop=True)
print(df.shape)  # size after de-duplication
df.head(4)

(2384, 2)
(1996, 2)


Unnamed: 0,name,nickname
0,solomon,sal
1,solomon,salmon
2,solomon,saul
3,solomon,sol


# Train-Test split

## Restricted Scenario

In [None]:
# Restricted scenario: every name, together with all of its nicknames, is
# assigned to exactly one of the three sets (75% / 15% / 10% of the names).
SPLIT_SEED = 42  # fixed so that re-running the notebook reproduces the split

# Sort before shuffling: the iteration order of a set of strings can change
# between interpreter runs (hash randomisation), so seeding the shuffle alone
# would not make the split reproducible.
names = sorted(df['name'].unique())
random.Random(SPLIT_SEED).shuffle(names)

train_end = int(0.75 * len(names))
val_end = int(0.90 * len(names))
train_names = names[:train_end]
val_names = names[train_end:val_end]
test_names = names[val_end:]

print('Training names:', len(train_names))
print('Val names:', len(val_names))
print('Test names:', len(test_names))

# Select the rows belonging to each group of names.
train_df = df[df['name'].isin(train_names)]
val_df = df[df['name'].isin(val_names)]
test_df = df[df['name'].isin(test_names)]

# Sanity check: the three sets together must cover every row exactly once.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

print('Training observations (nicknames):', len(train_df))
print('Val observations (nicknames):', len(val_df))
print('Test observations (nicknames):', len(test_df))

Training names: 575
Val names: 115
Test names: 77
Training observations (nicknames): 1508
Val observations (nicknames): 286
Test observations (nicknames): 202


In [None]:
# Renumber each split from 0, discarding the row labels inherited from `df`.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Make sure the output folder exists before writing into it.
os.makedirs(interim_data_folder, exist_ok=True)

# The index is written as the first CSV column; the cell that reads these
# files back relies on that (it passes index_col=0).
train_df.to_csv(interim_data_folder + 'train_df.csv')
val_df.to_csv(interim_data_folder + 'validation_df.csv')
# interim_data_folder already ends with '/', so no extra separator is needed
# (the previous '/test_df.csv' produced a doubled slash in the path).
test_df.to_csv(interim_data_folder + 'test_df.csv')

## Mixed Samples

In [None]:
# Reload the restricted splits from disk (first CSV column is the saved index).
# keep_default_na=False stops pandas from turning valid lower-cased strings
# such as 'nan', 'na' or 'null' into missing values during the CSV round trip.
train_df = pd.read_csv(interim_data_folder + 'train_df.csv', index_col=0, keep_default_na=False)
val_df = pd.read_csv(interim_data_folder + 'validation_df.csv', index_col=0, keep_default_na=False)
test_df = pd.read_csv(interim_data_folder + 'test_df.csv', index_col=0, keep_default_na=False)

In [None]:
MIXED_SEED = 42  # fixed so that re-running the notebook reproduces the shuffle

# Re-assemble the full dataset from the three restricted splits.
df = pd.concat([train_df, val_df, test_df], axis=0, ignore_index=True)
print(df.shape)

# Shuffle all rows; the mixed split is cut from this order, so the shuffle is
# seeded to keep the resulting files stable between runs.
df = df.sample(frac=1, random_state=MIXED_SEED).reset_index(drop=True)
df.head(10)

(1996, 2)


Unnamed: 0,name,nickname
0,ricardo,rico
1,jeanette,janet
2,cathleen,lena
3,gerard,gerry
4,christiana,christy
5,stephen,steenie
6,maximilian,max
7,kenneth,kendrick
8,cornelia,cornie
9,gareth,gare


In [None]:
# Cut the shuffled frame into consecutive pieces with the same sizes as the
# restricted train / validation / test sets.
n_train = len(train_df)
n_val = len(val_df)
train_df_mixed = df.iloc[:n_train]
val_df_mixed = df.iloc[n_train:n_train + n_val]
test_df_mixed = df.iloc[n_train + n_val:]

# Show both sets of shapes so they can be compared line by line.
print('Original samples', train_df.shape, val_df.shape, test_df.shape, sep='\n')
print('Mixed samples', train_df_mixed.shape, val_df_mixed.shape, test_df_mixed.shape, sep='\n')


Original samples
(1508, 2)
(286, 2)
(202, 2)
Mixed samples
(1508, 2)
(286, 2)
(202, 2)


In [None]:
# Peek at the first five rows of the mixed training set.
train_df_mixed.head(5)

Unnamed: 0,name,nickname
0,ricardo,rico
1,jeanette,janet
2,cathleen,lena
3,gerard,gerry
4,christiana,christy


In [None]:
# Write the three mixed splits to the interim data folder (index included).
train_df_mixed.to_csv(f'{interim_data_folder}train_df_mixed.csv')
val_df_mixed.to_csv(f'{interim_data_folder}validation_df_mixed.csv')
test_df_mixed.to_csv(f'{interim_data_folder}test_df_mixed.csv')