# Notebook Description
This notebook takes the pre-split datasets (under the "restricted" or the "mixed" scenarios) and augments them with new name-nickname pairs, as suggested [here](https://www.youtube.com/watch?v=6e65XfwmIWE&t=3s&ab_channel=DynamicVisionandLearningGroup). Then, the texts are tokenized, and the `training`, `validation` and `test` datasets are saved in matrix format for later modeling.  
Lastly, the data is converted to audio using the Google TTS API, and then to spectrograms for later modeling.

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random

# keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Set options and load file
def _three_decimal_places(value):
  """Render a float with exactly three digits after the decimal point."""
  return '%.3f' % value

# pandas calls this formatter for every float it displays.
pd.set_option('display.float_format', _three_decimal_places)

%load_ext google.colab.data_table
from google.colab import data_table
from google.colab import drive

# Mount Google Drive at /content/drive so the project files are reachable.
drive.mount('/content/drive')
# Placeholder: replace with the project root. The relative data folders
# defined below are resolved against this working directory.
os.chdir("YOUR FOLDER HERE")
# Data folders used throughout the notebook (relative to the working directory).
raw_data_folder = './data/raw/'
interim_data_folder = './data/interim/'
processed_data_folder = './data/processed/'

def shuffle_nicknames(df, times = 4, random_state = None):
  """
  Build a labelled dataset of true and false (name, nickname) pairs.

  For every distinct name, each of its rows is copied `times` times with the
  nickname replaced by one drawn uniformly (with replacement) from rows that
  belong to *other* names. A name that appears N times among the positive
  examples therefore appears N * times among the negatives. A plain shuffle of
  the nickname column is not used because a name could get one of its own
  nicknames back, producing a "negative" that is really a positive; for the
  same reason, nicknames that the name itself owns are excluded from the
  candidates even when they are also listed under another name.

  Parameters
  ----------
  df : pd.DataFrame
      Must contain 'name' and 'nickname' columns. It is not modified.
  times : int
      Number of negative examples generated per positive example.
  random_state : int or None
      Seed for the sampling and the final shuffle. With None the global NumPy
      random state is used, so `np.random.seed(...)` still controls the result.

  Returns
  -------
  pd.DataFrame
      Negatives (y = 0) and positives (y = 1) in random row order, with the
      columns 'name', 'nickname', any other columns of `df`, and 'y'.

  Raises
  ------
  ValueError
      If negatives are requested for a name that has no nickname of another
      name to sample from.
  """
  # Progress bars are cosmetic; fall back to plain iteration without tqdm.
  try:
    from tqdm import tqdm
  except ImportError:
    def tqdm(iterable, **kwargs):
      return iterable

  rng = np.random if random_state is None else np.random.RandomState(random_state)

  # The rows of each name and its candidate nicknames do not change between
  # rounds, so work them out once instead of once per round.
  nickname_values = df['nickname'].values
  plans = []
  for name in df['name'].drop_duplicates().tolist():
    is_name = (df['name'] == name).values
    if not is_name.any():
      # e.g. a missing name: NaN never compares equal, so it owns no rows
      continue
    own_nicknames = df.loc[is_name, 'nickname']
    is_candidate = ~is_name & ~df['nickname'].isin(own_nicknames).values
    plans.append((name, df[is_name], nickname_values[is_candidate]))

  frames = []
  print('Shuffling nicknames..')
  for _ in tqdm(range(times)):
    for name, rows, candidates in plans:
      if len(candidates) == 0:
        raise ValueError(f"No nickname of another name is available to build negatives for {name!r}.")
      # randint's upper bound is exclusive: use len(candidates), not len - 1,
      # otherwise the last candidate can never be drawn.
      picks = rng.randint(0, len(candidates), len(rows))
      fake = rows.copy()  # explicit copy: never write into a slice of df
      fake['nickname'] = candidates[picks]
      fake['y'] = 0
      frames.append(fake)

  # add the positive cases as well (on a copy, so the caller's frame is untouched)
  positives = df.copy()
  positives['y'] = 1
  frames.append(positives)

  # Single concat instead of growing a frame inside the loop.
  results = pd.concat(frames, axis=0, ignore_index=True)
  leading = ['name', 'nickname']
  results = results[leading + [col for col in results.columns if col not in leading]]

  # Shuffle
  results = results.sample(frac=1, random_state=None if random_state is None else rng)

  return results


Mounted at /content/drive


# Restricted Scenario

In [None]:
# Read the three pre-split tables of the restricted scenario; the first CSV
# column is used as the index.
train_df, validation_df, test_df = (
    pd.read_csv(interim_data_folder + csv_name, index_col=0)
    for csv_name in ('train_df.csv', 'validation_df.csv', 'test_df.csv')
)

# Report how many (name, nickname) rows each split holds before augmentation.
for message, split_df in (
    ('Training observations (nicknames) before augmentation:', train_df),
    ('Val observations (nicknames) before augmentation:', validation_df),
    ('Test observations (nicknames) before augmentation:', test_df),
):
    print(message, len(split_df))

Training observations (nicknames) before augmentation: 1508
Val observations (nicknames) before augmentation: 286
Test observations (nicknames) before augmentation: 202


## Augmentation

In [None]:
# Fix the global NumPy seed so the sampled negatives and the row order are the
# same on every run. Later steps (the audio files named by row position) rely
# on the shuffled order being reproducible.
AUGMENTATION_SEED = 42
np.random.seed(AUGMENTATION_SEED)

# Each split is augmented on its own, so no name/nickname crosses splits.
train_df_shuffled = shuffle_nicknames(train_df)
validation_df_shuffled = shuffle_nicknames(validation_df)
test_df_shuffled = shuffle_nicknames(test_df)

# Class balance after augmentation (y = 1 true pair, y = 0 generated pair).
print('Training:')
print(train_df_shuffled['y'].value_counts())
print('Val:')
print(validation_df_shuffled['y'].value_counts())
print('Test:')
print(test_df_shuffled['y'].value_counts())

train_df_shuffled.iloc[:15]

Shuffling nicknames..


  0%|          | 0/4 [00:00<?, ?it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  8%|▊         | 48/575 [00:00<00:01, 472.31it/s][A
 17%|█▋        | 96/575 [00:00<00:01, 461.99it/s][A
 25%|██▍       | 143/575 [00:00<00:01, 367.80it/s][A
 32%|███▏      | 184/575 [00:00<00:01, 380.97it/s][A
 39%|███▉      | 224/575 [00:00<00:00, 368.60it/s][A
 46%|████▌     | 262/575 [00:00<00:00, 327.79it/s][A
 53%|█████▎    | 307/575 [00:00<00:00, 361.51it/s][A
 61%|██████▏   | 353/575 [00:00<00:00, 388.42it/s][A
 69%|██████▊   | 394/575 [00:01<00:00, 358.34it/s][A
 76%|███████▋  | 439/575 [00:01<00:00, 381.89it/s][A
 83%|████████▎ | 479/575 [00:01<00:00, 352.50it/s][A
 92%|█████████▏| 528/575 [00:01<00:00, 388.77it/s][A
100%|██████████| 575/575 [00:01<00:00, 375.67it/s]
 

Shuffling nicknames..


  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/115 [00:00<?, ?it/s][A
100%|██████████| 115/115 [00:00<00:00, 644.14it/s]
 25%|██▌       | 1/4 [00:00<00:00,  5.31it/s]
  0%|          | 0/115 [00:00<?, ?it/s][A
100%|██████████| 115/115 [00:00<00:00, 803.71it/s]
 50%|█████     | 2/4 [00:00<00:00,  5.96it/s]
  0%|          | 0/115 [00:00<?, ?it/s][A
100%|██████████| 115/115 [00:00<00:00, 786.28it/s]
 75%|███████▌  | 3/4 [00:00<00:00,  6.15it/s]
  0%|          | 0/115 [00:00<?, ?it/s][A
100%|██████████| 115/115 [00:00<00:00, 789.39it/s]
100%|██████████| 4/4 [00:00<00:00,  6.08it/s]


Shuffling nicknames..


  0%|          | 0/4 [00:00<?, ?it/s]
100%|██████████| 77/77 [00:00<00:00, 777.51it/s]
 25%|██▌       | 1/4 [00:00<00:00,  9.03it/s]
100%|██████████| 77/77 [00:00<00:00, 934.25it/s]

100%|██████████| 77/77 [00:00<00:00, 837.80it/s]
 75%|███████▌  | 3/4 [00:00<00:00, 10.00it/s]
  0%|          | 0/77 [00:00<?, ?it/s][A
100%|██████████| 77/77 [00:00<00:00, 653.56it/s]
100%|██████████| 4/4 [00:00<00:00,  9.24it/s]

Training:
0    6032
1    1508
Name: y, dtype: int64
Val:
0    1144
1     286
Name: y, dtype: int64
Test:
0    808
1    202
Name: y, dtype: int64





Unnamed: 0,name,nickname,y
5554,philinda,issy,0
2248,rebecca,court,0
4786,katherine,jody,0
6785,charles,chuck,1
2914,george,kit,0
821,lawrence,maxie,0
1019,theodore,benjie,0
5362,mary,sam,0
7254,stephanie,steve,1
2319,kathleen,dan,0


In [None]:
# save
# Write the augmented splits to the interim folder.
shuffled_frames = {
    'train_df_shuffled.csv': train_df_shuffled,
    'validation_df_shuffled.csv': validation_df_shuffled,
    'test_df_shuffled.csv': test_df_shuffled,
}
for csv_name, frame in shuffled_frames.items():
    frame.to_csv(interim_data_folder + csv_name)

# load
# Read the files straight back so the rest of the notebook works on exactly
# what is stored on disk.
train_df_shuffled, validation_df_shuffled, test_df_shuffled = (
    pd.read_csv(interim_data_folder + csv_name, index_col=0)
    for csv_name in shuffled_frames
)


## Tokenize

In [None]:
# Tokenizer
# Collect every distinct name and nickname across the three splits.
# Only real strings are kept: a non-string entry (e.g. NaN for a missing value)
# cannot be tokenized character by character. Dropping "the first element" of
# the set is not a reliable way to remove it, because set order is arbitrary.
# Sorting makes the fitting order, and therefore tie-breaking between equally
# frequent characters, independent of set iteration order.
names_and_nicknames = sorted(
    text
    for text in set(train_df_shuffled['name'].tolist() +
                    validation_df_shuffled['name'].tolist() +
                    test_df_shuffled['name'].tolist() +
                    train_df_shuffled['nickname'].tolist() +
                    validation_df_shuffled['nickname'].tolist() +
                    test_df_shuffled['nickname'].tolist())
    if isinstance(text, str)
)

# Character-level tokenizer with an explicit out-of-vocabulary token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(names_and_nicknames)
print(f"There are {len(tk.word_index)} unique tokens in Train+Val+Test sets.")
tk.word_index

There are 29 unique tokens in Train+Val+Test sets.


{'UNK': 1,
 'e': 2,
 'a': 3,
 'i': 4,
 'n': 5,
 'l': 6,
 'r': 7,
 's': 8,
 't': 9,
 'o': 10,
 'd': 11,
 'y': 12,
 'c': 13,
 'h': 14,
 'm': 15,
 'b': 16,
 'u': 17,
 'g': 18,
 'k': 19,
 'j': 20,
 'p': 21,
 'f': 22,
 'v': 23,
 'z': 24,
 'w': 25,
 'x': 26,
 'q': 27,
 '.': 28,
 'ó': 29}

In [None]:
# Character count of every distinct name/nickname; the maximum is a sanity
# check for the fixed padding length used when building the matrices.
L = list(map(len, names_and_nicknames))
longest = max(L)
print(f'Longest name has {longest} caracters.')

Longest name has 13 caracters.


In [None]:
# construct a new vocabulary 
# --------------------
# Characters get consecutive ids starting at 1, in the order the fitted
# tokenizer lists them; the OOV token gets the id after the last character.

# alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
alphabet=''.join(list(tk.word_index.keys())).replace('UNK','')
print('Alphabet')
print(alphabet)
print('')

char_dict = {char: position for position, char in enumerate(alphabet, start=1)}

# Use char_dict to replace the tk.word_index
tk.word_index = char_dict 
# Add 'UNK' to the vocabulary 
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1


# Extract Targets
# -------------------
train_targets = train_df_shuffled['y'].values
validation_targets = validation_df_shuffled['y'].values
test_targets = test_df_shuffled['y'].values


# convert names and nicknames into sequences
# -------------------
# The nickname sequences must be built from the 'nickname' column. They used to
# be built from 'name', which made every nickname matrix a copy of the name
# matrix. Nicknames go through str() so a non-string entry (e.g. NaN) does not
# break character-level tokenization.

train_sequences_names = tk.texts_to_sequences(train_df_shuffled['name'].tolist())
train_sequences_nicknames = tk.texts_to_sequences([str(x) for x in train_df_shuffled['nickname'].tolist()])
validation_sequences_names = tk.texts_to_sequences(validation_df_shuffled['name'].tolist())
validation_sequences_nicknames = tk.texts_to_sequences([str(x) for x in validation_df_shuffled['nickname'].tolist()])
test_sequences_names = tk.texts_to_sequences(test_df_shuffled['name'].tolist())
test_sequences_nicknames = tk.texts_to_sequences([str(x) for x in test_df_shuffled['nickname'].tolist()])

# Padding to 15 (zeros appended at the end of each sequence).
# pad_sequences already returns a NumPy array, so no further conversion is needed.
MAX_SEQUENCE_LENGTH = 15
train_data_names = pad_sequences(train_sequences_names, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
train_data_nicknames = pad_sequences(train_sequences_nicknames, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
validation_data_names = pad_sequences(validation_sequences_names, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
validation_data_nicknames = pad_sequences(validation_sequences_nicknames, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_data_names = pad_sequences(test_sequences_names, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_data_nicknames = pad_sequences(test_sequences_nicknames, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Spot check: one raw name next to its encoded row.
print(train_df_shuffled['name'].tolist()[1])
print(train_data_names[1])


Alphabet
eainlrstodychmbugkjpfvzwxq.ó

rebecca
[ 6  1 15  1 12 12  2  0  0  0  0  0  0  0  0]


**The above `np.array` holds, for each letter in the original string, the index of the appropriate token in `alphabet`.**

In [None]:
# save data
# ----------------
# Every matrix / target vector goes to the processed folder as integer text
# (fmt='%d'), under the file name given as the key.
restricted_outputs = {
    'train_data_names.txt': train_data_names,
    'train_data_nicknames.txt': train_data_nicknames,
    'validation_data_names.txt': validation_data_names,
    'validation_data_nicknames.txt': validation_data_nicknames,
    'test_data_names.txt': test_data_names,
    'test_data_nicknames.txt': test_data_nicknames,
    'train_targets.txt': train_targets,
    'validation_targets.txt': validation_targets,
    'test_targets.txt': test_targets,
}
for file_name, values in restricted_outputs.items():
    np.savetxt(processed_data_folder + file_name, values, fmt='%d')

# Mixed Scenario

In [None]:
# Read the pre-split tables of the mixed scenario (first CSV column is the index).
mixed_csv = {split: interim_data_folder + f'{split}_df_mixed.csv'
             for split in ('train', 'validation', 'test')}
train_df = pd.read_csv(mixed_csv['train'], index_col=0)
validation_df = pd.read_csv(mixed_csv['validation'], index_col=0)
test_df = pd.read_csv(mixed_csv['test'], index_col=0)

# Row counts per split before any negative examples are added.
n_train, n_validation, n_test = len(train_df), len(validation_df), len(test_df)
print('Training observations (nicknames) before augmentation:', n_train)
print('Val observations (nicknames) before augmentation:', n_validation)
print('Test observations (nicknames) before augmentation:', n_test)

Training observations (nicknames) before augmentation: 1508
Val observations (nicknames) before augmentation: 286
Test observations (nicknames) before augmentation: 202


## Augmentation

In [None]:
# Fix the global NumPy seed so the sampled negatives and the row order are the
# same on every run. Later steps (the audio files named by row position) rely
# on the shuffled order being reproducible.
AUGMENTATION_SEED = 42
np.random.seed(AUGMENTATION_SEED)

# Each split is augmented on its own, so no name/nickname crosses splits.
train_df_shuffled = shuffle_nicknames(train_df)
validation_df_shuffled = shuffle_nicknames(validation_df)
test_df_shuffled = shuffle_nicknames(test_df)

# Class balance after augmentation (y = 1 true pair, y = 0 generated pair).
print('Training:')
print(train_df_shuffled['y'].value_counts())
print('Val:')
print(validation_df_shuffled['y'].value_counts())
print('Test:')
print(test_df_shuffled['y'].value_counts())

train_df_shuffled.iloc[:15]

Shuffling nicknames..


  0%|          | 0/4 [00:00<?, ?it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

 13%|█▎        | 85/675 [00:00<00:00, 848.35it/s][A
 25%|██▌       | 170/675 [00:00<00:00, 767.36it/s][A
 37%|███▋      | 248/675 [00:00<00:00, 755.13it/s][A
 48%|████▊     | 324/675 [00:00<00:00, 735.08it/s][A
 60%|█████▉    | 402/675 [00:00<00:00, 750.17it/s][A
 71%|███████   | 480/675 [00:00<00:00, 759.53it/s][A
 83%|████████▎ | 557/675 [00:00<00:00, 746.50it/s][A
100%|██████████| 675/675 [00:00<00:00, 751.48it/s]
 25%|██▌       | 1/4 [00:00<00:02,  1.10it/s]
  0%|          | 0/675 [00:00<?, ?it/s][A
 12%|█▏        | 81/675 [00:00<00:00, 803.76it/s][A
 24%|██▍       | 162/675 [00:00<00:00, 765.84it/s][A
 35%|███▌      | 239/675 [00:00<00:00, 738.06it/s][A
 46%|████▋     | 313

Shuffling nicknames..


  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/222 [00:00<?, ?it/s][A
 35%|███▍      | 77/222 [00:00<00:00, 760.22it/s][A
100%|██████████| 222/222 [00:00<00:00, 701.75it/s]
 25%|██▌       | 1/4 [00:00<00:00,  3.10it/s]
  0%|          | 0/222 [00:00<?, ?it/s][A
 39%|███▊      | 86/222 [00:00<00:00, 850.47it/s][A
100%|██████████| 222/222 [00:00<00:00, 797.89it/s]
 50%|█████     | 2/4 [00:00<00:00,  3.31it/s]
  0%|          | 0/222 [00:00<?, ?it/s][A
 41%|████▏     | 92/222 [00:00<00:00, 916.90it/s][A
100%|██████████| 222/222 [00:00<00:00, 792.13it/s]
 75%|███████▌  | 3/4 [00:00<00:00,  3.33it/s]
  0%|          | 0/222 [00:00<?, ?it/s][A
 38%|███▊      | 85/222 [00:00<00:00, 845.23it/s][A
100%|██████████| 222/222 [00:00<00:00, 824.16it/s]
100%|██████████| 4/4 [00:01<00:00,  3.36it/s]


Shuffling nicknames..


  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/163 [00:00<?, ?it/s][A
100%|██████████| 163/163 [00:00<00:00, 875.21it/s]
 25%|██▌       | 1/4 [00:00<00:00,  5.09it/s]
  0%|          | 0/163 [00:00<?, ?it/s][A
100%|██████████| 163/163 [00:00<00:00, 856.83it/s]
 50%|█████     | 2/4 [00:00<00:00,  5.01it/s]
  0%|          | 0/163 [00:00<?, ?it/s][A
100%|██████████| 163/163 [00:00<00:00, 865.41it/s]
 75%|███████▌  | 3/4 [00:00<00:00,  5.02it/s]
  0%|          | 0/163 [00:00<?, ?it/s][A
100%|██████████| 163/163 [00:00<00:00, 739.26it/s]
100%|██████████| 4/4 [00:00<00:00,  4.79it/s]

Training:
0    6032
1    1508
Name: y, dtype: int64
Val:
0    1144
1     286
Name: y, dtype: int64
Test:
0    808
1    202
Name: y, dtype: int64





Unnamed: 0,name,nickname,y
4918,theodore,cy,0
4225,flora,belle,0
6871,isabella,bel,1
1136,jo,ara,0
1754,clement,jessie,0
464,vanessa,eddy,0
3391,samantha,connie,0
126,edwin,ginny,0
6576,ferdinand,nandy,1
7284,edwin,ned,1


In [None]:
# save
# Write the augmented mixed-scenario splits to the interim folder.
mixed_shuffled_frames = {
    'train_df_mixed_shuffled.csv': train_df_shuffled,
    'validation_df_mixed_shuffled.csv': validation_df_shuffled,
    'test_df_mixed_shuffled.csv': test_df_shuffled,
}
for csv_name, frame in mixed_shuffled_frames.items():
    frame.to_csv(interim_data_folder + csv_name)

# load
# Read the files straight back so the rest of the notebook works on exactly
# what is stored on disk.
train_df_shuffled, validation_df_shuffled, test_df_shuffled = (
    pd.read_csv(interim_data_folder + csv_name, index_col=0)
    for csv_name in mixed_shuffled_frames
)


## Tokenize

In [None]:
# Tokenizer
# Collect every distinct name and nickname across the three splits.
# Only real strings are kept: a non-string entry (e.g. NaN for a missing value)
# cannot be tokenized character by character. Dropping "the first element" of
# the set is not a reliable way to remove it, because set order is arbitrary.
# Sorting makes the fitting order, and therefore tie-breaking between equally
# frequent characters, independent of set iteration order.
names_and_nicknames = sorted(
    text
    for text in set(train_df_shuffled['name'].tolist() +
                    validation_df_shuffled['name'].tolist() +
                    test_df_shuffled['name'].tolist() +
                    train_df_shuffled['nickname'].tolist() +
                    validation_df_shuffled['nickname'].tolist() +
                    test_df_shuffled['nickname'].tolist())
    if isinstance(text, str)
)

# Character-level tokenizer with an explicit out-of-vocabulary token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(names_and_nicknames)
print(f"There are {len(tk.word_index)} unique tokens in Train+Val+Test sets.")
tk.word_index

There are 29 unique tokens in Train+Val+Test sets.


{'UNK': 1,
 'e': 2,
 'a': 3,
 'i': 4,
 'n': 5,
 'l': 6,
 'r': 7,
 's': 8,
 'o': 9,
 't': 10,
 'd': 11,
 'y': 12,
 'c': 13,
 'h': 14,
 'm': 15,
 'b': 16,
 'u': 17,
 'g': 18,
 'k': 19,
 'j': 20,
 'p': 21,
 'f': 22,
 'v': 23,
 'z': 24,
 'w': 25,
 'x': 26,
 'q': 27,
 '.': 28,
 'ó': 29}

In [None]:
# Character count of every distinct name/nickname; the maximum is a sanity
# check for the fixed padding length used when building the matrices.
L = list(map(len, names_and_nicknames))
longest = max(L)
print(f'Longest name has {longest} caracters.')

Longest name has 13 caracters.


In [None]:
# construct a new vocabulary 
# --------------------
# Characters get consecutive ids starting at 1, in the order the fitted
# tokenizer lists them; the OOV token gets the id after the last character.

# alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
alphabet=''.join(list(tk.word_index.keys())).replace('UNK','')
print('Alphabet')
print(alphabet)
print('')

char_dict = {char: position for position, char in enumerate(alphabet, start=1)}

# Use char_dict to replace the tk.word_index
tk.word_index = char_dict 
# Add 'UNK' to the vocabulary 
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1


# Extract Targets
# -------------------
train_targets = train_df_shuffled['y'].values
validation_targets = validation_df_shuffled['y'].values
test_targets = test_df_shuffled['y'].values


# convert names and nicknames into sequences
# -------------------
# Nicknames of *all* splits go through str(), not only the training ones, so a
# non-string entry (e.g. NaN) in validation/test cannot break character-level
# tokenization either.

train_sequences_names = tk.texts_to_sequences(train_df_shuffled['name'].tolist())
train_sequences_nicknames = tk.texts_to_sequences([str(x) for x in train_df_shuffled['nickname'].tolist()])
validation_sequences_names = tk.texts_to_sequences(validation_df_shuffled['name'].tolist())
validation_sequences_nicknames = tk.texts_to_sequences([str(x) for x in validation_df_shuffled['nickname'].tolist()])
test_sequences_names = tk.texts_to_sequences(test_df_shuffled['name'].tolist())
test_sequences_nicknames = tk.texts_to_sequences([str(x) for x in test_df_shuffled['nickname'].tolist()])

# Padding to 15 (zeros appended at the end of each sequence).
# pad_sequences already returns a NumPy array, so no further conversion is needed.
MAX_SEQUENCE_LENGTH = 15
train_data_names = pad_sequences(train_sequences_names, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
train_data_nicknames = pad_sequences(train_sequences_nicknames, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
validation_data_names = pad_sequences(validation_sequences_names, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
validation_data_nicknames = pad_sequences(validation_sequences_nicknames, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_data_names = pad_sequences(test_sequences_names, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_data_nicknames = pad_sequences(test_sequences_nicknames, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Spot check: one raw name next to its encoded row.
print(train_df_shuffled['name'].tolist()[1])
print(train_data_names[1])


Alphabet
eainlrsotdychmbugkjpfvzwxq.ó

flora
[21  5  8  6  2  0  0  0  0  0  0  0  0  0  0]


In [None]:
# Eyeball check: the first five raw names/nicknames, then the first five
# encoded rows of each matrix, printed in the same order.
preview = slice(None, 5)
for names_view, nicknames_view in (
    (train_df_shuffled['name'].iloc[preview], train_df_shuffled['nickname'].iloc[preview]),
    (train_data_names[preview], train_data_nicknames[preview]),
):
    print(names_view, '\n')
    print(nicknames_view, '\n\n')

4918    theodore
4225       flora
6871    isabella
1136          jo
1754     clement
Name: name, dtype: object 

4918        cy
4225     belle
6871       bel
1136       ara
1754    jessie
Name: nickname, dtype: object 


[[ 9 13  1  8 10  8  6  1  0  0  0  0  0  0  0]
 [21  5  8  6  2  0  0  0  0  0  0  0  0  0  0]
 [ 3  7  2 15  1  5  5  2  0  0  0  0  0  0  0]
 [19  8  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [12  5  1 14  1  4  9  0  0  0  0  0  0  0  0]] 

[[12 11  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [15  1  5  5  1  0  0  0  0  0  0  0  0  0  0]
 [15  1  5  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  6  2  0  0  0  0  0  0  0  0  0  0  0  0]
 [19  1  7  7  3  1  0  0  0  0  0  0  0  0  0]] 




**The above `np.array` holds, for each letter in the original string, the index of the appropriate token in `alphabet`.**

In [None]:
# save data
# ----------------
# Every matrix / target vector goes to the processed folder as integer text
# (fmt='%d'), under the file name given as the key.
mixed_outputs = {
    'train_mixed_data_names.txt': train_data_names,
    'train_mixed_data_nicknames.txt': train_data_nicknames,
    'validation_mixed_data_names.txt': validation_data_names,
    'validation_mixed_data_nicknames.txt': validation_data_nicknames,
    'test_mixed_data_names.txt': test_data_names,
    'test_mixed_data_nicknames.txt': test_data_nicknames,
    'train_mixed_targets.txt': train_targets,
    'validation_mixed_targets.txt': validation_targets,
    'test_mixed_targets.txt': test_targets,
}
for file_name, values in mixed_outputs.items():
    np.savetxt(processed_data_folder + file_name, values, fmt='%d')

# Convert to Sound
The part below takes a long time to complete, so it was run once and then commented out. The resulting files are in the appropriate folders.

**NOTE TO SELF - IT EITHER HAS TO BE RUN AGAIN, OR THE ORIGINAL SPLITS' FILES SHOULD BE THE ONES IN THE FOLDERS.**

In [None]:
!pip install gtts

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gtts
  Downloading gTTS-2.2.4-py3-none-any.whl (26 kB)
Installing collected packages: gtts
Successfully installed gtts-2.2.4


In [None]:
# load
# Re-read the augmented splits from the interim folder (first CSV column is the index).
train_df_shuffled, validation_df_shuffled, test_df_shuffled = (
    pd.read_csv(interim_data_folder + f'{split}_df_shuffled.csv', index_col=0)
    for split in ('train', 'validation', 'test')
)


In [None]:
from gtts import gTTS

# Everything below this import is commented out: it is the text-to-speech job
# that writes one `name_{j}.mp3` / `nickname_{j}.mp3` pair per row, where `j`
# is the row position in the shuffled dataframe. The audio files are therefore
# only valid for the exact row order of the CSVs they were generated from.
# NOTE(review): the dataframes in scope here are read from `*_df_shuffled.csv`,
# while the output folder is `mixed_datasets_audio/` - confirm which scenario
# the audio is meant to correspond to.
# NOTE(review): the non-zero `start` offsets presumably resume an interrupted
# run; reset them to 0 for a full regeneration.

# Prepare repositories
# ---------------------
# os.mkdir(interim_data_folder + '/mixed_datasets_audio/')
# os.mkdir(interim_data_folder + '/mixed_datasets_audio/train/')
# os.mkdir(interim_data_folder + '/mixed_datasets_audio/validation/')
# os.mkdir(interim_data_folder + '/mixed_datasets_audio/test')

# # Training
# # -------------------
# train_names = train_df_shuffled['name'].tolist()
# train_nicknames = train_df_shuffled['nickname'].tolist()

# start = 1116+1038 + 1085+1044 + 1040 + 1037 + 1014

# # Train
# # -------------------

# for j in tqdm(range(start,len(train_df_shuffled))):
#   name_temp = train_names[j]
#   nickname_temp = train_nicknames[j]
#   if str(nickname_temp)=='nan':
#     nickname_temp = train_nicknames[j-1]
#     print('NA nickname detected at ', j)
  
#   name_job = gTTS(text=name_temp, lang='en', slow=False)
#   name_job.save(interim_data_folder + f"mixed_datasets_audio/train/name_{j}.mp3")

#   nickname_job = gTTS(text=nickname_temp, lang='en', slow=False)
#   nickname_job.save(interim_data_folder + f"mixed_datasets_audio/train/nickname_{j}.mp3")
 
# # Validation
# # -------------------
# validation_names = validation_df_shuffled['name'].tolist()
# validation_nicknames = validation_df_shuffled['nickname'].tolist()

# start=874

# for j in tqdm(range(start, len(validation_df_shuffled))):
#   name_temp = validation_names[j]
#   nickname_temp = validation_nicknames[j]
  
#   if str(nickname_temp)=='nan':
#     nickname_temp = validation_nicknames[j-1]
#     print('NA nickname detected at ', j)
  
#   name_job = gTTS(text=name_temp, lang='en', slow=False)
#   name_job.save(interim_data_folder + f"mixed_datasets_audio/validation/name_{j}.mp3")

#   nickname_job = gTTS(text=nickname_temp, lang='en', slow=False)
#   nickname_job.save(interim_data_folder + f"mixed_datasets_audio/validation/nickname_{j}.mp3")

# # Test
# # -------------------
# test_names = test_df_shuffled['name'].tolist()
# test_nicknames = test_df_shuffled['nickname'].tolist()

# start=0

# for j in tqdm(range(start, len(test_df_shuffled))):
#   name_temp = test_names[j]
#   nickname_temp = test_nicknames[j]
  
#   if str(nickname_temp)=='nan':
#     nickname_temp = test_nicknames[j-1]
#     print('NA nickname detected at ', j)

#   name_job = gTTS(text=name_temp, lang='en', slow=False)
#   name_job.save(interim_data_folder + f"mixed_datasets_audio/test/name_{j}.mp3")

#   nickname_job = gTTS(text=nickname_temp, lang='en', slow=False)
#   nickname_job.save(interim_data_folder + f"mixed_datasets_audio/test/nickname_{j}.mp3")
 


100%|██████████| 1010/1010 [04:29<00:00,  3.75it/s]


# Convert Audio Files into Matrices

In [None]:
import librosa
import warnings
warnings.filterwarnings("ignore")

def load_audio(path, length=80):
    """
    Load an audio file and return its mel spectrogram with a fixed width.

    The spectrogram is zero-padded on the right up to `length` columns. If it
    is wider than `length`, the extra columns are dropped instead of raising
    an error, so every example has the same shape and can be stacked.

    Parameters
    ----------
    path : str
        Location of the audio file, passed to `librosa.load`.
    length : int
        Number of columns of the returned matrix.

    Returns
    -------
    np.ndarray
        Float matrix of shape (rows of the mel spectrogram, length).
    """
    import librosa

    # load
    values, sampling_rate = librosa.load(path)

    # spectrogram
    mat = librosa.feature.melspectrogram(y=values, sr=sampling_rate)

    # pad / truncate: copy what fits into an all-zero matrix of the target width
    n_rows, n_cols = mat.shape
    kept = min(n_cols, length)
    padded_mat = np.zeros((n_rows, length))
    padded_mat[:, :kept] = mat[:, :kept]

    return padded_mat


In [None]:
# One list of spectrograms per split and per column; names and nicknames are
# filled in parallel so row j of both arrays belongs to the same example.
train_names_mat, train_nicknames_mat = [], []
validation_names_mat, validation_nicknames_mat = [], []
test_names_mat, test_nicknames_mat = [], []

# Audio files are named by row position inside the folder of their split.
for split_dir, split_df, names_acc, nicknames_acc in (
    ("train", train_df_shuffled, train_names_mat, train_nicknames_mat),
    ("validation", validation_df_shuffled, validation_names_mat, validation_nicknames_mat),
    ("test", test_df_shuffled, test_names_mat, test_nicknames_mat),
):
  for j in tqdm(range(len(split_df))):
    names_acc.append(load_audio(interim_data_folder + f"mixed_datasets_audio/{split_dir}/name_{j}.mp3"))
    nicknames_acc.append(load_audio(interim_data_folder + f"mixed_datasets_audio/{split_dir}/nickname_{j}.mp3"))

# Stack each list into a single float32 array.
train_names_mat = np.asarray(train_names_mat, dtype=np.float32)
train_nicknames_mat = np.asarray(train_nicknames_mat, dtype=np.float32)
validation_names_mat = np.asarray(validation_names_mat, dtype=np.float32)
validation_nicknames_mat = np.asarray(validation_nicknames_mat, dtype=np.float32)
test_names_mat = np.asarray(test_names_mat, dtype=np.float32)
test_nicknames_mat = np.asarray(test_nicknames_mat, dtype=np.float32)

stacked_spectrograms = (
    ('train_names_mat', train_names_mat),
    ('train_nicknames_mat', train_nicknames_mat),
    ('validation_names_mat', validation_names_mat),
    ('validation_nicknames_mat', validation_nicknames_mat),
    ('test_names_mat', test_names_mat),
    ('test_nicknames_mat', test_nicknames_mat),
)

# Report all shapes first, then write every array to the processed folder.
for _, stacked in stacked_spectrograms:
  print(stacked.shape)

for array_name, stacked in stacked_spectrograms:
  np.save(processed_data_folder + array_name + '.npy', stacked)




100%|██████████| 7540/7540 [1:17:45<00:00,  1.62it/s]
100%|██████████| 1430/1430 [15:24<00:00,  1.55it/s]
100%|██████████| 1010/1010 [06:23<00:00,  2.64it/s]


(7540, 128, 80)
(7540, 128, 80)
(1430, 128, 80)
(1430, 128, 80)
(1010, 128, 80)
(1010, 128, 80)
