In [14]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [28]:
import pandas as pd
import numpy as np
import jellyfish
import dask
import dask.array as da
from dask import dataframe as df1
from unidecode import unidecode

import time

In [16]:
data = df1.read_csv('WorldStreams.csv')

In [17]:
data.head()

Unnamed: 0,geonameid,name,asciiname,latitude,longitude,timezone,country,continent,sub_region
0,2986043,Pic de Font Blanca,Pic de Font Blanca,42.64991,1.53335,Europe/Andorra,Andorra,Europe,Southern Europe
1,2994701,Roc Mélé,Roc Mele,42.58765,1.74028,Europe/Andorra,Andorra,Europe,Southern Europe
2,3007683,Pic des Langounelles,Pic des Langounelles,42.61203,1.47364,Europe/Andorra,Andorra,Europe,Southern Europe
3,3017832,Pic de les Abelletes,Pic de les Abelletes,42.52535,1.73343,Europe/Andorra,Andorra,Europe,Southern Europe
4,3017833,Estany de les Abelletes,Estany de les Abelletes,42.52915,1.73362,Europe/Andorra,Andorra,Europe,Southern Europe


In [18]:
data["continent"].apply(type, meta=('continent', 'object')).value_counts()

Dask Series Structure:
npartitions=1
    int64
      ...
Name: continent, dtype: int64
Dask Name: value-counts-agg, 5 graph layers

In [19]:
data['continent'] = data['continent'].astype(str)

In [20]:
data["continent"].apply(type, meta=('continent', 'object')).value_counts()

Dask Series Structure:
npartitions=1
    int64
      ...
Name: continent, dtype: int64
Dask Name: value-counts-agg, 8 graph layers

In [21]:
data["name_clean"] = data["name"].apply(unidecode, meta=('name', 'object'))

In [30]:
def windowKey(data:pd.DataFrame, cols, methods, substring_length=3, upper=True):
    
    """
    Función encargada de generar una window key para los datos utilizando las
    columnas y métodos especificados por el usuario.
    
    Argumentos:
        data (DataFrame) dataframe que contiene la base de datos.
        cols (lst) lista de columnas a utilizar para generar la window key.
        methods (lst) lista de métodos a utilizar. Para soundex el método es
            'sou' y para substring es 'sub'.
        substring_length (int) número entero que nos indica la longitud que
            tendrá el substring.
        upper (bool) valor booleano que nos indica si el substring será en
            minúscula o mayúscula.
        
    Regresa:
        una lista que contiene las window keys generadas para el dataset en
        cuestión.
        
    """

    func_dict = {i: [cols[i], methods[i]] for i in range(len(cols))}

    windowKey = []

    for i in range(len(func_dict)):
        
        if func_dict[i][1] == "sou":
            soundex = np.vectorize(jellyfish.soundex)
            windowKey.append(soundex(data[func_dict[i][0]].values))
        if func_dict[i][1] == "sub":
            if upper == True:
                substring = np.vectorize(lambda x: x[:substring_length].replace(' ', '').upper() if (len(x) == substring_length) else (x[:substring_length].replace(' ', '').upper() + (' ' * (substring_length - len(x)))))
                windowKey.append(substring(data[func_dict[i][0]].values))
            else:
                substring = np.vectorize(lambda x: x[:substring_length].replace(' ', '') if (len(x) == substring_length) else (x[:substring_length].replace(' ', '') + (' ' * (substring_length - len(x)))))
                windowKey.append(substring(data[func_dict[i][0]].values))

    return list(map(''.join, zip(*windowKey)))

In [31]:
#Aplicacion de la funcion y agregado de la nueva columna WindowKey
mylist = windowKey(data, ["country", "continent","name_clean"], ["sou", "sub", "sou"], 3)

chunks = data.map_partitions(lambda x: len(x)).compute().to_numpy()

myarray = da.from_array(mylist, chunks=tuple(chunks))

data['WindowKey'] = myarray

In [32]:
#ordenamiento
data_order = data.sort_values('WindowKey')  

In [33]:
data_order.head()

Unnamed: 0,geonameid,name,asciiname,latitude,longitude,timezone,country,continent,sub_region,name_clean,WindowKey
68317,1466558,‘Ayūbzī,`Ayubzi,31.52982,65.3707,Asia/Kabul,Afghanistan,Asia,Southern Asia,'Ayubzi,A125ASI'120
38536,1149672,‘Abbās,'Abbas,33.333,67.64328,Asia/Kabul,Afghanistan,Asia,Southern Asia,'Abbas,A125ASI'120
75489,1474164,‘Abbās Ghar,'Abbas Ghar,34.89766,70.96439,Asia/Kabul,Afghanistan,Asia,Southern Asia,'Abbas Ghar,A125ASI'122
46810,1433877,‘Abbās Khān Kalā,'Abbas Khan Kala,32.50398,69.20979,Asia/Kabul,Afghanistan,Asia,Southern Asia,'Abbas Khan Kala,A125ASI'122
46876,1433943,‘Abbās Khēl-e Junūbī,'Abbas Khel-e Junubi,32.67188,69.13467,Asia/Kabul,Afghanistan,Asia,Southern Asia,'Abbas Khel-e Junubi,A125ASI'122
