# Pandas tutorial

## Variables

In [1]:
# Threshold in percent: an address whose similarity() score against an
# already accepted address exceeds this value is grouped with that address.
similarity_percentage = 20

## Functions & Imports

In [2]:
import re
import pandas as pd
import os

def move_head(arr, index: int):
    """Move the element at ``index`` to the front of ``arr`` in place.

    The list is mutated directly: rebinding the parameter name inside the
    function would leave the caller's list untouched.

    Args:
        arr: Mutable list to reorder.
        index: Position of the element that becomes the new first element.

    Returns:
        None. ``arr`` itself is modified.

    Raises:
        IndexError: If ``index`` is out of range for ``arr``.
    """
    arr.insert(0, arr.pop(index))

def data_filter(name: str, address: str):
    """Normalise a name and an address into a comma-separated token string.

    Both inputs are split on commas. Each piece is lower-cased, every
    character that is not an ASCII letter or whitespace is replaced by a
    space (this includes digits, punctuation and non-ASCII letters such as
    "ł"), runs of whitespace are collapsed and the piece is trimmed. Pieces
    of two characters or fewer are dropped.

    Args:
        name: Raw name text; commas separate its pieces.
        address: Raw address text; commas separate its pieces.

    Returns:
        The surviving pieces joined with ",", or an empty string when no
        piece survives.
    """
    filtered_data = []
    for word in name.split(",") + address.split(","):
        cleaned_word = re.sub(r'[^a-zA-Z\s]', ' ', word.strip().lower())
        # Trim before measuring: replaced characters leave padding spaces
        # that would otherwise let one- or two-letter pieces pass the
        # length check.
        cleaned_word = re.sub(r'\s+', ' ', cleaned_word).strip()
        if len(cleaned_word) > 2:
            filtered_data.append(cleaned_word)
    return ",".join(filtered_data)
            
def similarity(data1: str, data2: str):
    """Score how alike two comma-separated token strings are, in percent.

    Every token of ``data1`` is compared with every token of ``data2``; a
    pair counts as a match when either token is a substring of the other.
    The number of matching pairs is divided by the larger token count and
    multiplied by 100. Because all pairs are counted, the score can exceed
    100 when one token matches several tokens on the other side.

    Empty tokens are ignored: an empty string is a substring of every
    string, so it would otherwise match everything. If either side has no
    tokens left, the score is 0.0.

    Args:
        data1: Comma-separated tokens of the first record.
        data2: Comma-separated tokens of the second record.

    Returns:
        The similarity score as a float.
    """
    arr1 = [token for token in data1.split(",") if token]
    arr2 = [token for token in data2.split(",") if token]
    if not arr1 or not arr2:
        return 0.0
    counter = sum(
        1
        for str1 in arr1
        for str2 in arr2
        if str1 in str2 or str2 in str1
    )
    return counter / max(len(arr1), len(arr2)) * 100

## Prepare addresses

In [6]:
%%time


# Load ADDRESSES.csv from every leaf directory (one without sub-directories)
# found under ./data/.
address_list = []

for dirpath, dirnames, filenames in os.walk("./data/"):
    if not dirnames:
        data = pd.read_table(os.path.join(dirpath, "ADDRESSES.csv"), sep=',')
        address_list.append(data)

# Stack all tables and derive the normalised "full_address" token string
# from the "name" and "address" columns of each row.
address_df: pd.DataFrame = pd.concat(address_list, ignore_index=True)
address_df["full_address"] = address_df.apply(
    lambda row: data_filter(str(row["name"]), str(row["address"])),
    axis=1,
)

# Each entry is (full_address, index of the first row seen, indices of later
# rows that were judged similar to it).
good_address_list = []

for row_label, record in address_df.iterrows():
    candidate = record["full_address"]
    for position, group in enumerate(good_address_list):
        if similarity(group[0], candidate) > similarity_percentage:
            # Similar to an already accepted address: attach the row to that
            # group and stop searching.
            group[2].append(row_label)
            move_head(good_address_list, position)
            break
    else:
        # No accepted address was similar enough, so this row starts a group.
        good_address_list.append((candidate, row_label, []))

print(f"{len(good_address_list)} good addresses found")

197 good addresses found
CPU times: user 14.1 s, sys: 48.2 ms, total: 14.2 s
Wall time: 14.2 s


## Update ADDRESSES_PEOPLE relation 

In [None]:
import os
import pandas as pd

# For every leaf directory under ./data/, join the five CSV exports into one
# wide table; the per-directory tables are stacked afterwards.
dfs = []

for dirpath, dirnames, filenames in os.walk("./data/"):
    if dirnames:
        # Only directories without sub-directories are read.
        continue

    # Each entity table exposes its key as "temp_id"; rename it to the column
    # name used for the joins below.
    addresses = pd.read_table(
        os.path.join(dirpath, "ADDRESSES.csv"), sep=','
    ).rename(columns={"temp_id": "address_uuid"})
    addresses_people = pd.read_table(
        os.path.join(dirpath, "ADDRESSES_PEOPLE.csv"), sep=','
    )
    people = pd.read_table(
        os.path.join(dirpath, "PEOPLE.csv"), sep=","
    ).rename(columns={"temp_id": "person_uuid"})
    people_publications = pd.read_table(
        os.path.join(dirpath, "PEOPLE_PUBLICATIONS.csv"), sep=","
    )
    publications = pd.read_table(
        os.path.join(dirpath, "PUBLICATIONS.csv"), sep=","
    ).rename(columns={"temp_id": "publication_uuid"})

    # addresses -> addresses_people -> people -> people_publications -> publications
    data = (
        addresses
        .merge(addresses_people, on='address_uuid', how="inner")
        .merge(people, on="person_uuid", how="inner")
        .merge(people_publications, on="person_uuid", how="inner")
        .merge(publications, on="publication_uuid", how="inner")
    )

    dfs.append(data)

df = pd.concat(dfs, ignore_index=True)

df.head()

In [None]:
# Composite identifier: the three uuid columns rendered as text, joined by "_".
df['id'] = (
    df['address_uuid'].astype(str)
    + "_" + df['person_uuid'].astype(str)
    + "_" + df['publication_uuid'].astype(str)
)
# Put "id" first, keeping the other columns in their existing order.
df = df[['id', *(col for col in df.columns if col != 'id')]]
# The uuid columns are now folded into "id", so remove them.
df.drop(columns=['address_uuid', 'person_uuid', "publication_uuid"], inplace=True)

In [None]:
# Number of rows per distinct "address" value, most frequent first.
(
    df.groupby('address')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Count rows whose (address, name) combination already appeared earlier.
visited = set()
c = 0  # rows that repeat an already seen combination
i = 0  # rows inspected so far
for _, record in df.iterrows():
    i += 1
    v = ", ".join((str(record['address']), str(record['name'])))
    if v not in visited:
        visited.add(v)
    else:
        c += 1

# Displayed as: repeats, total rows, repeats minus total.
c, i, c-i
    
    

In [None]:
# Overwrite the "firstname" value of the row whose index label is 1
# (``.at`` addresses a single cell by label).
df.at[1, "firstname"] = "Paweł"
# Display the row at position 1 (``.iloc`` is positional).
# NOTE(review): presumably the same row as above — this holds only while df
# still has the default 0..n-1 index; confirm.
df.iloc[1]