# Pandas tutorial

Dataset characteristic:

- every folder contain: addresses.csv, addresses_people.csv, people.csv, people_publications.csv, publications.csv.
- every file contain few columns, named in first row
- every file not with many-to-many relations, contain temp_id column

Project:

- merge all files into single DataFrame
- change current temp_ids into new unique ids
- eliminate duplicates (eg. consider address with x percentage of similarity as one)
- add column “town” for people.csv data, extracted from “addresses” column in addresses.csv
- get missing lat/lng data for every town/address (eg. through google api)
- save DataFrame to single csv

## Variables

In [2]:
similarity_percentage = 30

data_directory="./data"
data_sub_folders = 134
data_load_step = 1

## Functions & Imports

In [10]:
import re
import pandas as pd
import os

from geopy.geocoders import Nominatim
from typing import List
from wordsegment import load, segment

load()

def move_head(arr, index: int):
    arr = arr = [arr[index]] + arr[:index] + arr[index + 1:]


def data_filter(name: str, address: str):
    data = name.split(",") + address.split(",")
    filtered_data = []
    for word in data:
        cleaned_word = word.strip().lower()
        cleaned_word = re.sub(r'[^a-zA-Z\s]', ' ', cleaned_word)
        cleaned_word = re.sub(r'\s+', ' ', cleaned_word)
        if len(cleaned_word) > 2:
            filtered_data.append(cleaned_word.strip())
    return ",".join(filtered_data)


def similarity(data1: str, data2: str):
    arr1, arr2 = data1.split(","), data2.split(",")
    counter = 0
    total = max(len(arr1), len(arr2))
    for str1 in arr1:
        for str2 in arr2:
            if str1 in str2 or str2 in str1:
                counter += 1
    return counter / total * 100


def make_id_unique(df, value, column="temp_id"):
    df[column] = df[column].apply(lambda id: str(value) + "_" + str(id))
    return df


def get_lat_lng(address):
    geolocator = Nominatim(user_agent="my_geocoder")
    location = geolocator.geocode(address, exactly_one=True)
    if location:
        return location.latitude, location.longitude
    else:
        return None, None

def get_city_name(address):
    cleaned_word = re.sub(r'[^a-zA-Z\s]', ' ', address)
    segments = [word for word in segment(cleaned_word) if len(word) > 3] 
    try:
        return segments[-2]
    except:
        return segments[0]

class GoodAddress:
    string: str
    uuid: str
    children_list: list
    children_dict: dict
    def __init__(self, string, uuid) -> None:
        self.string = string
        self.uuid = uuid
        self.children_list = [uuid]
        self.children_dict = {string: [uuid, 1]}

    def update(self, string, uuid) -> None:
        if string in self.children_dict:
            self.children_dict[string][1] += 1
        else:
            self.children_dict[string] = [uuid, 1]

    def fit_uuid(self) -> None:
        max_count = 0
        for value in self.children_dict.values():
            if value[1] > max_count:
                max_count += 1
                self.uuid = value[0]

## Load data

In [4]:
data_files = {"addresses": [], "addresses_people": [], "people": [], "people_publications": [], "publications": []}

for i in range(0, data_sub_folders, data_load_step):
    data_files["addresses"].append(make_id_unique(pd.read_csv(os.path.join(data_directory, f"{i}/ADDRESSES.csv")), i))
    data_files["addresses_people"].append(make_id_unique(make_id_unique(pd.read_csv(os.path.join(data_directory, f"{i}/ADDRESSES_PEOPLE.csv")), i, "address_uuid"), i, "person_uuid"))
    data_files["people"].append(make_id_unique(pd.read_csv(os.path.join(data_directory, f"{i}/PEOPLE.csv")), i))
    data_files["people_publications"].append(make_id_unique(make_id_unique(pd.read_csv(os.path.join(data_directory, f"{i}/PEOPLE_PUBLICATIONS.csv")), i, "person_uuid"), i, "publication_uuid"))
    data_files["publications"].append(make_id_unique(pd.read_csv(os.path.join(data_directory, f"{i}/PUBLICATIONS.csv")), i))

address_df = pd.concat(data_files["addresses"], ignore_index=True)
addresses_people_df = pd.concat(data_files["addresses_people"], ignore_index=True)
people_df = pd.concat(data_files["people"], ignore_index=True)
people_publications_df = pd.concat(data_files["people_publications"], ignore_index=True)
publications_df = pd.concat(data_files["publications"], ignore_index=True)

address_df.rename(columns={"temp_id": "address_uuid"}, inplace=True)
people_df.rename(columns={"temp_id": "person_uuid"}, inplace=True)
publications_df.rename(columns={"temp_id": "publication_uuid"}, inplace=True)

address_df

Unnamed: 0,address_uuid,address,countries_scope,lat,lon,name,phone,url
0,0_2,"sogn og fjordane university collegesogndal, no...",NO,,,faculty of teacher education and sport,,
1,0_3,"university of bergenbergen, norway",NO,,,"centre for cancer biomarkers, ccbio, departmen...",,
2,0_4,"haukeland university hospitalbergen, norway",NO,,,department of pathology,,
3,0_4,"haukeland university hospitalbergen, norway",NO,,,department of pathology,,
4,0_5,"university of bergenbergen, norway",NO,,,"centre for cancer biomarkers, ccbio, departmen...",,
...,...,...,...,...,...,...,...,...
203334,133_1464,division of environmental medicinenorwegian in...,NO,,,department of air pollution and noise,,
203335,133_1465,division of environmental medicinenorwegian in...,NO,,,department of air pollution and noise,,
203336,133_1466,division of environmental medicinenorwegian in...,NO,,,department of air pollution and noise,,
203337,133_1467,division of environmental medicinenorwegian in...,NO,,,department of air pollution and noise,,


## Prepare good addresses indexes

In [5]:
%%time

address_df["full_address"] = address_df.apply(lambda x: data_filter(str(x["name"]), str(x["address"])), axis=1)

good_address_list: List[GoodAddress] = []

for address_index, address in address_df.iterrows():
    flag = True
    for good_address_index, ga in enumerate(good_address_list):
        if similarity(ga.string, address["full_address"]) > similarity_percentage:
            flag = False
            ga.children_list.append(address["address_uuid"])
            ga.update(address["full_address"], address["address_uuid"])
            move_head(good_address_list, good_address_index)
            break
    if flag:    
        good_address_list.append(GoodAddress(address["full_address"], address["address_uuid"]))

address_df = address_df.drop(["full_address"], axis=1)
for ga in good_address_list:
    ga.fit_uuid()

print(f"{len(good_address_list)} good addresses found")

499 good addresses found
CPU times: user 19.9 s, sys: 40.1 ms, total: 19.9 s
Wall time: 19.9 s


## Update ADDRESSES_PEOPLE relation 

In [6]:
replace_map = {to_replace_address_id: good_address.uuid for good_address in good_address_list for to_replace_address_id in good_address.children_list}
addresses_people_df["address_uuid"] = addresses_people_df["address_uuid"].map(replace_map).fillna(addresses_people_df["address_uuid"])
unique_address_count = addresses_people_df['address_uuid'].nunique()
print(unique_address_count <= len(good_address_list))

True


## Merge people and addresses

In [7]:
merged_1 = pd.merge(people_df, addresses_people_df, on='person_uuid', how="inner")
people_with_address_df = pd.merge(merged_1, address_df, on="address_uuid", how="inner")

## Add town name, lat amd lon

In [15]:
towns = {}
locations = {}

In [17]:
def find_details(address):
    if address not in towns:    # Address unknown
        town_name = get_city_name(address)
        towns[address] = town_name

        if town_name not in locations:  # Position unknown 
            latitude, longitude = get_lat_lng(town_name)
            locations[town_name] = (latitude, longitude)


people_with_address_df["address"].apply(lambda x: find_details(x))
people_with_address_df["town"] = people_with_address_df["address"].apply(lambda x: towns[x])
people_with_address_df["lat"] = people_with_address_df["town"].apply(lambda x: locations[x][0])
people_with_address_df["lon"] = people_with_address_df["town"].apply(lambda x: locations[x][1])
people_with_address_df.head()

Unnamed: 0,person_uuid,lastname,firstname,countries_scope_x,email,town,role,url_x,address_uuid,address,countries_scope_y,lat,lon,name,phone,url_y
0,0_1,Andersen,Lars B,NO,,oslo,,,7_112,"norwegian school of sport sciencesoslo, norway",NO,59.91333,10.73897,department of physical performance,,
1,0_2,Knutsvik,Gøril,NO,,bergen,,,11_1349,"university of bergenbergen, norway",NO,60.394306,5.325919,"centre for cancer biomarkers ccbio, department...",,
2,0_2,Knutsvik,Gøril,NO,,oslo,,,7_112,"norwegian school of sport sciencesoslo, norway",NO,59.91333,10.73897,department of physical performance,,
3,0_3,Collett,Karin,NO,,oslo,,,7_112,"norwegian school of sport sciencesoslo, norway",NO,59.91333,10.73897,department of physical performance,,
4,0_4,Arnes,Jarle,NO,,bergen,,,11_1349,"university of bergenbergen, norway",NO,60.394306,5.325919,"centre for cancer biomarkers ccbio, department...",,


## Merge people_addresses with publications

In [18]:
merged_3 = pd.merge(people_with_address_df, people_publications_df, on="person_uuid", how="inner")
one_df = pd.merge(merged_3, publications_df, on="publication_uuid", how="inner")

len(one_df)

207093

## Unify ID

In [20]:
one_df['id'] = one_df['address_uuid'].astype(str) + "_" + one_df['person_uuid'].astype(str) + "_" + one_df['publication_uuid'].astype(str)
one_df = one_df[['id'] + [col for col in one_df.columns if col != 'id']]
one_df.drop(columns=['address_uuid', 'person_uuid', "publication_uuid"], inplace=True)

one_df.head()

KeyError: 'address_uuid'