# Data exploration

## Load dataset

In [1]:
import os
import pandas as pd

# The dataset path is resolved relative to the current working directory (one level up).
dataset_path = os.path.join("..", "edda_coordinata.json")
df = pd.read_json(dataset_path)
df

Unnamed: 0,id-enccre,headword,text,coordinates,meridian
0,v1-10-0,AA,"*​ AA, s. f. riviere de France, qui prend sa s...",,
1,v1-101-0,ABÉE,"ABÉE, s. f. Ville du détroit Messenien que Xer...",,
2,v1-1013-0,AIX,"*​ AIX, (Géog.)​ ville de France en Provence, ...","[[43 31' 35"" N 23 6' 34"" E]]",
3,v1-1013-1,Aix,"*​ Aix, (Géog.)​ ville de Savoye sur le lac de...",[[45 40' N 23 34' E]],
4,v1-1013-2,Aix,"*​ Aix, (Géog.)​ petite ville de France dans l...",,
...,...,...,...,...,...
15364,v9-995-0,LATITUDE,"LATITUDE, s. f. (Géogr.)​ la latitude marque l...",,
15365,v9-997-0,LATIUM le,"LATIUM le, (Géog. anc.)​ c’est-à-dire le pays ...",,
15366,v9-998-0,LATMICUS SINUS,"LATMICUS SINUS, (Géog. anc.)​ golfe de la mer ...",,
15367,v9-999-0,LATMOS,"LATMOS, (Géog. anc.)​ ancienne ville de l’Ioni...",,


## Distribution of types of coordinates

In [None]:
# Values of cell[0][0] that get a dedicated partition of their own.
_TAGGED_KINDS = ("alt", "arc", "misc")


def _lone_group_size(cell):
    """Return the inner list's length when `cell` is a one-element list holding a list, else -1."""
    if isinstance(cell, list) and len(cell) == 1 and isinstance(cell[0], list):
        return len(cell[0])
    return -1


def _has_several_groups(cell):
    """Return True when `cell` is a list holding more than one element."""
    return isinstance(cell, list) and len(cell) > 1


def _rows_where(predicate):
    """Return the rows of `df` whose "coordinates" cell satisfies `predicate`."""
    return df[df["coordinates"].apply(predicate)]


# A single inner list: exactly one entry -> point, several entries -> surface.
df_points = _rows_where(lambda cell: _lone_group_size(cell) == 1)
df_surface = _rows_where(lambda cell: _lone_group_size(cell) > 1)

# Several elements: the partition is decided by the value stored at cell[0][0].
df_sec = _rows_where(
    lambda cell: _has_several_groups(cell) and cell[0][0] not in _TAGGED_KINDS
)
df_alt = _rows_where(
    lambda cell: _has_several_groups(cell) and cell[0][0] == 'alt'
)
df_arc = _rows_where(
    lambda cell: _has_several_groups(cell) and cell[0][0] == 'arc'
)
df_misc = _rows_where(
    lambda cell: _has_several_groups(cell) and cell[0][0] == 'misc'
)

def lat_and_long(coord):
    """Tell which components a coordinate string carries.

    Expected format: ``43 31' 35" N 23 6' 34" E``.

    Returns 'lat' when an 'N' or 'S' is present, 'long' when an 'E' or 'W'
    is present, 'latlong' when both are, and '' when neither is.
    """
    has_latitude = any(letter in coord for letter in "NS")
    has_longitude = any(letter in coord for letter in "EW")
    return ("lat" if has_latitude else "") + ("long" if has_longitude else "")

# Work on an independent copy so adding a column does not touch a slice of `df`.
df_points = df_points.copy()
df_points["latlong"] = df_points["coordinates"].apply(
    lambda cell: lat_and_long(cell[0][0])
)

# Count the points per kind of component found in the coordinate string.
latlong_kind = df_points["latlong"]
n_complete = int((latlong_kind == 'latlong').sum())
n_incomplete = int((latlong_kind != 'latlong').sum())
n_lat_only = int((latlong_kind == 'lat').sum())
n_long_only = int((latlong_kind == 'long').sum())

print("Number of points: ", n_complete)
print(f"Number of incomplete points: {n_incomplete}, latitude only: {n_lat_only}, longitude only: {n_long_only}")

# The remaining partitions are reported by plain row count.
for label, frame in (
    ("Number of surfaces: ", df_surface),
    ("Number of multiple places: ", df_sec),
    ("Number of multiple sources: ", df_alt),
    ("Number of arcs: ", df_arc),
    ("Number of misc: ", df_misc),
):
    print(label, len(frame))

Number of points:  4289
Number of incomplete points: 234, latitude only: 223, longitude only: 11
Number of surfaces:  217
Number of places:  48
Number of multiple sources:  89
Number of arcs:  11
Number of misc :  1


### Distribution of latitude / longitude precision

In [3]:
def precision(coord):
    """Classify how precisely the latitude and longitude of a coordinate are given.

    `coord` is a string such as ``43 31' 35" N 23 6' 34" E``: a latitude,
    its hemisphere letter ('N' or 'S'), a longitude and its hemisphere letter.

    Each half is labelled by its number of whitespace-separated components:
    "D" (degrees), "DM" (degrees, minutes) or "DMS" (degrees, minutes,
    seconds). Components are split on any run of whitespace, so doubled or
    non-standard spaces do not inflate the count.

    Returns a string of the form ``Lat_<label>-Long_<label>``. A half whose
    component count is not recognised gets an empty label.

    Raises IndexError when `coord` contains neither 'N' nor 'S'.
    """
    sep = 'N' if 'N' in coord else 'S'
    halves = coord.split(sep)

    # split() without an argument collapses repeated whitespace and never
    # yields empty tokens, unlike split(" ").
    lat_tokens = halves[0].split()
    long_tokens = halves[1].split()

    lat_label = {1: "D", 2: "DM", 3: "DMS"}.get(len(lat_tokens), "")
    # The longitude half still ends with its E/W letter, hence one extra token.
    long_label = {2: "D", 3: "DM", 4: "DMS"}.get(len(long_tokens), "")

    return f"Lat_{lat_label}-Long_{long_label}"

# Keep only the points that carry both a latitude and a longitude.
has_both_components = df_points["latlong"] == "latlong"
df_points_latlong = df_points[has_both_components].copy()

# Label each remaining point with the precision of its two components.
df_points_latlong["precision"] = df_points_latlong["coordinates"].apply(
    lambda cell: precision(cell[0][0])
)

print(df_points_latlong["precision"].value_counts())

precision
Lat_DM-Long_DM      3357
Lat_DM-Long_D        278
Lat_DMS-Long_DMS     222
Lat_D-Long_DM        182
Lat_D-Long_D         116
Lat_DM-Long_DMS       91
Lat_DMS-Long_DM       38
Lat_DMS-Long_D         3
Lat_D-Long_DMS         2
Name: count, dtype: int64
