In [138]:
from datetime import date
import geoip2.database
import os
import pandas as pd
import us.states

Load the most recent GeoLite2 City (IP-to-city) database downloaded from [MaxMind](https://dev.maxmind.com/geoip/geoip2/geolite2/).

In [33]:
# os.listdir() returns entries in arbitrary order, so the candidates are sorted
# before taking the last one. Only directories are kept so that a downloaded
# archive sharing the "GeoLite2-City" prefix cannot be picked by mistake.
# NOTE(review): assumes the extracted folders carry a sortable date suffix
# (e.g. "GeoLite2-City_YYYYMMDD") so that the last name is the newest - confirm.
geolite_dirs = sorted(
    d for d in os.listdir()
    if d.startswith("GeoLite2-City") and os.path.isdir(d)
)
georeader = geoip2.database.Reader(
    os.path.join(geolite_dirs[-1], 'GeoLite2-City.mmdb')
)

In [197]:
def geo_count(df, level=None):
    """
    Count the distinct locations in a frame at the requested level.

    Parameter
    ---------
    df: DataFrame
        Location frame accepted by ``geo_frame``.

    level: string
        'city' or 'country'

    Returns
    -------
    count: int
        Number of entries ``geo_frame(df, level)`` yields; 0 when ``level``
        is missing or not one of the supported values.
    """
    locations = geo_frame(df, level)
    try:
        return len(locations)
    except TypeError:
        # geo_frame reports an unsupported level by returning the int 0,
        # which has no len(); treat that as "nothing to count".
        return 0

    
def geo_frame(df, level=None):
    """
    Reduce a location frame to its distinct countries or cities.

    Parameter
    ---------
    df: DataFrame
        Needs a "Country" column; level="city" also reads "City" and
        "State or Province". Missing values are expected to be "".

    level: string
        'city' or 'country'

    Returns
    -------
    df: Series, DataFrame or int
        level="country": sorted Series of the distinct non-empty countries.
        level="city": rows with a non-empty city, one per (City, Country),
        sorted by country, city and state/province. A row with an empty
        state/province is dropped when another row shares its
        (City, Country) pair.
        Any other level: prints a usage hint and returns 0.
    """
    if level not in ("city", "country"):
        print("geo_count: Specify\n\tlevel=\"city\" | \"country\"\nto get a count.")
        return 0
    if level == "country":
        countries = df["Country"]
        # `keep` must be passed by keyword: it is keyword-only in pandas >= 2.
        return (
            countries.loc[countries != ""]
            .drop_duplicates(keep="first")
            .sort_values()
        )
    # level == "city"
    # Flag exactly the rows that repeat a (City, Country) pair without naming a
    # state/province. Testing City and Country membership independently would
    # also discard unrelated rows whose city and country each merely appear
    # somewhere among the flagged rows.
    stateless_duplicate = (
        df.duplicated(["City", "Country"], keep=False)
        & (df["State or Province"] == "")
    )
    return (
        df.loc[(df["City"] != "") & ~stateless_duplicate]
        .drop_duplicates(["City", "Country"])
        .sort_values(["Country", "City", "State or Province"])
    )

def geo_from_ip(ip):
    """
    Look up the English city, subdivision and country names for an IP.

    The database is queried once per call. Missing pieces fall back to the
    next coarser level: no subdivision -> country, no city -> subdivision.
    If the lookup itself fails, all three values are "".

    Parameter
    ---------
    ip : string
        IP address

    Returns
    -------
    city : string

    subdivision : string

    country : string
    """
    # Errors a missing name/subdivision can produce on the response object.
    missing = (AttributeError, IndexError, KeyError, TypeError)
    try:
        response = georeader.city(ip)
    except Exception:
        # Best effort: unknown or malformed addresses yield empty strings
        # rather than aborting a whole DataFrame.apply.
        return ("", "", "")
    try:
        country = response.country.names['en']
    except missing:
        country = ""
    try:
        subdivision = response.subdivisions[0].names['en']
    except missing:
        subdivision = country
    try:
        city = response.city.names['en']
    except missing:
        city = subdivision
    return (city, subdivision, country)


def locsplit(loc):
    """
    Split a ", "-separated location string into its parts.

    Parameter
    ---------
    loc : string
       "City [, State|, Province|None], Country"

    Returns
    -------
    city : string

    subdivision : string
        "" for a two-part location. For longer locations, the full US state
        name when ``us.states.lookup`` recognises the second part.

    country : string

    When the state lookup does not succeed (unrecognised subdivision, or a
    location that does not have three parts) the raw split is returned
    unchanged, so the tuple may not have exactly three items.
    """
    parts = tuple(loc.split(", "))
    if len(parts) == 2:
        city, country = parts
        return (city, "", country)
    try:
        return (parts[0], us.states.lookup(parts[1]).name, parts[2])
    except Exception:
        # Deliberate best effort: fall back to the raw parts. Catching
        # Exception instead of a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return parts
        
        
def print_geo_frame(df):
    """
    Render a location frame as text, one location per line.

    Parameter
    ---------
    df: DataFrame
        Needs "City", "State or Province" and "Country" columns.

    Returns
    -------
    geo_string : string
        Every line is preceded by a newline, so a non-empty result starts
        with "\\n". A row whose three fields are equal prints the country
        only; a row without state/province prints "City, Country"; any other
        row prints all of its values joined by ", ".
    """
    lines = []
    for _, record in df.iterrows():
        city = record["City"]
        region = record["State or Province"]
        country = record["Country"]
        if city == country and city == region:
            lines.append(country)
        elif region == "":
            lines.append(", ".join([city, country]))
        else:
            lines.append(", ".join(record))
    return "".join("\n" + line for line in lines)

# Acquire reports
- [FCP](https://www.nitrc.org/project/stats/download_report.php?group_id=296)
- [INDI](https://www.nitrc.org/project/stats/download_report.php?group_id=404)
- HBN: LORIS & COINS
- Mindboggle

In [198]:
# Reports live in a folder named "<year>-<month>" (month not zero-padded) for
# the month before today. In January that is December of the previous year;
# a plain `month - 1` would produce the non-existent month 0.
_today = date.today()
if _today.month == 1:
    _report_year, _report_month = _today.year - 1, 12
else:
    _report_year, _report_month = _today.year, _today.month - 1
data_dir = os.path.abspath("-".join([str(_report_year), str(_report_month)]))

In [199]:
# Both download reports are read with the same subset of columns.
report_columns = ["Date", "Package", "Release", "File", "IP Address"]

fcp_path = os.path.join(data_dir, "FCP.csv")
FCP = pd.read_csv(fcp_path, usecols=report_columns, low_memory=False)

indi_path = os.path.join(data_dir, "INDI.csv")
INDI = pd.read_csv(indi_path, usecols=report_columns, low_memory=False)

# Stack the two reports; the original row labels of each file are kept.
FCP_INDI = pd.concat([FCP, INDI])

# The HBN export is read whole.
hbn_path = os.path.join(data_dir, "HBN.csv")
HBN = pd.read_csv(hbn_path)

In [200]:
# One row per distinct (package, IP address) pair, re-indexed from 0.
package_ip_pairs = FCP_INDI.loc[:, ["Package", "IP Address"]]
FCP_INDI_locations_by_package = (
    package_ip_pairs
    .drop_duplicates()
    .copy()
    .reset_index(drop=True)
)

In [201]:
# Geolocate every IP, then expand the (city, subdivision, country) tuples
# into integer-labelled columns 0, 1 and 2.
ip_lookups = FCP_INDI_locations_by_package["IP Address"].apply(geo_from_ip)
FCP_locations = ip_lookups.apply(pd.Series)

In [202]:
# Attach the lookup results (columns 0, 1, 2) under readable names.
for column_position, column_name in enumerate(
    ["City", "State or Province", "Country"]
):
    FCP_INDI_locations_by_package[column_name] = FCP_locations[column_position]

In [203]:
# Drop the IP column, then keep one row per distinct package/location.
package_locations = FCP_INDI_locations_by_package.loc[
    :, ["Package", "City", "State or Province", "Country"]
]
FCP_INDI_locations_by_package = package_locations.drop_duplicates().copy()

In [204]:
# Distinct locations regardless of package, ordered by country, then city.
FCP_INDI_locations = (
    FCP_INDI_locations_by_package
    .loc[:, ["City", "State or Province", "Country"]]
    .drop_duplicates()
    .copy()
    .sort_values(by=["Country", "City", "State or Province"])
)

In [205]:
# Split each "Location" string into parts (columns 0, 1, 2), copy them to
# named columns, then discard the integer-labelled originals.
hbn_parts = HBN["Location"].apply(locsplit).apply(pd.Series)
hbn_parts[["City", "State or Province", "Country"]] = hbn_parts[[0, 1, 2]]
HBN_locations = hbn_parts.drop(columns=[0, 1, 2]).copy()

In [206]:
# Keep one row per distinct location, ordered by country, then city.
hbn_unique = HBN_locations.drop_duplicates(
    subset=["City", "State or Province", "Country"]
).copy()
HBN_locations = hbn_unique.sort_values(
    by=["Country", "City", "State or Province"]
)

In [209]:
# Number of distinct (City, Country) rows among the FCP/INDI locations.
geo_count(FCP_INDI_locations, "city")

2721

In [207]:
# Number of distinct non-empty countries among the FCP/INDI locations.
geo_count(FCP_INDI_locations, "country")

98

In [210]:
# Number of distinct (City, Country) rows among the HBN locations.
geo_count(HBN_locations, "city")

16

In [208]:
# Number of distinct non-empty countries among the HBN locations.
geo_count(HBN_locations, "country")

5

In [211]:
# Combine the city-level frames of both sources, order them, and drop rows
# that appear in both. Row labels from the source frames are kept as-is.
city_frames = [
    geo_frame(source, "city")
    for source in (FCP_INDI_locations, HBN_locations)
]
all_locations = (
    pd.concat(city_frames)
    .sort_values(by=["Country", "City", "State or Province"])
    .drop_duplicates(subset=["Country", "City", "State or Province"])
)

In [213]:
# Number of distinct (City, Country) rows across the combined locations.
geo_count(all_locations, "city")

2721

In [214]:
# Number of distinct non-empty countries across the combined locations.
geo_count(all_locations, "country")

98

In [217]:
# List the HBN locations one per line (the rendered string starts with a newline).
print(print_geo_frame(HBN_locations))


London, Ontario, Canada
North York, Ontario, Canada
Beijing, China
Guangzhou, China
Lanzhou, China
Shanghai, China
Taiyuan, China
Paris, France
Munich, Germany
Atlanta, Georgia, United States
Baltimore, Maryland, United States
Bethesda, Maryland, United States
Louisville, Kentucky, United States
New Haven, Connecticut, United States
New York, New York, United States
Pittsburgh, Pennsylvania, United States


In [215]:
# List every combined location one per line (the rendered string starts with a newline).
print(print_geo_frame(all_locations))


Algeria
Annaba, Annaba, Algeria
Tindouf, Tindouf, Algeria
Abasto, Buenos Aires, Argentina
Aguas Blancas, Salta, Argentina
Argentina
Bahía Blanca, Buenos Aires, Argentina
Buenos Aires, Buenos Aires F.D., Argentina
Buenos Aires F.D., Buenos Aires F.D., Argentina
Caseros, Buenos Aires, Argentina
Ciudadela, Buenos Aires, Argentina
Córdoba, Cordoba, Argentina
General Belgrano, Buenos Aires, Argentina
Gualeguaychú, Entre Rios, Argentina
Hurlingham, Buenos Aires, Argentina
La Plata, Buenos Aires, Argentina
Lobos, Buenos Aires, Argentina
Mar del Plata, Buenos Aires, Argentina
Martinez, Buenos Aires, Argentina
Mendoza, Mendoza, Argentina
Merlo, Buenos Aires, Argentina
Moron, Buenos Aires, Argentina
Olivos, Buenos Aires, Argentina
Rosario, Santa Fe, Argentina
Saavedra, Buenos Aires, Argentina
Salta, Salta, Argentina
San Fernando, Buenos Aires, Argentina
Santa Fe, Santa Fe, Argentina
Venado Tuerto, Santa Fe, Argentina
Villa Angelica, Entre Rios, Argentina
Villa Luzuriaga, Buenos Aires, Argentina