# Dunham's Data

## Setting up

In [1]:
from geopy.distance import geodesic
# import networkx as nx
import pandas as pd
# import seaborn as sns

In [2]:
# pd.set_option('display.max_rows', 999)

In [3]:
# sns.set()

In [4]:
# %matplotlib inline

## Loading data

In [5]:
# Read the source table and replace missing cells with '' in the same
# expression, so later truthiness checks on text columns (e.g. `row.CITY2`)
# treat "no value" as falsy instead of NaN.
dunham_df = pd.read_csv('data/out/dunham_1947-60.csv').fillna('')

In [6]:
# Sanity check on the load: (number of rows, number of columns).
dunham_df.shape

(10, 13)

In [7]:
# Peek at the first rows to confirm the columns were parsed as expected.
dunham_df.head()

Unnamed: 0,DATE,CITY1,COUNTRY1,CITY2,COUNTRY2,WORKING,HOTEL_ADDRESS,VENUE1,VENUE_TYPE1,VENUE2,VENUE_TYPE2,NOTES,SOURCE
0,1947-03-14,New York City,USA,,,y,,Hotel Delmonico,n,Roxy,n,"Diary--""All Star Show for the Wounded"" between...",Display Ad New York Times (1923-Current file);...
1,1947-04-07,New York City,USA,,,y,,,,,,Diary--rehearsal,SIU Box 142 Folder 7
2,1947-06-28,Mexico City,Mexico,,,y,"Arenal #7, San Angel, Mexico, D.F.",,,,,,
3,1947-07-01,Mexico City,Mexico,,,y,"Arenal #7, San Angel, Mexico, D.F.",,,,,Rehearsing every day at Dunham's rented house;...,SIU Box 10 Folder 4
4,1947-07-06,Mexico City,Mexico,,,y,"Arenal #7, San Angel, Mexico, D.F.",,,,,,


In [8]:
# Last row of the table, to see where the loaded data ends.
dunham_df.tail(1)

Unnamed: 0,DATE,CITY1,COUNTRY1,CITY2,COUNTRY2,WORKING,HOTEL_ADDRESS,VENUE1,VENUE_TYPE1,VENUE2,VENUE_TYPE2,NOTES,SOURCE
9,1948-01-04,San Francisco,USA,,,y,,Geary Theater,c,,,,Bal Negre program


## Pre-processing data

In [9]:
# City name -> (latitude, longitude).
# Keys have to match the city strings in the data exactly (note the
# 'Stockton, CA' spelling): the itinerary step looks each stay's city up here
# by name and fails with KeyError when a city is missing.
geolocations = {
    name: (lat, lon)
    for name, lat, lon in (
        ('Mexico City', 19.433333, -99.133333),
        ('New York City', 40.7127, -74.0059),
        ('San Diego', 32.715, -117.1625),
        ('San Francisco', 37.783333, -122.416667),
        ('Stockton, CA', 37.975556, -121.300833),
    )
}

In [10]:
# Effective location of each row: when a second city is recorded, CITY and
# COUNTRY are taken from the *2 columns, otherwise from the *1 columns.
# The test is the truthiness of CITY2 (missing cells are '' after loading).
# COUNTRY is deliberately selected by CITY2 as well, not by COUNTRY2, so city
# and country always come from the same pair of columns.
# Computed once as a boolean mask instead of two row-wise apply() passes.
has_second_city = dunham_df['CITY2'].map(bool).astype(bool)
dunham_df['CITY'] = dunham_df['CITY2'].where(has_second_city, dunham_df['CITY1'])
dunham_df['COUNTRY'] = dunham_df['COUNTRY2'].where(has_second_city, dunham_df['COUNTRY1'])

## Processing data

In [11]:
def is_valid_city(city):
    """Tell whether `city` names an actual place.

    Returns False for any falsy value (e.g. '' or None) and for the literal
    'in transit'; True for every other value.
    """
    if not city:
        return False
    return city != 'in transit'

cols = [
    'START_DATE', 'END_DATE',
    'CITY', 'COUNTRY',
    'N_ROWS', 'LAST_MOMENT', 'MIN_NIGHTS', 'MAX_NIGHTS',
    'LATITUDE', 'LONGITUDE',
]


def _stay_record(start_date, end_date, city, country, n_rows, closing_row, coordinates):
    """Return the values of one finished stay, in the order given by `cols`.

    `closing_row` is the table row used to decide how the stay ended: when its
    CITY2 is set, the stay is labelled 'The next morning' and MIN_NIGHTS equals
    `n_rows`; otherwise it is labelled 'Probably that night' and MIN_NIGHTS is
    `n_rows - 1`. MAX_NIGHTS is always `n_rows`.
    (Presumably a CITY2 on the following row means the move happened during
    that following day - confirm with the data owner.)

    Raises KeyError when `city` has no entry in `coordinates`.
    """
    left_next_morning = bool(closing_row.CITY2)
    return [
        start_date,
        end_date,
        city,
        country,
        n_rows,
        'The next morning' if left_next_morning else 'Probably that night',
        n_rows if left_next_morning else n_rows - 1,
        n_rows,
        coordinates[city][0],
        coordinates[city][1],
    ]


def build_itinerary(daily_df, coordinates):
    """Collapse consecutive rows in the same city into one row per stay.

    Rows are walked in table order. A stay is a maximal run of adjacent rows
    with the same valid CITY (see `is_valid_city`). Rows whose city is not
    valid end the current stay and produce no itinerary row themselves.

    Parameters
    ----------
    daily_df : DataFrame providing DATE, CITY, COUNTRY and CITY2 columns.
    coordinates : mapping of city name -> (latitude, longitude).

    Returns
    -------
    DataFrame with the columns listed in `cols`, one row per stay. Empty (but
    with those columns) when `daily_df` has no rows or no valid city.
    """
    records = []
    # No stay is open before the first row. start_city=None is never a valid
    # city, so the first row always takes the "open a new stay" path and
    # nothing is flushed for the placeholder.
    start_date = end_date = start_city = start_country = None
    n_rows = 0
    row = None
    for _, row in daily_df.iterrows():
        if is_valid_city(row.CITY) and row.CITY == start_city:
            # Same city as the open stay: extend it.
            n_rows += 1
            end_date = row.DATE
            continue
        # City changed (or is not a real city): close the open stay, using the
        # current row - the first one after the stay - to decide how it ended.
        if is_valid_city(start_city):
            records.append(_stay_record(
                start_date, end_date, start_city, start_country, n_rows,
                row, coordinates,
            ))
        start_date = end_date = row.DATE
        start_city, start_country = row.CITY, row.COUNTRY
        n_rows = 1
    # Close whatever stay is still open at the end of the table.
    # NOTE(review): there is no following row here, so the last table row
    # itself is used for the CITY2 test (same as before) - confirm that this
    # is the intended rule for the final stay.
    if is_valid_city(start_city):
        records.append(_stay_record(
            start_date, end_date, start_city, start_country, n_rows,
            row, coordinates,
        ))
    # Build the frame once from the collected records rather than growing it
    # one row at a time.
    return pd.DataFrame(records, columns=cols)


itinerary_df = build_itinerary(dunham_df, geolocations)

In [12]:
# (number of stays, number of itinerary columns).
itinerary_df.shape

(5, 10)

In [13]:
# Display the assembled itinerary for a visual check.
itinerary_df

Unnamed: 0,START_DATE,END_DATE,CITY,COUNTRY,N_ROWS,LAST_MOMENT,MIN_NIGHTS,MAX_NIGHTS,LATITUDE,LONGITUDE
0,1947-03-14,1947-04-07,New York City,USA,2,Probably that night,1,2,40.7127,-74.0059
1,1947-06-28,1947-09-01,Mexico City,Mexico,5,Probably that night,4,5,19.433333,-99.133333
2,1947-12-29,1947-12-29,San Diego,USA,1,Probably that night,0,1,32.715,-117.1625
3,1948-01-02,1948-01-02,"Stockton, CA",USA,1,Probably that night,0,1,37.975556,-121.300833
4,1948-01-04,1948-01-04,San Francisco,USA,1,Probably that night,0,1,37.783333,-122.416667


## Saving data

In [14]:
# Write the itinerary to disk; index=False keeps the positional row index
# out of the file.
itinerary_df.to_csv('data/out/itinerary_1947-60.csv', index=False)