In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Load the cleaned sakura bloom dates and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/sakura-dates-fixed-cleaned.csv")
df.head(n=5)

Unnamed: 0,city,city_eng,latitude,longitude,start_date,full_date
0,稚内,Wakkanai,45.4156,141.6734,2000-05-11,2000-05-15
1,旭川,Asahikawa,43.7709,142.365,2000-05-03,2000-05-04
2,網走,Abashiri,44.0206,144.2735,2000-05-07,2000-05-11
3,札幌,Sapporo,43.0618,141.3545,2000-04-29,2000-05-04
4,帯広,Obihiro,42.9233,143.1972,2000-05-01,2000-05-03


# Build our template dataframe, 1 row for every day

In [3]:
# Template frame: no data columns, just an index holding one row per calendar day.
# It is trimmed to the observed date range in a later cell.
# NOTE: the offsets run 1..199 from 2000-01-01, so the first row is 2000-01-02
# (not 2000-01-01) and the last row is 2000-07-18.
TEMPLATE_FIRST_DAY = datetime(2000, 1, 1) + timedelta(days=1)
TEMPLATE_N_DAYS = 199

days = (
    pd.DataFrame({
        # Vectorized daily range instead of a row-wise apply over Python datetimes.
        'day_of_year': pd.date_range(start=TEMPLATE_FIRST_DAY, periods=TEMPLATE_N_DAYS, freq='D'),
    })
    .set_index('day_of_year')
)
days.tail()

2000-07-14
2000-07-15
2000-07-16
2000-07-17
2000-07-18


In [4]:
# Earliest bloom start and latest full-bloom date in the data.
# Both columns are still strings here, so min/max compare them as text.
first_day, last_day = df['start_date'].min(), df['full_date'].max()
print(f"Between {first_day} and {last_day}")

Between 2000-01-15 and 2000-05-19


In [5]:
# Keep only the template days that fall inside the observed bloom window (inclusive).
on_or_after_first = days.index >= first_day
on_or_before_last = days.index <= last_day
days = days[on_or_after_first & on_or_before_last]
days.index

DatetimeIndex(['2000-01-15', '2000-01-16', '2000-01-17', '2000-01-18',
               '2000-01-19', '2000-01-20', '2000-01-21', '2000-01-22',
               '2000-01-23', '2000-01-24',
               ...
               '2000-05-10', '2000-05-11', '2000-05-12', '2000-05-13',
               '2000-05-14', '2000-05-15', '2000-05-16', '2000-05-17',
               '2000-05-18', '2000-05-19'],
              dtype='datetime64[ns]', name='day_of_year', length=126, freq=None)

# Crosstabs

In [6]:
# Check how pandas typed each column: start_date and full_date come back as
# object (strings), not datetimes, so they need parsing before they can be
# matched against a DatetimeIndex.
# NOTE(review): longitude is also object rather than float64 — worth checking
# the CSV for a malformed value.
df.dtypes

city           object
city_eng       object
latitude      float64
longitude      object
start_date     object
full_date      object
dtype: object

In [7]:
# Parse the date strings into real datetimes so they can line up with the
# DatetimeIndex of `days` built above.
# Bracket assignment is required here: `df.new_name = ...` only sets a Python
# attribute on the DataFrame object (pandas warns about it) and does not create
# a column.
# These are real columns of `df` now, so any frame derived from `df` carries
# them along unless it drops or deselects them.
df['start_median'] = pd.to_datetime(df['start_date'])
df['full_median'] = pd.to_datetime(df['full_date'])

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Exploratory view: occurrences of each parsed start date per city,
# with zero counts blanked out to NaN.
pd.crosstab(index=df.start_median, columns=df.city_eng).replace(to_replace={0: np.nan})

city_eng,Abashiri,Akita,Aomori,Asahikawa,Choshi,Fukui,Fukuoka,Fukushima,Gifu,Hakodate,...,Tokyo,Tottori,Toyama,Tsu,Utsunomiya,Wakayama,Wakkanai,Yamagata,Yokohama,Ōita
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-15,,,,,,,,,,,...,,,,,,,,,,
2000-01-17,,,,,,,,,,,...,,,,,,,,,,
2000-01-19,,,,,,,,,,,...,,,,,,,,,,
2000-01-21,,,,,,,,,,,...,,,,,,,,,,
2000-01-22,,,,,,,,,,,...,,,,,,,,,,
2000-03-19,,,,,,,,,,,...,,,,,,,,,,
2000-03-21,,,,,,,1.0,,,,...,1.0,,,,,,,,,
2000-03-22,,,,,,,,,,,...,,,,,,,,,1.0,
2000-03-23,,,,,,,,,1.0,,...,,,,,,1.0,,,,1.0
2000-03-24,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# One is 1 and one is 10 to tell them apart and for easy scaling later.
# The dates are parsed before cross-tabulating so both tables are indexed by
# real timestamps, the same kind of index as the `days` template they are
# combined with. pd.to_datetime keeps the Series name, so the row-index names
# (start_date / full_date) are unchanged.
start_dates = pd.to_datetime(df['start_date'])
full_dates = pd.to_datetime(df['full_date'])

# Zero counts become NaN so that only actual events carry a value.
start = pd.crosstab(start_dates, df['city_eng']).replace({0: np.nan})
finish = pd.crosstab(full_dates, df['city_eng']).replace({0: np.nan, 1: 10})

In [10]:
# Preview the first ten start-date rows.
start.head(n=10)

city_eng,Abashiri,Akita,Aomori,Asahikawa,Choshi,Fukui,Fukuoka,Fukushima,Gifu,Hakodate,...,Tokyo,Tottori,Toyama,Tsu,Utsunomiya,Wakayama,Wakkanai,Yamagata,Yokohama,Ōita
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-15,,,,,,,,,,,...,,,,,,,,,,
2000-01-17,,,,,,,,,,,...,,,,,,,,,,
2000-01-19,,,,,,,,,,,...,,,,,,,,,,
2000-01-21,,,,,,,,,,,...,,,,,,,,,,
2000-01-22,,,,,,,,,,,...,,,,,,,,,,
2000-03-19,,,,,,,,,,,...,,,,,,,,,,
2000-03-21,,,,,,,1.0,,,,...,1.0,,,,,,,,,
2000-03-22,,,,,,,,,,,...,,,,,,,,,1.0,
2000-03-23,,,,,,,,,1.0,,...,,,,,,1.0,,,,1.0
2000-03-24,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Preview the first ten full-bloom rows.
finish.head(n=10)

city_eng,Abashiri,Akita,Aomori,Asahikawa,Choshi,Fukui,Fukuoka,Fukushima,Gifu,Hakodate,...,Tokyo,Tottori,Toyama,Tsu,Utsunomiya,Wakayama,Wakkanai,Yamagata,Yokohama,Ōita
full_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-02-02,,,,,,,,,,,...,,,,,,,,,,
2000-02-05,,,,,,,,,,,...,,,,,,,,,,
2000-02-08,,,,,,,,,,,...,,,,,,,,,,
2000-02-09,,,,,,,,,,,...,,,,,,,,,,
2000-03-28,,,,,,,,,,,...,,,,,,,,,,
2000-03-29,,,,,,,,,,,...,10.0,,,,,,,,,
2000-03-30,,,,,,,10.0,,,,...,,,,,,,,,,
2000-03-31,,,,,,,,,,,...,,,,,,,,,10.0,
2000-04-01,,,,,,,,,10.0,,...,,,,,,10.0,,,,
2000-04-02,,,,,,,,,,,...,,,,10.0,,,,,,


# Combine

In [12]:
# Overlay the start markers and then the full-bloom markers onto the
# one-row-per-day template, then fill the days between the two markers for each
# city by linear interpolation. limit_area='inside' only fills NaNs that are
# surrounded by valid values, so days outside the start..full span stay NaN.
combined = (
    days
    .combine_first(start)
    .combine_first(finish)
    .interpolate(limit_area='inside')
    # Name the index explicitly instead of relying on reset_index() falling back
    # to the auto-generated 'index' label; that fallback only happens when the
    # combined index has no name, and the later melt needs a 'date' column.
    .rename_axis('date')
    .reset_index()
)
combined.head(10)

city_eng,date,Abashiri,Akita,Aomori,Asahikawa,Choshi,Fukui,Fukuoka,Fukushima,Gifu,...,Tokyo,Tottori,Toyama,Tsu,Utsunomiya,Wakayama,Wakkanai,Yamagata,Yokohama,Ōita
0,2000-01-15,,,,,,,,,,...,,,,,,,,,,
1,2000-01-16,,,,,,,,,,...,,,,,,,,,,
2,2000-01-17,,,,,,,,,,...,,,,,,,,,,
3,2000-01-18,,,,,,,,,,...,,,,,,,,,,
4,2000-01-19,,,,,,,,,,...,,,,,,,,,,
5,2000-01-20,,,,,,,,,,...,,,,,,,,,,
6,2000-01-21,,,,,,,,,,...,,,,,,,,,,
7,2000-01-22,,,,,,,,,,...,,,,,,,,,,
8,2000-01-23,,,,,,,,,,...,,,,,,,,,,
9,2000-01-24,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Reshape wide -> long (one row per date/city pair), discard rows containing
# NaN, and order the result by date.
melted = (
    combined
    .melt(id_vars=['date'])
    .dropna()
    .sort_values(by='date')
)
melted

Unnamed: 0,date,city_eng,value
4284,2000-01-15,Naha,1.000000
4285,2000-01-16,Naha,1.428571
1514,2000-01-17,Ishigaki,1.000000
4286,2000-01-17,Naha,1.857143
4287,2000-01-18,Naha,2.285714
1515,2000-01-18,Ishigaki,1.391304
4288,2000-01-19,Naha,2.714286
1516,2000-01-19,Ishigaki,1.782609
3406,2000-01-19,Miyako-jima,1.000000
3407,2000-01-20,Miyako-jima,1.450000


# Attach each city's latitude and longitude to the melted rows

In [14]:
# Per-city lookup table (names + coordinates) to join onto the melted rows.
# The wanted columns are selected explicitly rather than dropping the unwanted
# ones, so any helper column added to `df` cannot leak into the merged export.
subset = df[['city', 'city_eng', 'latitude', 'longitude']]
subset.head()

Unnamed: 0,city,city_eng,latitude,longitude
0,稚内,Wakkanai,45.4156,141.6734
1,旭川,Asahikawa,43.7709,142.365
2,網走,Abashiri,44.0206,144.2735
3,札幌,Sapporo,43.0618,141.3545
4,帯広,Obihiro,42.9233,143.1972


In [15]:
# Attach each city's Japanese name and coordinates to its daily rows.
merged = pd.merge(left=melted, right=subset, on='city_eng')

In [18]:
# Raise the row display cap so the whole table is shown, ordered by date.
pd.options.display.max_rows = 10000
merged.sort_values(by='date')

Unnamed: 0,date,city_eng,value,city,latitude,longitude
0,2000-01-15,Naha,1.0,那覇,26.2126,127.679
1,2000-01-16,Naha,1.428571,那覇,26.2126,127.679
2,2000-01-17,Naha,1.857143,那覇,26.2126,127.679
22,2000-01-17,Ishigaki,1.0,石垣島,24.4064,124.1754
23,2000-01-18,Ishigaki,1.391304,石垣島,24.4064,124.1754
3,2000-01-18,Naha,2.285714,那覇,26.2126,127.679
24,2000-01-19,Ishigaki,1.782609,石垣島,24.4064,124.1754
46,2000-01-19,Miyako-jima,1.0,宮古島,24.7674,125.3247
4,2000-01-19,Naha,2.714286,那覇,26.2126,127.679
5,2000-01-20,Naha,3.142857,那覇,26.2126,127.679


In [17]:
# Write the final table to disk without the row index.
merged.to_csv(path_or_buf="data/completed.csv", index=False)