In [48]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [17]:
# Load the cleaned per-city sakura table; the path is relative to the
# notebook's working directory.
SAKURA_CSV = "sakura_median_cleaned.csv"
df = pd.read_csv(SAKURA_CSV)
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,city_eng,start_median,full_median,bloom_median,bloom_mean,city,latitude,longitude,march_avg_temp
0,Wakkanai,2000-05-08,2000-05-11,2 days 00:00:00.000000000,2 days 08:50:31.578947368,稚内,45.4156,141.6734,-0.494737
1,Asahikawa,2000-04-16,2000-04-20,4 days 00:00:00.000000000,4 days 08:50:31.578947368,旭川,43.7709,142.365,-1.263158
2,Abashiri,2000-04-20,2000-04-25,4 days 00:00:00.000000000,3 days 22:44:12.631578947,網走,44.0206,144.2735,-1.057895
3,Sapporo,2000-05-03,2000-05-04,2 days 00:00:00.000000000,2 days 00:00:00.000000000,札幌,43.0618,141.3545,1.294737
4,Obihiro,2000-03-30,2000-04-05,7 days 00:00:00.000000000,7 days 06:18:56.842105263,帯広,42.9233,143.1972,-0.136842


# Build our template DataFrame: one row for every day

In [135]:
# Template frame: no data columns, just an index of consecutive calendar dates
# named 'day_of_year'. Offsets 1..199 are added to 2000-01-01, so the index
# runs from 2000-01-02 through 2000-07-18.
base_date = datetime(2000, 1, 1)
template_dates = [base_date + timedelta(days=offset) for offset in range(1, 200)]
days = pd.DataFrame({'day_of_year': template_dates}).set_index('day_of_year')
days.head()

2000-01-02
2000-01-03
2000-01-04
2000-01-05
2000-01-06


In [136]:
# Bounds of the blooming season across all cities: the earliest bloom start
# and the latest full bloom.
#
# read_csv loads these columns as plain strings, and the cell that converts
# them in place sits further down the notebook. Parsing here makes the bounds
# real Timestamps on a fresh top-to-bottom run instead of relying on the
# conversion cell having been executed first.
first_day = pd.to_datetime(df.start_median).min()
last_day = pd.to_datetime(df.full_median).max()
print(f"Between {first_day} and {last_day}")

Between 2000-01-15 00:00:00 and 2000-05-19 00:00:00


In [143]:
# Trim the template to the blooming season, inclusive on both ends.
# The lower bound is `first_day`, computed together with `last_day`. The
# previous name `first_date` is not assigned anywhere in the visible notebook,
# so this cell raised NameError on a fresh kernel.
in_season = (days.index >= first_day) & (days.index <= last_day)
days = days[in_season]
days.index

DatetimeIndex(['2000-01-15', '2000-01-16', '2000-01-17', '2000-01-18',
               '2000-01-19', '2000-01-20', '2000-01-21', '2000-01-22',
               '2000-01-23', '2000-01-24',
               ...
               '2000-05-10', '2000-05-11', '2000-05-12', '2000-05-13',
               '2000-05-14', '2000-05-15', '2000-05-16', '2000-05-17',
               '2000-05-18', '2000-05-19'],
              dtype='datetime64[ns]', name='day_of_year', length=126, freq=None)

# Crosstabs

In [147]:
# The crosstabs below use these columns as row labels; they must be real
# datetimes so those labels line up with the DatetimeIndex of `days`.
for date_col in ('start_median', 'full_median'):
    df[date_col] = pd.to_datetime(df[date_col])

In [149]:
# Date x city count tables. Empty cells (count 0) become NaN so that only the
# marker days carry a value. Bloom start keeps the count of 1, while full bloom
# is rewritten from 1 to 10: the two markers stay distinguishable and give a
# convenient 1..10 range to scale later.
start_counts = pd.crosstab(index=df['start_median'], columns=df['city_eng'])
finish_counts = pd.crosstab(index=df['full_median'], columns=df['city_eng'])

start = start_counts.replace({0: np.nan})
finish = finish_counts.replace({0: np.nan, 1: 10})

In [150]:
# Preview the bloom-start markers (first five dates).
start.head(n=5)

city_eng,Abashiri,Akita,Aomori,Asahikawa,Choshi,Fukui,Fukuoka,Fukushima,Gifu,Hakodate,...,Tokyo,Tottori,Toyama,Tsu,Utsunomiya,Wakayama,Wakkanai,Yamagata,Yokohama,Ōita
start_median,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-15,,,,,,,,,,,...,,,,,,,,,,
2000-01-17,,,,,,,,,,,...,,,,,,,,,,
2000-01-19,,,,,,,,,,,...,,,,1.0,,,,,,
2000-01-21,,,,,,,,,,,...,,,,,,,,,,
2000-01-22,,,,,,,,,,,...,,,,,,,,,,


In [151]:
# Preview the full-bloom markers (first ten dates).
finish.head(n=10)

city_eng,Abashiri,Akita,Aomori,Asahikawa,Choshi,Fukui,Fukuoka,Fukushima,Gifu,Hakodate,...,Tokyo,Tottori,Toyama,Tsu,Utsunomiya,Wakayama,Wakkanai,Yamagata,Yokohama,Ōita
full_median,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-02-02,,,,,,,,,,,...,,,,,,,,,,
2000-02-05,,,,,,,,,,,...,,,,,,,,,,
2000-02-09,,,,,,,,,,,...,,,,,,,,,,
2000-03-28,,,,,,,,,,,...,,,,,,,,,,
2000-03-29,,,,,,,,,,,...,,,,,,,,,,10.0
2000-03-30,,,,,,,,,,,...,,,,,,,,,,
2000-03-31,,,,,,,,,,,...,,,10.0,,,,,,,
2000-04-01,,,10.0,,,,,,,,...,,10.0,,,10.0,,,,,
2000-04-02,,,,,,,,,,,...,,,,10.0,,,,10.0,,
2000-04-03,,,,,,,,,10.0,,...,,,,,,,,,,


# Combine

In [154]:
# Lay the start (1) and finish (10) markers onto the daily template, then fill
# the days between each city's two markers by interpolation.
# limit_area='inside' restricts filling to gaps that have a value on both
# sides, so days before the start marker and after the finish marker stay NaN.
# The date index is then moved into a regular column; reset_index names it
# 'index', which is renamed to 'date'.
combined = (
    days
    .combine_first(start)
    .combine_first(finish)
    .interpolate(limit_area='inside')
    .reset_index()
    .rename(columns={'index': 'date'})
)
combined.head(10)

city_eng,date,Abashiri,Akita,Aomori,Asahikawa,Choshi,Fukui,Fukuoka,Fukushima,Gifu,...,Tokyo,Tottori,Toyama,Tsu,Utsunomiya,Wakayama,Wakkanai,Yamagata,Yokohama,Ōita
0,2000-01-15,,,,,,,,,,...,,,,,,,,,,
1,2000-01-16,,,,,,,,,,...,,,,,,,,,,
2,2000-01-17,,,,,,,,,,...,,,,,,,,,,
3,2000-01-18,,,,,,,,,,...,,,,,,,,,,
4,2000-01-19,,,,,,,,,,...,,,,1.0,,,,,,
5,2000-01-20,,,,,,,,,,...,,,,1.121622,,,,,,
6,2000-01-21,,,,,,,,,,...,,,,1.243243,,,,,,
7,2000-01-22,,,,,,,,,,...,,,,1.364865,,,,,,
8,2000-01-23,,,,,,,,,,...,,,,1.486486,,,,,,
9,2000-01-24,,,,,,,,,,...,,,,1.608108,,,,,,


In [164]:
# Reshape wide (one column per city) to long (one row per date/city pair),
# discard the pairs that have no value, and order the rows by date.
long_form = combined.melt(id_vars=['date'])
with_values = long_form.dropna()
melted = with_values.sort_values(by='date')
melted.head(10)

Unnamed: 0,date,city_eng,value
1260,2000-01-15,Hikone,1.0
1261,2000-01-16,Hikone,1.428571
5420,2000-01-17,Sendai,1.0
1262,2000-01-17,Hikone,1.857143
1263,2000-01-18,Hikone,2.285714
5421,2000-01-18,Sendai,1.391304
1264,2000-01-19,Hikone,2.714286
5422,2000-01-19,Sendai,1.782609
6430,2000-01-19,Tsu,1.0
5423,2000-01-20,Sendai,2.173913


# I'm lazy, so I want the lat-lons to come along with each row

In [165]:
# Per-city lookup table: everything in `df` except the bloom timing columns.
timing_cols = ['start_median', 'full_median', 'bloom_median', 'bloom_mean']
subset = df.drop(columns=timing_cols)
subset.head()

Unnamed: 0,city_eng,city,latitude,longitude,march_avg_temp
0,Wakkanai,稚内,45.4156,141.6734,-0.494737
1,Asahikawa,旭川,43.7709,142.365,-1.263158
2,Abashiri,網走,44.0206,144.2735,-1.057895
3,Sapporo,札幌,43.0618,141.3545,1.294737
4,Obihiro,帯広,42.9233,143.1972,-0.136842


In [166]:
# Attach the per-city columns from `subset` to every date/city row, matching
# on the English city name (inner join, the pandas default).
merged = melted.merge(right=subset, how='inner', on='city_eng')
merged.head()

Unnamed: 0,date,city_eng,value,city,latitude,longitude,march_avg_temp
0,2000-01-15,Hikone,1.0,彦根,35.2744,136.2597,9.131579
1,2000-01-16,Hikone,1.428571,彦根,35.2744,136.2597,9.131579
2,2000-01-17,Hikone,1.857143,彦根,35.2744,136.2597,9.131579
3,2000-01-18,Hikone,2.285714,彦根,35.2744,136.2597,9.131579
4,2000-01-19,Hikone,2.714286,彦根,35.2744,136.2597,9.131579


In [167]:
# Write the final long-format table to disk, leaving out the row index.
merged.to_csv(path_or_buf="completed.csv", index=False)