In [1]:
!pip install pandas



# Prepare Bloom CSV

In [1]:
import pandas as pd
from datetime import date


def ParseRawBloomData(file, cities):
    # Open and read the contents of the page
    with open(file, 'r') as file:
        lines = file.readlines()

    header = [colheader.strip() for colheader in lines[0].strip().split()]
    
    # Initialize a dictionary to store the data
    data = {}    

    for line in lines[1:]:
        words = [word.strip() for word in line.strip().split()]

        # Only includes the data of the specified cities
        if words[0] not in cities:
            continue
        if words[0] not in data:
            data[words[0]] = {}

        column = 1
        month = 0
        day = 0

        # Adds data to the dictionary
        for i in range(1, len(words), 1):
            word = words[i]

            # Once reaches these columns, finish
            if header[column].lower() == "normal_value" or header[column].lower() == "alternative_event":
                break

            # Parses data into the columns
            if word == "-":
                column = column + 1 
            elif word.isdigit():
                if month == 0:
                    month = int(word)
                    continue
                else:
                    day = int(word)

                # Converts month-day pair to day-of-year    
                data[words[0]][header[column]] = date(int(header[column]), month, day).timetuple().tm_yday

                month = 0
                day = 0
                column = column + 1
    
    return data           

In [3]:
def PrepareBloomCSVFile(rawdatafiles, whitelistedCities, filename):

    # Parses all the data points together from the files
    totalData = []
    for file in rawdatafiles:
        partData = ParseRawBloomData("raw_data/" + file, whitelistedCities)
        # Transpose so that the columns would contain the years, and rows the cities
        dataframe = pd.DataFrame.from_dict(partData).fillna(-1).astype(int).T
        totalData.append(dataframe)


    # Create a DataFrame from the parsed data
    # Concatentates based on the horizontal axis (column based), since years are part of the columns
    output = pd.concat(totalData, axis=1)
    output.to_csv(filename)
    print(output)

    
def PrepareBloomFiles():
    # Define the list of cities to include in the csv
    whitelistedCities = ["Sapporo", "Sendai", "Tokyo", "Nagoya", "Osaka", "Kyoto", "Hiroshima", "Matsuyama", "Fukuoka"]
    
    # Defines raw data files to parse and concatenate
    # Sourced from: https://www.data.jma.go.jp/sakura/data/sakura004_01.html
    rawdatafiles1 = ["rawdata_full_bloom_1961_1970.txt", "rawdata_full_bloom_1971_1980.txt", 
                     "rawdata_full_bloom_1981_1990.txt", "rawdata_full_bloom_1991_2000.txt", 
                     "rawdata_full_bloom_2001_2010.txt", "rawdata_full_bloom_2011_2020.txt", 
                     "rawdata_full_bloom_2021_2022.txt"]
    # Sourced from: https://www.data.jma.go.jp/sakura/data/sakura003_01.html
    rawdatafiles2 = ["rawdata_bloom_1961_1970.txt", "rawdata_bloom_1971_1980.txt", 
                     "rawdata_bloom_1981_1990.txt", "rawdata_bloom_1991_2000.txt", 
                     "rawdata_bloom_2001_2010.txt", "rawdata_bloom_2011_2020.txt", 
                     "rawdata_bloom_2021_2022.txt"]
    # Note: Missing bloom day values should be manually estimated based on other cities' bloom days in similar geographic location.
    
    PrepareBloomCSVFile(rawdatafiles1, whitelistedCities, "full_bloom_data.csv")
    PrepareBloomCSVFile(rawdatafiles2, whitelistedCities, "bloom_data.csv")

    
PrepareBloomFiles()

           1961  1962  1963  1964  1965  1966  1967  1968  1969  1970  ...  \
Sapporo     129   125   125   128   136   131   124   125   130   129  ...   
Sendai      106   111   108   110   115   108   108   105   110   115  ...   
Nagoya       96   102    96    98   106    93    95    96    97   102  ...   
Tokyo        98    99    96    96   102    95    95    94   100   105  ...   
Kyoto        96    97    99    97   107    96    96    96   103   105  ...   
Hiroshima    96   102    98    97   107    97    98    98   103   104  ...   
Osaka        95    99   103   106    95    93    94   101   104    94  ...   
Matsuyama    95   101   105    95   100    95    93    95   101   102  ...   
Fukuoka      93    98    97    95    98    94    94    94    97    98  ...   

           2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  
Sapporo     137   121   116   122   123   119   119   123   117   115  
Sendai      105   101    99    97   103    94   100    94    90   101  
Nag

# Prepare City Temperatures CSV

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import chardet # Used to determine the encoding scheme of a given file


def StripAndConcatCityTemperatureCSVs(city, fileParts):
    directory = "temperatures_data/" + city.lower()
    if not os.path.exists(directory):
        raise Exception("Dependent city directory is missing.")
    
    # Processes temperature data together from the files
    totalData = []
    for fileName in fileParts:
        directory = "temperatures_data/" + city.lower() + "/" + fileName
        
        # Open and prepares the contents of the csv, ignoring the first 5 rows
        dataframe = pd.read_csv(directory, skiprows=4, encoding="SHIFT_JIS")
        
        # Creates new heading and removes unnecessary columns
        dataframe = dataframe.iloc[:, :-2]
        dataframe.columns = ["year", "temperature"]

        # Adds day-of-year column and truncates year column
        dataframe.insert(loc=1, column="day", value=dataframe["year"])
        dataframe["day"] = dataframe["day"].apply(lambda x: datetime.strptime(x, '%Y/%m/%d').timetuple().tm_yday)
        dataframe["year"] = dataframe["year"].apply(lambda x: x[:4])
        
        totalData.append(dataframe)
        
    # Concatentates based on the vertical axis (row based)  
    output = pd.concat(totalData, axis=0)
    output = output.fillna(method="bfill")
    
    return output


def AddFeaturesToDataFrame(dataframe):
    dataframe.loc[:,"week_mean"] = dataframe.loc[:,"temperature"].rolling(window=7, min_periods=1).mean()
    dataframe.loc[:,"2week_mean"] = dataframe.loc[:,"temperature"].rolling(window=14, min_periods=1).mean()
    dataframe.loc[:,"4week_mean"] = dataframe.loc[:,"temperature"].rolling(window=28, min_periods=1).mean()
    dataframe.loc[:,"8week_mean"] = dataframe.loc[:,"temperature"].rolling(window=56, min_periods=1).mean()
    dataframe.loc[:,"16week_mean"] = dataframe.loc[:,"temperature"].rolling(window=112, min_periods=1).mean()
    
    # Total Temperature Accumulation
    dataframe['sum'] = pd.Series(dtype=float)
    accum = 0
    for i in range(len(dataframe)):
        if dataframe.iloc[i, dataframe.columns.get_loc("day")] == 1:
            accum = 0
        accum = accum + dataframe.iloc[i, dataframe.columns.get_loc("temperature")]
        dataframe.iloc[i, dataframe.columns.get_loc("sum")] = accum
        
    return dataframe


def AddTargetToDataFrame(city, dataframe, targetfilename, targetname):
    if not os.path.exists(targetfilename):
        raise Exception("Dependent "+ targetfilename +" is missing.")
        
    # Adds bloom status
    dataframe[targetname] = np.zeros(len(dataframe), dtype=int)
    targetbloom = pd.read_csv(targetfilename, index_col=False)
    row = targetbloom.loc[targetbloom["Unnamed: 0"] == city].drop(targetbloom.columns[0], axis=1)
    
    # How many days left until the bloom day. Negative values after the bloom day (Regression)
    runningBloomDay = 0
    for i in range(len(dataframe)):
        day = dataframe.iloc[i, dataframe.columns.get_loc("day")]
        if day == 1:
            runningBloomDay = row[str(dataframe.iloc[i, dataframe.columns.get_loc("year")])]
        dataframe.iloc[i, dataframe.columns.get_loc(targetname)] = runningBloomDay - day
    
    return dataframe

In [15]:
def PrepareHistoricCityTemperaturesCSVFile(cityselect):
    # Defines csv temperature data from the Japan's Meteorlogical Agency 
    # Sourced from: https://www.data.jma.go.jp/gmd/risk/obsdl/index.php
    historyFileParts = ["temperature_data_1961_1990.csv", "temperature_data_1991_2020.csv", "temperature_data_2021_2022.csv"]
    recentFileParts = ["temperature_data_2021_2022.csv", "temperature_data_2023_present.csv"]
    
    # Define the list of cities prepare
    # Here for documentation purposes
    whitelistedCities = ["Sapporo", "Sendai", "Tokyo", "Nagoya", "Osaka", "Kyoto", "Hiroshima", "Matsuyama", "Fukuoka"]
    
    
    
    # Creates history dataframe and csv
    dataframe = StripAndConcatCityTemperatureCSVs(cityselect, historyFileParts)
    dataframe = AddTargetToDataFrame(cityselect, dataframe, "bloom_data.csv", "bloom")
    dataframe = AddTargetToDataFrame(cityselect, dataframe, "full_bloom_data.csv", "full_bloom")
    dataframe = AddFeaturesToDataFrame(dataframe)
    # dataframe = dataframe[dataframe['year'] != "1961"]
    dataframe.to_csv("historic.csv", index=False)
    print(dataframe)
    
    # Creates csv for the current data used for prediction
    predict = StripAndConcatCityTemperatureCSVs(cityselect, recentFileParts)
    predict = AddFeaturesToDataFrame(predict)
    # predict.to_csv("predict.csv", index=False)
    filteredPredict = predict[predict['year'] == "2023"]
    filteredPredict.to_csv("predict.csv", index=False)

    
PrepareHistoricCityTemperaturesCSVFile("Sapporo")

     year  day  temperature  bloom  full_bloom  week_mean  2week_mean  \
0    1961    1         -6.9    123         128  -6.900000   -6.900000   
1    1961    2         -7.0    122         127  -6.950000   -6.950000   
2    1961    3         -3.0    121         126  -5.633333   -5.633333   
3    1961    4         -3.2    120         125  -5.025000   -5.025000   
4    1961    5         -6.6    119         124  -5.340000   -5.340000   
..    ...  ...          ...    ...         ...        ...         ...   
725  2022  361          1.5   -248        -246   1.000000   -1.907143   
726  2022  362          0.8   -249        -247   1.400000   -1.585714   
727  2022  363         -2.3   -250        -248   1.171429   -1.392857   
728  2022  364         -1.8   -251        -249   0.800000   -1.164286   
729  2022  365          0.3   -252        -250   0.471429   -0.892857   

     4week_mean  8week_mean  16week_mean  accumulated  
0     -6.900000   -6.900000    -6.900000         -6.9  
1     -6.95

In [5]:
def PreparePredictCityTemperaturesCSVFile(cityselect):
    # Defines csv temperature data from the Japan's Meteorlogical Agency 
    # Sourced from: https://www.data.jma.go.jp/gmd/risk/obsdl/index.php
    recentFileParts = ["temperature_data_2021_2022.csv", "temperature_data_2023_present.csv"]
    
    # Define the list of cities prepare
    # Here for documentation purposes
    whitelistedCities = ["Sapporo", "Sendai", "Tokyo", "Nagoya", "Osaka", "Kyoto", "Hiroshima", "Matsuyama", "Fukuoka"]

    
    # Creates csv for the current data used for prediction
    predict = StripAndConcatCityTemperatureCSVs(cityselect, recentFileParts)
    predict = AddFeaturesToDataFrame(predict)
    # predict.to_csv("predict.csv", index=False)
    filteredPredict = predict[predict['year'] == "2023"]
    filteredPredict.to_csv("predict.csv", index=False)

    
PreparePredictCityTemperaturesCSVFile("Sapporo")