## Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import findspark
findspark.init()

from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark import SparkContext

## Data Loading

In [2]:
location = pd.read_csv('locations.csv')
location.head()

Unnamed: 0,ID,Location
0,401,Wakkanai
1,406,Rumoi
2,407,Asahikawa
3,409,Abashiri
4,412,Sapporo


In [17]:
"""
convert_to_datetime

entry: Integer of the month-date, written in format mmdd.
year: Integer of the year, extracted by the column name. 
"""
def convert_to_datetime(entry, year):
    date_str = str(int(entry)).zfill(4)
    month = int(date_str[:2])
    day = int(date_str[2:])
    return datetime(int(year), month, day)

## Create a function that will return the cleaned dataframe 

In [None]:
"""
clean_data

Cleans the csv file in the format to a data analysis ready dataframe. 

Currently only creates a dataframe for the observed dates. 
No table for the remarks. No sanity check for checking if locations are matched correctly. 

file_path: String for the file path. 
"""

def clean_data(file_path):
    df_jp = pd.read_csv(file_path, encoding='cp932') # Encoding to correctly read Japanese characters
    df_jp['Site Name'] = location['Location']

    # could add a sanity check to make sure the location ids are matching...?

    df_eng = df_jp.drop(['地点名', '番号'], axis=1)
    cols = df_eng.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df_eng = df_eng[cols]
    
    # Partition the data to two tables
    df_date = df_eng[df_eng.columns[1:][::2][:-4].insert(0, 'Site Name')]
    df_rmk = df_eng[df_eng.columns[::2][:-2]]
    
    # Convert the times to datetime objects
    years = df_date.columns[1:]
    for year in years:
        df_date[year] = df_date[year].apply(lambda x: convert_to_datetime(x, year) if x != 0 else 0)
        
    # Check if the observations are still made or not. 
    prev_yr = datetime.today().year -1

    # Get the column for the previous year and if the value is 0, then 
    prev_yr_dates = df_date[str(prev_yr)]

    df_date['Currently Being Observed'] = prev_yr_dates.apply(lambda x: True if x != 0 else False)
    reorder = ['Site Name', 'Currently Being Observed'] + list(years)
    return df_date[reorder]

In [3]:
df_jp = pd.read_csv("../nodafuji_full_bloom.csv", encoding='cp932')
df_jp.head()

Unnamed: 0,番号,地点名,1953,rm,1954,rm.1,1955,rm.2,1956,rm.3,...,2032,rm.79,平年値,rm.80,最早値,rm.81,最早年,最晩値,rm.82,最晩年
0,401,稚内,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,406,留萌,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,407,旭川,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,409,網走,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,412,札幌,529,8,603,8,606,8,523,8,...,0,0,0,0,503,8,1966,606,8,1955


## Add Site Name in English

In [4]:
wisteria_jp['Site Name'] = location['Location']
wisteria_eng = wisteria_jp.drop(['地点名', '番号'], axis=1)
wisteria_eng = wisteria_eng.rename(columns={'平年値': 'Avg', '最早値': 'Min Date', '最早年': 'Min Year', '最晩値': 'Max Date', '最晩年': 'Max Year'})

In [5]:
# Reorder Columns 
cols = wisteria_eng.columns.tolist() 
cols = cols[-1:] + cols[:-1]
wisteria_eng = wisteria_eng[cols]

In [6]:
wisteria_eng

Unnamed: 0,Site Name,1953,rm,1954,rm.1,1955,rm.2,1956,rm.3,1957,...,2032,rm.79,Avg,rm.80,Min Date,rm.81,Min Year,Max Date,rm.82,Max Year
0,Wakkanai,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rumoi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Asahikawa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Abashiri,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Sapporo,529,8,603,8,606,8,523,8,524,...,0,0,0,0,503,8,1966,606,8,1955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Miyako-jima,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,Kume-jima,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99,Naha,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,Nago,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#wisteria_eng.replace(0, np.nan, inplace=True)

In [None]:
wisteria_eng

Since we know the last columns, we can replace the rm.# by more informative names coutning from there. To make the code more general. 

In [8]:
# Partition the data to two tables
wisteria_date = wisteria_eng[wisteria_eng.columns[1:][::2][:-4].insert(0, 'Site Name')]
wisteria_rmk = wisteria_eng[wisteria_eng.columns[::2][:-2]]
# For the remark, need to remove the columns that are associate with the stats 

In [55]:
def convert_to_datetime(entry, year):
    date_str = str(int(entry)).zfill(4)
    month = int(date_str[:2])
    day = int(date_str[2:])
    return datetime(int(year), month, day)

In [56]:
years = wisteria_date.columns[1:]

In [57]:
for year in years:
    wisteria_date[year] = wisteria_date[year].apply(lambda x: convert_to_datetime(x, year) if x != 0 else 0)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'datetime.datetime'

In [58]:
wisteria_date

Unnamed: 0,Site Name,1953,1954,1955,1956,1957,1958,1959,1960,1961,...,2024,2025,2026,2027,2028,2029,2030,2031,2032,Currently Being Observed
0,Wakkanai,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
1,Rumoi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
2,Asahikawa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
3,Abashiri,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
4,Sapporo,1953-05-29 00:00:00,1954-06-03 00:00:00,1955-06-06 00:00:00,1956-05-23 00:00:00,1957-05-24 00:00:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Miyako-jima,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
98,Kume-jima,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
99,Naha,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
100,Nago,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False


In [59]:
# Check if the observations are still made or not. 
prev_yr = datetime.today().year -1

# Get the column for the previous year and if the value is 0, then 
prev_yr_dates = wisteria_date[str(prev_yr)]

wisteria_date['Currently Being Observed'] = prev_yr_dates.apply(lambda x: True if x != 0 else False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wisteria_date['Currently Being Observed'] = prev_yr_dates.apply(lambda x: True if x != 0 else False)


In [60]:
reorder = ['Site Name', 'Currently Being Observed'] + list(years)
wisteria_date[reorder]

Unnamed: 0,Site Name,Currently Being Observed,1953,1954,1955,1956,1957,1958,1959,1960,...,2024,2025,2026,2027,2028,2029,2030,2031,2032,Currently Being Observed.1
0,Wakkanai,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
1,Rumoi,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
2,Asahikawa,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
3,Abashiri,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
4,Sapporo,False,1953-05-29 00:00:00,1954-06-03 00:00:00,1955-06-06 00:00:00,1956-05-23 00:00:00,1957-05-24 00:00:00,0,0,0,...,0,0,0,0,0,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Miyako-jima,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
98,Kume-jima,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
99,Naha,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
100,Nago,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False


In [61]:
# Write this to csv. 
# Finished building basic code for converting the data, now want to make this more general.

# Then use the data for EDA and building models. 
# the Cherry blosom data has the type of the cherry blossom. Does this slightly differ between tables? 

## Todo 

- Reassign IDs
- Currently Being Observed (<- influence from other person)
    - if the column for the previous year (from today) is filled, still observed. Otherwise, not observed. 
- Remove the columns with the stats
- Create two tables: One for dates and one for the remarks. 


## Steps
1. Convert the cities to English using the locations.csv
2. 

## Write a fn that takes the file and creates a clean data

Ref for data format: https://www.kaggle.com/datasets/ryanglasnapp/japanese-cherry-blossom-data/data

Cols: Site Name, Currently Being Observed (For future data parsing check current year and check previous year values), [Col for years], Avg, Median, Max, Min

If value is 0, set it to none. 
For non-zero entries, get the corresponding column years. 

In [None]:
def 

## Create csv file for location

Should I create a separate file for each location? -> no

I want a parser for all of the data. I should be all in the same format, so if I can write code for that it'll be great. 

In [None]:
"""
Have the column names from the data frame. 
It's not too big, can probabably parse just using a for loop

"""
def parser():
    