## Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import findspark
findspark.init()

from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark import SparkContext

## Data Loading

In [2]:
location = pd.read_csv('locations.csv')
location.head()

Unnamed: 0,ID,Location
0,401,Wakkanai
1,406,Rumoi
2,407,Asahikawa
3,409,Abashiri
4,412,Sapporo


## Create a function that will return the cleaned dataframe 

In [10]:
"""
convert_to_datetime

Converts the int dates to strings.

entry: Integer of the month-date, written in format mmdd.
year: Integer of the year, extracted by the column name. 
"""
def convert_to_datetime(entry, year):
    date_str = str(int(entry)).zfill(4)
    month = str(date_str[:2])
    day = str(date_str[2:])
    return str(year) + '-' + month + '-' + day

In [11]:
"""
clean_data

Cleans the csv file in the format to a data analysis ready dataframe. 

Currently only creates a dataframe for the observed dates. 
No table for the remarks. No sanity check for checking if locations are matched correctly. 

file_path: String for the file path. 
"""

def clean_data(file_path):
    df_jp = pd.read_csv(file_path, encoding='cp932') # Encoding to correctly read Japanese characters
    df_jp['Site Name'] = location['Location']

    # could add a sanity check to make sure the location ids are matching...?

    df_eng = df_jp.drop(['地点名', '番号'], axis=1)
    cols = df_eng.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df_eng = df_eng[cols]
    
    # Partition the data to two tables
    df_date = df_eng[df_eng.columns[1:][::2][:-4].insert(0, 'Site Name')]
    df_rmk = df_eng[df_eng.columns[::2][:-2]]
    
    # Convert the times to datetime objects
    years = df_date.columns[1:]
    for year in years:
        df_date[year] = df_date[year].apply(lambda x: convert_to_datetime(x, year) if x != 0 else 0)
        
    # Check if the observations are still made or not. 
    prev_yr = datetime.today().year -1

    # Get the column for the previous year and if the value is 0, then 
    prev_yr_dates = df_date[str(prev_yr)]

    df_date['Currently Being Observed'] = prev_yr_dates.apply(lambda x: True if x != 0 else False)
    reorder = ['Site Name', 'Currently Being Observed'] + list(years)
    return df_date[reorder]

## Example

In [12]:
test = clean_data("nodafuji_full_bloom.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date[year] = df_date[year].apply(lambda x: convert_to_datetime(x, year) if x != 0 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date['Currently Being Observed'] = prev_yr_dates.apply(lambda x: True if x != 0 else False)


In [13]:
test.to_csv('fuji_cleaned.csv', index=False)