# Dylan

### Notes
**Ideas:**  
Split the data into blocks; maybe store the data three-dimensionally.  

**Dimensions:**  
- Location
- Block (10 sequential days)
- Day
- Weather data
  
**Test Train Split Legend:**  
Source  
[[4, 5, 1, 9],  
 [2, 9, 5, 6],  
 [9, 5, 1, 8],  
 [8, 1, 2, 7]]  
Features (X) → Target (Y)  
[[4, 5, 1],  → [[9],  
 [2, 9, 5],  →  [6],  
 [9, 5, 1],  →  [8],  
 [8, 1, 2]]  →  [7]]  

### Imports

In [1]:
import pandas as pd
#import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn as sk
from datetime import datetime, date
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Load the raw observations and discard the columns that either have a large
# amount of missing data or are not suitable for machine learning.
raw_data = pd.read_csv('./weatherAUS.csv').drop(
	columns=['Sunshine', 'Evaporation', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
)
raw_data

### Date conversion

In [None]:
def to_iso_date(_date: str) -> str:
	'''Reformats a dd-mm-yyyy date string as yyyy-mm-dd'''
	parsed = datetime.strptime(_date, '%d-%m-%Y')
	return parsed.strftime("%Y-%m-%d")

def convert_date_to_day_index(_date: str):
	'''Returns how many days the yyyy-mm-dd date lies after 2000-01-01 (negative for earlier dates)'''
	epoch = date(2000, 1, 1)
	day = datetime.strptime(_date, '%Y-%m-%d').date()
	return (day - epoch).days

def extract_year(_date: str) -> int:
	'''Parses a yyyy-mm-dd date string and returns its year'''
	parsed = datetime.strptime(_date, '%Y-%m-%d')
	return parsed.year

def extract_month(_date: str) -> int:
	'''Parses a yyyy-mm-dd date string and returns its month (1-12)'''
	parsed = datetime.strptime(_date, '%Y-%m-%d')
	return parsed.month

def extract_day(_date: str) -> int:
	'''Parses a yyyy-mm-dd date string and returns its day of the month'''
	parsed = datetime.strptime(_date, '%Y-%m-%d')
	return parsed.day

# Work on a copy of the raw data: normalise Date to ISO format first, then
# derive the numeric date features from the normalised string.
redated = raw_data.assign(Date=raw_data['Date'].apply(to_iso_date))
redated = redated.assign(
	DayIndex=redated['Date'].apply(convert_date_to_day_index),
	Year=redated['Date'].apply(extract_year),
	Month=redated['Date'].apply(extract_month),
	Day=redated['Date'].apply(extract_day),
)
redated

### Reconfigure dataframe
https://pandas.pydata.org/docs/user_guide/advanced.html

In [None]:
def reconfigure(_df: pd.DataFrame, block_size=5):
	'''Splits each location's rows into blocks and applies a (Location, Date, Block, Id) multiIndex.

	Rows are visited per location in their original order. A new block starts when the
	current block already holds block_size rows, or when DayIndex jumps by more than one
	from the previous row of the same location. Block numbers restart at 0 for every
	location, and Id is the position of the row inside its block.

	Parameters:
		_df: DataFrame containing 'Location', 'Date' and 'DayIndex' columns.
		block_size: Maximum number of rows in one block.

	Returns:
		A new DataFrame without the 'Date' and 'Location' columns, indexed by the
		multiIndex. _df itself is not modified.
	'''
	day_index = _df['DayIndex'].to_numpy()

	# One slot per row, written by row position. groupby yields the locations in
	# sorted key order, which is not necessarily the row order of _df, so simply
	# appending labels while iterating would attach them to the wrong rows.
	block = [0] * len(_df)
	ident = [0] * len(_df)

	# .indices maps every location to the integer positions of its rows
	for positions in _df.groupby('Location').indices.values():
		block_num = 0
		id_num = 0
		prev = day_index[positions[0]]

		for pos in positions:
			current = day_index[pos]

			# Start a new block when the current one is full or a day is missing
			if id_num == block_size or (current - prev > 1):
				block_num += 1
				id_num = 0  # Reset ID within the block

			block[pos] = block_num
			ident[pos] = id_num

			# Update variables for the next iteration
			id_num += 1
			prev = current

	# Create the multiIndex
	index = pd.MultiIndex.from_arrays(
		[_df['Location'], _df['Date'], block, ident],
		names=['Location', 'Date', 'Block', 'Id']
	)

	# Remove the columns that now live in the index and apply the multiIndex
	return _df.drop(columns=['Date', 'Location']).set_index(index)

# Group each location's rows into blocks (reconfigure's default block_size is 5)
# and index the result by Location, Date, Block, and Id
reconfigured = reconfigure(redated)
reconfigured

#locations = multi_indexed.index.get_level_values('Location').drop_duplicates().to_list()
#albury = multi_indexed.xs('Albury')


### Missing values

In [None]:
# Number of missing values per column, followed by summary statistics
print(reconfigured.isna().sum())
reconfigured.describe()

In [None]:
# Fill missing data
data_filled = reconfigured.copy()
data_filled.fillna({'MinTemp': data_filled['MinTemp'].interpolate()}, inplace=True)
data_filled.fillna({'MaxTemp': data_filled['MaxTemp'].interpolate()}, inplace=True)
data_filled.fillna({'Temp9am': data_filled['Temp9am'].interpolate()}, inplace=True)
data_filled.fillna({'Temp3pm': data_filled['Temp3pm'].interpolate()}, inplace=True)
data_filled.fillna({'Rainfall': data_filled['Rainfall'].interpolate()}, inplace=True)
data_filled.fillna({'WindGustSpeed': data_filled['WindGustSpeed'].interpolate()}, inplace=True)
data_filled.fillna({'WindSpeed9am': data_filled['WindSpeed9am'].interpolate()}, inplace=True)
data_filled.fillna({'WindSpeed3pm': data_filled['WindSpeed3pm'].interpolate()}, inplace=True)
data_filled.fillna({'Humidity9am': data_filled['Humidity9am'].interpolate()}, inplace=True)
data_filled.fillna({'Humidity3pm': data_filled['Humidity3pm'].interpolate()}, inplace=True)
data_filled.fillna({'Pressure9am': data_filled['Pressure9am'].interpolate()}, inplace=True)
data_filled.fillna({'Pressure3pm': data_filled['Pressure3pm'].interpolate()}, inplace=True)
data_filled.fillna({'Cloud9am': 0}, inplace=True)
data_filled.fillna({'Cloud3pm': 0}, inplace=True)

In [None]:
# Re-check the missing-value counts and summary statistics after filling
print(data_filled.isna().sum())
data_filled.describe()

### Visualisation

In [None]:
# Box plots of every measurement family on a 2x4 grid (the last two axes stay empty)
visualisation_data = data_filled.copy()
fig, axs = plt.subplots(2, 4, figsize=(22, 20), sharey=False)

# Temperature: MinTemp, MaxTemp, Temp9am, Temp3pm
temperature_labels = ['MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm']
temperatures = visualisation_data[temperature_labels]
axs[0, 0].boxplot(temperatures, tick_labels=temperature_labels)
axs[0, 0].set_title('Temperature')

# Rainfall (single column, so no tick labels are passed)
rainfall = visualisation_data['Rainfall']
axs[0, 1].boxplot(rainfall)
axs[0, 1].set_title('Rainfall')

# Wind speed: WindGustSpeed, WindSpeed9am, WindSpeed3pm
wind_speed_labels = ['WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']
wind_speed = visualisation_data[wind_speed_labels]
axs[0, 2].boxplot(wind_speed, tick_labels=wind_speed_labels)
axs[0, 2].set_title('Wind speed')

# Humidity: Humidity9am, Humidity3pm
humidity_labels = ['Humidity9am', 'Humidity3pm']
humidity = visualisation_data[humidity_labels]
axs[0, 3].boxplot(humidity, tick_labels=humidity_labels)
axs[0, 3].set_title('Humidity')

# Pressure: Pressure9am, Pressure3pm
pressure_labels = ['Pressure9am', 'Pressure3pm']
pressure = visualisation_data[pressure_labels]
axs[1, 0].boxplot(pressure, tick_labels=pressure_labels)
axs[1, 0].set_title('Pressure')

# Cloud: Cloud9am, Cloud3pm
cloud_labels = ['Cloud9am', 'Cloud3pm']
cloud = visualisation_data[cloud_labels]
axs[1, 1].boxplot(cloud, tick_labels=cloud_labels)
axs[1, 1].set_title('Cloud')

## Year Location
#axs[1][2].set_title('Year range per location')
#axs[1][2].scatter(visualisation_data['Year'], visualisation_data['Location'])

## Month Day
#axs[1][3].set_title('Total day range per location')
#date = visualisation_data['Year'] * 353 + (visualisation_data['Month'] - 1) * 32 + visualisation_data['Day']
#axs[1][3].scatter(date, visualisation_data['Location'])

plt.show()

In [None]:
# Pairwise correlation of all filled columns, drawn as an annotated heatmap
corr_matrix = data_filled.corr()
plt.figure(figsize=(12, 10))
sb.heatmap(corr_matrix, annot=True, cmap='viridis', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()

### Training

Create training, validation, and test data