# Dylan

### Ideas
- Split the data into blocks; maybe store the data three-dimensionally.

### Imports

In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime, date
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
file_path = './weatherAUS.csv'

# Columns that either have a large amount of missing data or are not suitable for machine learning
unused_columns = [
    'Sunshine',
    'Evaporation',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
    'RainToday',
    'RainTomorrow',
]

# Load the CSV, then discard the columns listed above.
raw_data = pd.read_csv(file_path)
raw_data = raw_data.drop(columns=unused_columns)

### Date conversion

In [3]:
def to_isodate(start: str):
    """Reformat a ``dd-mm-yyyy`` date string as ``yyyy-mm-dd``.

    Raises ``ValueError`` (from ``datetime.strptime``) when ``start`` does not
    match the ``dd-mm-yyyy`` pattern.
    """
    parsed = datetime.strptime(start, '%d-%m-%Y')
    return parsed.strftime("%Y-%m-%d")

def convert_date_to_day(_date: str):
    """Return how many days ``_date`` (``yyyy-mm-dd``) lies after 2000-01-01.

    Dates earlier than 2000-01-01 give a negative count. Raises ``ValueError``
    (from ``datetime.strptime``) when ``_date`` does not match ``yyyy-mm-dd``.
    """
    epoch = date(2000, 1, 1)
    parsed = datetime.strptime(_date, '%Y-%m-%d').date()
    return (parsed - epoch).days

# Rewrite 'Date' as yyyy-mm-dd strings, then derive 'Day' (days since
# 2000-01-01) from those same converted strings.
iso_dates = raw_data['Date'].apply(to_isodate)
raw_data['Date'] = iso_dates
raw_data['Day'] = iso_dates.apply(convert_date_to_day)

In [None]:
# Print the frame as it stands after the date-conversion cell, so the
# rewritten 'Date' column and the new 'Day' column can be checked by eye.
print(raw_data)

### Missing values

In [None]:
# Count of missing entries in each column of the raw frame.
raw_missing_counts = raw_data.isnull().sum()
print(raw_missing_counts)

# Summary statistics; left as the cell's last expression so the notebook displays it.
raw_data.describe()

In [6]:
# Fill missing data
data_repaired = raw_data.copy()

# Measurements whose gaps are filled with Series.interpolate() using its
# default settings.
# NOTE(review): each column is interpolated as a whole, in row order, so the
# interpolation does not restart when 'Location' changes — confirm intended.
interpolated_columns = [
    'MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm',
    'Rainfall',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
]
for column in interpolated_columns:
    data_repaired.fillna({column: data_repaired[column].interpolate()}, inplace=True)

# Cloud readings: a missing value is replaced with 0.
data_repaired.fillna({'Cloud9am': 0, 'Cloud3pm': 0}, inplace=True)

In [None]:
# Count of entries still missing in each column after the fill step.
remaining_missing_counts = data_repaired.isnull().sum()
print(remaining_missing_counts)

# Summary statistics; left as the cell's last expression so the notebook displays it.
data_repaired.describe()

### Visualisation

In [None]:
# Overview figure: two scatter panels showing which dates each location
# covers, followed by box plots of every group of numeric measurements.
visualisation_data = data_repaired.copy()
fig, axs = plt.subplots(2, 4, figsize=(22, 20), sharey=False)

# Date Location
# 'Date' holds yyyy-mm-dd *strings*. Matplotlib plots strings on a categorical
# axis whose positions follow order of first appearance, not calendar order,
# which is why this panel did not line up with the numeric 'Day' panel.
# Parsing to real datetimes gives a proper chronological x-axis.
plot_dates = pd.to_datetime(visualisation_data['Date'], format='%Y-%m-%d')
axs[0][0].set_title('Date range per location')
axs[0][0].scatter(plot_dates, visualisation_data['Location'])
# Day Location
axs[0][1].set_title('Day range per location')
axs[0][1].scatter(visualisation_data['Day'], visualisation_data['Location'])

# MinTemp MaxTemp Temp9am Temp3pm
axs[0][2].set_title('Temperature')
temperature_labels = ['MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm']
temperatures = visualisation_data[temperature_labels]
axs[0][2].boxplot(temperatures, tick_labels=temperature_labels)

# Rainfall
axs[0][3].set_title('Rainfall')
rainfall = visualisation_data['Rainfall']
# Label the single box so this panel matches the others.
axs[0][3].boxplot(rainfall, tick_labels=['Rainfall'])

# WindGustSpeed WindSpeed9am WindSpeed3pm
axs[1][0].set_title('Wind speed')
wind_speed_labels = ['WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']
wind_speed = visualisation_data[wind_speed_labels]
axs[1][0].boxplot(wind_speed, tick_labels=wind_speed_labels)

# Humidity9am Humidity3pm
axs[1][1].set_title('Humidity')
humidity_labels = ['Humidity9am', 'Humidity3pm']
humidity = visualisation_data[humidity_labels]
axs[1][1].boxplot(humidity, tick_labels=humidity_labels)

# Pressure9am Pressure3pm
axs[1][2].set_title('Pressure')
pressure_labels = ['Pressure9am', 'Pressure3pm']
pressure = visualisation_data[pressure_labels]
axs[1][2].boxplot(pressure, tick_labels=pressure_labels)

# Cloud9am Cloud3pm
axs[1][3].set_title('Cloud')
cloud_labels = ['Cloud9am', 'Cloud3pm']
cloud = visualisation_data[cloud_labels]
axs[1][3].boxplot(cloud, tick_labels=cloud_labels)

plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of
# data_repaired, leaving out 'Date' and 'Location'.
plt.figure(figsize=(12, 10))

correlation_input = data_repaired.drop(columns=['Date', 'Location'])
corr_matrix = correlation_input.corr()

sb.heatmap(
    corr_matrix,
    annot=True,
    cmap='viridis',
    linewidths=0.5,
)
plt.title('Correlation Matrix Heatmap')
plt.show()

### Isolate Locations

In [None]:
# Distinct 'Location' values, in order of first appearance, as a plain list.
locations = data_repaired['Location'].unique().tolist()

# Left as the cell's last expression so the notebook displays the list.
locations

### Training

Create training, validation, and test data