# Dylan

### Notes
**Ideas:**  
Split the data into blocks, and possibly store it three-dimensionally.  

**Dimensions:**  
- Location
- Block (a run of `block_size` sequential days; see Settings)
- Day
- Weather data
  
**Feature/Target Split Legend:**  
Source  
[[4, 5, 1, 9],  
 [2, 9, 5, 6],  
 [9, 5, 1, 8],  
 [8, 1, 2, 7]]  
Features (X) → Target (Y)  
[[4, 5, 1]   → [[9]  
 [2, 9, 5]   →  [6]  
 [9, 5, 1]   →  [8]  
 [8, 1, 2]   →  [7]]  

## Header

### Settings

In [1]:
# Number of consecutive days grouped into one block; passed to reconfigure() and read by purge()
block_size = 14

### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from datetime import datetime, date
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans

## Data Processing

### Loading

In [None]:
# Columns removed right after loading: they either have a large amount of
# missing data or are not suitable for machine learning
excluded_columns = [
	'Sunshine', 'Evaporation',
	'WindGustDir', 'WindDir9am', 'WindDir3pm',
	'RainToday', 'RainTomorrow',
]
raw_data = pd.read_csv('./weatherAUS.csv').drop(columns=excluded_columns)
raw_data

### Missing values

In [None]:
# Number of missing values per column, followed by the summary statistics of the raw data
print(raw_data.isnull().sum())
raw_data.describe()

In [5]:
# Fill missing data on a copy so that raw_data stays untouched
data_filled = raw_data.copy()

# Continuous measurements: every gap takes the interpolated value of its column.
# NOTE(review): the interpolation runs over the whole column in row order and
# is not restricted to a single Location.
interpolated_columns = [
	'MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm',
	'Rainfall',
	'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
	'Humidity9am', 'Humidity3pm',
	'Pressure9am', 'Pressure3pm',
]
for column_name in interpolated_columns:
	data_filled.fillna({column_name: data_filled[column_name].interpolate()}, inplace=True)

# Cloud cover: a missing reading is replaced by 0
for column_name in ('Cloud9am', 'Cloud3pm'):
	data_filled.fillna({column_name: 0}, inplace=True)

In [None]:
# Re-check the missing-value counts and summary statistics after filling
print(data_filled.isnull().sum())
data_filled.describe()

### Visualisation

In [None]:
# Box plots of each group of related measurements, one subplot per group.
# The figure is a 2x4 grid; only the first six axes are drawn on.
visualisation_data = data_filled.copy()
fig, axs = plt.subplots(2, 4, figsize=(22, 20), sharey=False)

# MinTemp MaxTemp Temp9am Temp3pm
axs[0][0].set_title('Temperature')
temperature_labels = ['MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm']
temperatures = visualisation_data[temperature_labels]
axs[0][0].boxplot(temperatures, tick_labels=temperature_labels)

# Rainfall (Evaporation is dropped when the data is loaded, so it is not plotted)
axs[0][1].set_title('Rainfall')
rainfall = visualisation_data['Rainfall']
axs[0][1].boxplot(rainfall)

# WindGustSpeed WindSpeed9am WindSpeed3pm
axs[0][2].set_title('Wind speed')
wind_speed_labels = ['WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']
wind_speed = visualisation_data[wind_speed_labels]
axs[0][2].boxplot(wind_speed, tick_labels=wind_speed_labels)

# Humidity9am Humidity3pm
axs[0][3].set_title('Humidity')
humidity_labels = ['Humidity9am', 'Humidity3pm']
humidity = visualisation_data[humidity_labels]
axs[0][3].boxplot(humidity, tick_labels=humidity_labels)

# Pressure9am Pressure3pm
axs[1][0].set_title('Pressure')
pressure_labels = ['Pressure9am', 'Pressure3pm']
pressure = visualisation_data[pressure_labels]
axs[1][0].boxplot(pressure, tick_labels=pressure_labels)

# Cloud9am Cloud3pm
axs[1][1].set_title('Cloud')
cloud_labels = ['Cloud9am', 'Cloud3pm']
cloud = visualisation_data[cloud_labels]
axs[1][1].boxplot(cloud, tick_labels=cloud_labels)

# Disabled plots for the two remaining axes. They read Year/Month/Day columns,
# which are not present on data_filled at this point (Year and Month are only
# derived later, in the date conversion step).
# NOTE(review): the 353 and 32 multipliers look like approximate year/month
# strides - confirm them before re-enabling.
## Year Location
#axs[1][2].set_title('Year range per location')
#axs[1][2].scatter(visualisation_data['Year'], visualisation_data['Location'])

## Month Day
#axs[1][3].set_title('Total day range per location')
#date = visualisation_data['Year'] * 353 + (visualisation_data['Month'] - 1) * 32 + visualisation_data['Day']
#axs[1][3].scatter(date, visualisation_data['Location'])

plt.show()

In [None]:
# Heatmap of the pairwise correlations between the measurement columns
plt.figure(figsize=(12, 10))
# Location and Date are left out of the correlation matrix
corr_matrix = data_filled.drop(columns=['Location', 'Date']).corr()
sb.heatmap(corr_matrix, annot=True, cmap='viridis', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()

### Date conversion

In [None]:
def to_iso_date(_date: str) -> str:
	'''Reformats a dd-mm-yyyy date string as yyyy-mm-dd'''
	parsed = datetime.strptime(_date, '%d-%m-%Y')
	return f'{parsed:%Y-%m-%d}'

def convert_date_to_day_index(_date: str) -> int:
	'''Returns how many days the yyyy-mm-dd date lies after 2000-01-01 (negative if it is earlier)'''
	epoch = date(2000, 1, 1)
	parsed = datetime.strptime(_date, '%Y-%m-%d').date()
	return (parsed - epoch).days

def extract_year(_date: str) -> int:
	'''Parses a yyyy-mm-dd date string and returns its year'''
	parsed = datetime.strptime(_date, '%Y-%m-%d')
	return parsed.year

def extract_month(_date: str) -> int:
	'''Parses a yyyy-mm-dd date string and returns its month (1-12)'''
	parsed = datetime.strptime(_date, '%Y-%m-%d')
	return parsed.month

def extract_day(_date: str) -> int:
	'''Parses a yyyy-mm-dd date string and returns its day of the month'''
	parsed = datetime.strptime(_date, '%Y-%m-%d')
	return parsed.day

# Normalise the dates once, then derive the calendar features from them
iso_dates = data_filled['Date'].apply(to_iso_date)

# The Date column itself is removed and no Day column is added, to make it
# harder for the model to overfit
redated = data_filled.drop(columns=['Date']).assign(
	DayIndex=iso_dates.apply(convert_date_to_day_index),
	# TODO May need to remove year to prevent overfitting
	Year=iso_dates.apply(extract_year),
	Month=iso_dates.apply(extract_month),
)
redated

### Location Hash

In [None]:
def hash_location(_location: str) -> int:
	'''Encodes the location name as bytes and reads those bytes as one big-endian integer.

	The result is not a fixed-width hash: it grows with the length of the
	name and no longer fits in 64 bits once the encoded name is longer than
	8 bytes.
	'''
	encoded = _location.encode()
	return int.from_bytes(encoded, byteorder='big')

# Add the integer encoding of each location name as an extra column
hashed = redated.assign(LocationHash=redated['Location'].apply(hash_location))
hashed

### Reconfigure dataframe
https://pandas.pydata.org/docs/user_guide/advanced.html

In [None]:
def reconfigure(_df: pd.DataFrame, _block_size=5):
	'''Indexes the rows by (Location, Block, Id), grouping runs of consecutive days into blocks.

	Rows are visited in their existing order with a separate counter per
	location. A location starts a new block on its first row, when its
	current block already holds _block_size rows, or when DayIndex is not
	exactly one more than on that location's previous row. Block numbers
	therefore start at 1 and Id counts the rows inside a block from 0.

	Parameters:
		_df: frame with a 'Location' column and a 'DayIndex' column.
		_block_size: maximum number of rows per block.

	Returns:
		A new frame without the 'Location' column, indexed by a MultiIndex
		with the levels 'Location', 'Block' and 'Id'.
	'''
	block_numbers = []
	ids_in_block = []

	# Running state per location: [current block number, next id, previous DayIndex].
	# Tracking it per location while walking the rows in order keeps every
	# label aligned with its own row, even if the rows of different
	# locations are interleaved.
	state_by_location = {}

	for location, day_index in zip(_df['Location'], _df['DayIndex']):
		state = state_by_location.get(location)
		if state is None:
			# The first row of a location always opens block 1
			state = [1, 0, day_index]
			state_by_location[location] = state
		elif state[1] == _block_size or day_index != state[2] + 1:
			# Block is full, or there is a gap in the days: start the next block
			state[0] += 1
			state[1] = 0

		block_numbers.append(state[0])
		ids_in_block.append(state[1])

		# Update the state for this location's next row
		state[1] += 1
		state[2] = day_index

	# Create the multiIndex
	index = pd.MultiIndex.from_arrays(
		[_df['Location'], block_numbers, ids_in_block],
		names=['Location', 'Block', 'Id']
	)

	# Location now lives in the index, so drop the column before applying the multiIndex
	return _df.drop(columns=['Location']).set_index(index)

# Build the (Location, Block, Id) structure using the block_size setting
reconfigured = reconfigure(hashed, block_size)
reconfigured

In [None]:
def purge(_df: pd.DataFrame, _block_size=None) -> pd.DataFrame:
	'''Purges every (Location, Block) group that has fewer than _block_size rows.

	Parameters:
		_df: frame whose index has the levels 'Location' and 'Block'.
		_block_size: minimum number of rows a block needs in order to be
			kept. When omitted, the notebook-level block_size setting is used.

	Returns:
		The rows of _df that belong to sufficiently large blocks, in their
		original order.
	'''
	if _block_size is None:
		_block_size = block_size

	# Count rows in each block
	block_sizes = _df.groupby(level=['Location', 'Block'], sort=False).size()

	# Look up, for every row, the size of the block it belongs to. The lookup
	# uses the (Location, Block) pair as one key: testing the two levels
	# independently would also drop a full block whose number happens to be
	# too short at some other location.
	row_keys = pd.MultiIndex.from_arrays([
		_df.index.get_level_values('Location'),
		_df.index.get_level_values('Block'),
	])
	size_of_own_block = block_sizes.reindex(row_keys).to_numpy()

	# Keep everything that is not in a too-short block
	return _df[~(size_of_own_block < _block_size)]

# Drop the blocks that purge() considers too short
purged = purge(reconfigured)
purged

### Splitting

Create the training and test data (a separate validation split is sketched in the code but currently commented out).

In [None]:
def divide_group(_group: pd.DataFrame):
	'''Splits one block into its feature rows and its target row.

	Returns:
		A (features, target) pair: every row except the last one as a
		DataFrame, and the last row as a Series.
	'''
	features = _group.iloc[:-1]  # Every row but the last is a feature row
	target = _group.iloc[-1]     # The last row holds the target
	return features, target

def gpt_split_into_features_and_target(_df: pd.DataFrame, _block_size=None):
	'''Split a dataframe with groups into features and targets arrays.

	Every (Location, Block) group with exactly _block_size rows yields one
	sample: its feature rows flattened into a single vector, and the value of
	the first column of its last row as the target. Groups of any other
	length are left out, because they would make the feature vectors differ
	in length; the number of groups left out is printed.

	Parameters:
		_df: frame whose index has the levels 'Location' and 'Block'.
		_block_size: required number of rows per group. When omitted, the
			notebook-level block_size setting is used.

	Returns:
		A (features, targets) pair of numpy arrays with one entry per kept group.
	'''
	if _block_size is None:
		_block_size = block_size

	features_list = []
	targets_list = []
	broken_counter = 0

	for _, group in _df.groupby(['Location', 'Block'], sort=False):
		if len(group) != _block_size:
			# Wrong-sized block: count it and keep it out of the arrays
			broken_counter += 1
			continue
		features, target = divide_group(group)
		features_list.append(features.values.flatten())  # Flatten the features into one row
		targets_list.append(target.values[0])  # Single target value

	print(broken_counter)

	# Convert lists to arrays for scikit-learn
	features = np.array(features_list)
	targets = np.array(targets_list)
	return features, targets

# Build the feature matrix X and the target vector Y from the purged blocks
X, Y = gpt_split_into_features_and_target(purged)

print(f'X has {X.shape[0]} samples, Y has {Y.shape[0]} samples.')

# Disabled three-way split (train / validation / test), kept for reference
#X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
## Note: 0.25 x 0.8 = 0.2, so the validation set is 20% of the original dataset
#X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=0.25, random_state=42)
#print(f"Training set size: {X_train.shape[0]} samples")
#print(f"Validation set size: {X_val.shape[0]} samples")
#print(f"Test set size: {X_test.shape[0]} samples")

# Hold out 20% of the samples as the test set; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Leftover exploration snippets; multi_indexed is not defined in any of the cells shown in this notebook
#locations = multi_indexed.index.get_level_values('Location').drop_duplicates().to_list()
#albury = multi_indexed.xs('Albury')

## Training

In [None]:
# Fit a linear regression model on the training data
# NOTE(review): 'regresssion' is misspelled; the evaluation cell uses the same name, so rename both together
linear_regresssion_model = LinearRegression()
linear_regresssion_model.fit(X_train, Y_train)

# Fit a ridge regression model (default settings) on the same training data
ridge_regression_model = Ridge()
ridge_regression_model.fit(X_train, Y_train)

## Testing

In [None]:
# Predict on the held-out test set with both models
linear_y_pred = linear_regresssion_model.predict(X_test)
ridge_y_pred = ridge_regression_model.predict(X_test)

# Report the same two metrics for each model
for heading, predictions in (
	('Linear Regression Evaluation:', linear_y_pred),
	('\nRidge Regression Evaluation:', ridge_y_pred),
):
	print(heading)
	print(f'Mean Squared Error: {mean_squared_error(Y_test, predictions):.2f}')
	print(f'R^2 Score: {r2_score(Y_test, predictions):.2f}')

## Clustering Tests

Ultimately a failure, but kept for posterity

In [None]:
# Fit k-means with 6 clusters on all columns of the purged frame
# NOTE(review): no random_state is set, so the cluster assignment may differ between runs
k_means_model = KMeans(n_clusters = 6)
k_means_model.fit(X=purged)

In [17]:
# Cluster label of every row of purged, according to the fitted model
cluster_pred = k_means_model.predict(purged)

In [None]:
# Display the available column names, to pick the two axes for the visualisation below
purged.columns

In [None]:
# Plot kmeans results
axes = ['MaxTemp', 'Rainfall']
plt.figure(figsize=(12, 6))
plt.scatter(purged[axes[0]], purged[axes[1]], c=cluster_pred, cmap='viridis')
plt.title('K-Means Clustering')
plt.xlabel(axes[0])
plt.ylabel(axes[1])
plt.colorbar(label='Cluster')
plt.show()