In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import plotly.express as px

# Read the raw Togo CSV; the path is resolved relative to the notebook folder
df = pd.read_csv('../data/togo.csv')


# Overview: summary statistics first, then the NaN count of every column
print(df.describe())
print(df.isna().sum())

# Share of missing rows per column, in percent; show only columns above 5%
missing_percent = df.isna().mean().mul(100)
print(missing_percent.loc[missing_percent.gt(5)])

# Remove outliers (|z| > 3) column by column, then impute missing values.
#
# scipy's zscore propagates NaN by default: a single missing value makes every
# z-score of that column NaN, `abs() <= 3` is then False for every row, and
# the filter silently empties the DataFrame before the imputation below can
# run. nan_policy='omit' ignores NaNs when computing the mean/std, and the
# filter is written as "not an outlier" so rows with a NaN z-score (missing
# measurement) are kept and handed to the median imputation instead.
#
# As before, each column's z-scores are computed on the frame already filtered
# by the previous columns, and the helper `z_<col>` columns stay in `df`.
for col in ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']:
    df['z_' + col] = stats.zscore(
        df[col].to_numpy(dtype=float), nan_policy='omit'
    )
    # NaN > 3 evaluates to False, so missing values survive this filter.
    # .copy() detaches the result so the next iteration's column assignment
    # does not write into a slice of the previous frame.
    df = df[~(df['z_' + col].abs() > 3)].copy()

# Impute remaining missing values with each numeric column's median
# (computed after outlier removal). Reassign rather than filling in place.
df = df.fillna(df.median(numeric_only=True))

# Time series analysis: parse the timestamps, then draw GHI, DNI, DHI and
# Tamb against time, one subplot per series
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df[['Timestamp', 'GHI', 'DNI', 'DHI', 'Tamb']].set_index('Timestamp').plot(
    subplots=True,
    figsize=(12, 8),
)
plt.show()

# Cleaning impact: bar chart of the mean ModA / ModB per value of 'Cleaning'
df.groupby('Cleaning')[['ModA', 'ModB']].mean().plot.bar(
    title='Effect of Cleaning on ModA and ModB',
)
plt.show()

# Annotated heatmap of the pairwise correlations between the irradiance
# columns (GHI, DNI, DHI) and the TModA / TModB columns
sns.heatmap(
    data=df.loc[:, ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']].corr(),
    annot=True,
    cmap='coolwarm',
)
plt.show()

# Scatter plots, one figure per (x, y) pair: WS vs GHI, then RH vs Tamb
for x_col, y_col in (('WS', 'GHI'), ('RH', 'Tamb')):
    sns.scatterplot(data=df, x=x_col, y=y_col)
    plt.show()

# Histograms of GHI and WS, each shown in its own figure
for hist_col in ('GHI', 'WS'):
    df[hist_col].hist()
    plt.show()

# Bubble chart: GHI on x, Tamb on y, marker size taken from RH
plt.scatter(x=df['GHI'], y=df['Tamb'], s=df['RH'], alpha=0.5)
plt.title('Bubble Chart: GHI vs Tamb (Bubble=RH)')
plt.xlabel('GHI')
plt.ylabel('Tamb')
plt.show()

# Write the cleaned frame next to the raw data, without the index column.
# NOTE(review): the helper z_<col> columns added during outlier filtering are
# still part of `df` here, so they are written to the CSV as well.
df.to_csv(path_or_buf='../data/togo_clean.csv', index=False)
