<a href="https://colab.research.google.com/github/CarolKSD/Weather_Data_Clusterization/blob/main/HistoricalWeatherDataClusterization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import folium

In [None]:
# Load the dataset from Google Drive (Colab-only: the Drive must be mounted first)
from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/daily_weather_data.csv')

# Display the first rows of the dataset
print("Initial data preview:")
print(data.head())
print("\nDataset information:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
data.info()

In [None]:
# Map the raw CSV column names to human-readable labels.
# NOTE(review): the unit suffixes are asserted by these labels only; confirm them
# against the dataset's documentation (e.g. that 'wspd' really is in m/s).
column_labels = {
    'date': 'Date',
    'country': 'Country',
    'city': 'City',
    'tavg': 'Avg Temperature (°C)',
    'tmin': 'Min Temperature (°C)',
    'tmax': 'Max Temperature (°C)',
    'wdir': 'Wind Direction (°)',
    'wspd': 'Wind Speed (m/s)',
    'pres': 'Pressure (hPa)',
}

# Apply the labels, then index every row by (Date, Country, City)
data = data.rename(columns=column_labels).set_index(['Date', 'Country', 'City'])

In [None]:
# Preview the first rows after renaming and re-indexing
data.head()

In [None]:
# ------------------------------
# 1. EXPLORATORY DATA ANALYSIS (EDA)
# ------------------------------

# Summary statistics for every column that describe() covers
descriptive_stats = data.describe()
print("\nDescriptive statistics:", descriptive_stats, sep="\n")

In [None]:
# Count the null entries per column
missing_counts = data.isnull().sum()
print("\nMissing values in each column:", missing_counts, sep="\n")

In [None]:
# Columns treated as numeric climate variables throughout the EDA plots
numeric_features = [
    'Avg Temperature (°C)',
    'Min Temperature (°C)',
    'Max Temperature (°C)',
    'Wind Direction (°)',
    'Wind Speed (m/s)',
    'Pressure (hPa)',
]

# One histogram per variable, all drawn on a shared figure
histogram_style = dict(figsize=(12, 8), bins=20, color='skyblue', edgecolor='black')
data[numeric_features].hist(**histogram_style)
plt.suptitle("Distributions of Climatic Variables", fontsize=16)
plt.show()

In [None]:
# Identify the observations with the highest and lowest average temperature.
# idxmax()/idxmin() return the (Date, Country, City) index label of the extreme row.
# NOTE(review): unpacking row.name assumes that label is unique, so .loc yields a
# single row (a Series) rather than a DataFrame; verify against the dataset.
avg_temp_col = 'Avg Temperature (°C)'
max_temp_row = data.loc[data[avg_temp_col].idxmax()]
min_temp_row = data.loc[data[avg_temp_col].idxmin()]

extreme_reports = (
    ("\nCity and country with the highest average temperature:", max_temp_row),
    ("\nCity and country with the lowest average temperature:", min_temp_row),
)
for heading, row in extreme_reports:
    obs_date, obs_country, obs_city = row.name
    print(heading)
    print(f"Date: {obs_date}, Country: {obs_country}, City: {obs_city}")
    print(f"Avg Temperature: {row[avg_temp_col]} °C")

In [None]:
# Pairwise correlations between the numeric climate variables, drawn as an annotated heatmap
correlation_matrix = data[numeric_features].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of Climatic Variables", fontsize=14)
plt.show()

In [None]:
# Grid layout: three panels per row, with as many rows as needed to fit every feature
num_features = len(numeric_features)
num_cols = 3
num_rows = -(-num_features // num_cols)  # ceiling division

# One boxplot per feature for a visual outlier check
plt.figure(figsize=(15, 5 * num_rows))
for panel, feature in enumerate(numeric_features, start=1):
    plt.subplot(num_rows, num_cols, panel)
    sns.boxplot(data[feature], color='lightblue')
    plt.title(f"Boxplot of {feature}")
plt.tight_layout()
plt.show()

In [None]:
# ------------------------------
# 2. DATA CLEANING
# ------------------------------

# Replace missing values with the mean of each numeric column.
# numeric_only=True keeps this cell working when the frame holds any
# non-numeric column, where a plain mean() raises TypeError on pandas 2.x.
# Columns without a computed mean are simply left untouched by fillna().
# NOTE(review): an arithmetic mean is a questionable fill value for
# 'Wind Direction (°)' because it is an angle; confirm this is acceptable.
column_means = data.mean(numeric_only=True)
data = data.fillna(column_means)
print("\nData after handling missing values:")
print(data.isnull().sum())

In [None]:
# ------------------------------
# 3. FEATURE ENGINEERING: WIND COMPONENTS
# ------------------------------

# Decompose the wind into two orthogonal components: speed * cos(direction)
# and speed * sin(direction), with the direction converted from degrees to radians.
# NOTE(review): the meteorological convention is usually u = -speed*sin(dir),
# v = -speed*cos(dir); confirm which convention the 'U'/'V' labels are meant to follow.
wind_speed = data['Wind Speed (m/s)']
wind_direction_rad = np.radians(data['Wind Direction (°)'])

data['U Component'] = wind_speed * np.cos(wind_direction_rad)
data['V Component'] = wind_speed * np.sin(wind_direction_rad)

In [None]:
# ------------------------------
# 4. NORMALIZATION
# ------------------------------

# Columns fed to the clustering step
features = [
    'Avg Temperature (°C)',
    'Min Temperature (°C)',
    'Max Temperature (°C)',
    'Wind Speed (m/s)',
    'Pressure (hPa)',
    'U Component',
    'V Component',
]
climate_data = data[features]

# Standardize each selected column with scikit-learn's StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(climate_data)

scaled_preview = pd.DataFrame(scaled_data, columns=features).head()
print("\nFirst rows of normalized data:", scaled_preview, sep="\n")


In [None]:
# ------------------------------
# 5. FINDING THE OPTIMAL NUMBER OF CLUSTERS (Elbow Method)
# ------------------------------

# Fit one K-Means model per candidate k and record its inertia
k_values = range(1, 11)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Inertia versus k; the "elbow" of this curve suggests a cluster count
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o', linestyle='--', color='b')
plt.title("Elbow Method", fontsize=14)
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.xticks(k_values)
plt.show()

In [None]:
# ------------------------------
# 6. APPLYING K-MEANS
# ------------------------------

# Number of clusters, chosen by reading the elbow plot
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit on the standardized features and attach each row's label to the main frame
cluster_labels = kmeans.fit_predict(scaled_data)
data['Cluster'] = cluster_labels

# Display the first results
print("\nData with assigned clusters:", data[['Cluster']].head(), sep="\n")

In [None]:
# ------------------------------
# 7. SAVE RESULTS
# ------------------------------

# Save the updated data to a new CSV file.
# The index is written on purpose: Date, Country and City live in the
# MultiIndex, so index=False would drop them and leave rows that cannot
# be traced back to a place or a day.
data.to_csv('clustered_weather_data.csv')
print("\nResults saved as 'clustered_weather_data.csv'!")

In [None]:
# ------------------------------
# 8. CLUSTER ANALYSIS
# ------------------------------

# Statistical summary by cluster, in the original units of each feature
cluster_summary = data.groupby('Cluster')[features].mean()
print("\nCluster summary:")
print(cluster_summary)

# Plot the per-cluster means of the *standardized* features so that the bars
# match the "Normalized Values" axis label and features on very different
# scales share one readable axis.
# scaled_data was computed row-for-row from data[features], so the cluster
# labels are attached positionally via to_numpy().
scaled_features = pd.DataFrame(scaled_data, columns=features)
scaled_features['Cluster'] = data['Cluster'].to_numpy()
scaled_cluster_summary = scaled_features.groupby('Cluster').mean()

# Plot average characteristics of each cluster
scaled_cluster_summary.T.plot(kind='bar', figsize=(10, 6))
plt.title("Average Characteristics by Cluster", fontsize=14)
plt.ylabel("Normalized Values")
plt.xticks(rotation=30)
plt.show()

In [None]:
# ------------------------------
# 9. VISUALIZATION IN 2D USING PCA
# ------------------------------

# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_data)

# Keep the original index so the cluster labels line up with the components
reduced_df = pd.DataFrame(
    reduced_data, columns=['PCA1', 'PCA2'], index=data.index
).assign(Cluster=data['Cluster'])

# Scatter the observations in PCA space, coloured by cluster
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=reduced_df, x='PCA1', y='PCA2',
    hue='Cluster', palette='viridis', s=100, ax=ax,
)
ax.set_title("Clusters of Capitals Based on Climate (PCA)", fontsize=14)
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.legend(title="Cluster")
plt.show()

In [None]:
# For each cluster, collect the distinct countries/cities ('unique') and the
# number of non-null entries ('count', i.e. rows rather than distinct places)
cluster_summary = (
    data.reset_index()
    .groupby('Cluster')[['Country', 'City']]
    .agg(['unique', 'count'])
)

# Display the summary
print("Summary of Countries and Cities in Each Cluster:", cluster_summary, sep="\n")

In [None]:
# Persist the per-cluster membership summary next to the notebook
summary_path = 'cluster_summary.csv'
cluster_summary.to_csv(summary_path)
print(f"Cluster summary saved to '{summary_path}'.")

In [None]:
# Define a function to categorize seasons
def get_season(month):
    """Return the season label for a calendar month number.

    Months 12, 1, 2 map to 'Winter'; 3-5 to 'Spring'; 6-8 to 'Summer'.
    Every other value (including 9-11) maps to 'Fall'. The mapping depends
    on the month alone and takes no location into account.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'Fall'

# Extract the month from the 'Date' index level and assign a season to every row
date_level = pd.to_datetime(data.index.get_level_values('Date'))  # 'Date' level as a DatetimeIndex
data['Season'] = date_level.month.map(get_season)

# Group by season and cluster and average the numeric columns.
# numeric_only=True keeps this cell re-runnable after non-numeric columns
# (such as the list-valued 'Kepler Color') have been added to `data`;
# a plain mean() raises on those columns.
seasonal_summary = data.groupby(['Season', 'Cluster']).mean(numeric_only=True)

# Display seasonal summary
print("\nSeasonal Characteristics by Cluster:")
print(seasonal_summary)

In [None]:
# Count the rows per (Season, Cluster) pair and pivot the clusters into columns
rows_per_season_cluster = data.groupby(['Season', 'Cluster']).size()
seasonal_distribution = rows_per_season_cluster.unstack()

# Stacked bars: one bar per season, one segment per cluster
ax = seasonal_distribution.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title("Seasonal Distribution of Clusters")
ax.set_xlabel("Season")
ax.set_ylabel("Number of Locations")
ax.legend(title="Cluster")
plt.show()

In [None]:
!pip install keplergl

In [None]:
# Colab-only: turn on the custom widget manager before the Kepler map is displayed.
# NOTE(review): presumably required for the KeplerGl widget to render in Colab; confirm.
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
from keplergl import KeplerGl

# One [R, G, B] triplet per cluster label.
# Only labels 0-2 are covered; any other label maps to NaN via Series.map.
color_mapping = {
    0: [0, 0, 255],  # Blue
    1: [0, 255, 0],  # Green
    2: [255, 0, 0],  # Red
}

# Attach the colour of each row's cluster to the main frame
data['Kepler Color'] = data['Cluster'].map(color_mapping)

# Flatten the MultiIndex into ordinary columns before handing the frame to Kepler
kepler_frame = data.reset_index()
kepler_map = KeplerGl(height=600)
kepler_map.add_data(data=kepler_frame, name="Weather Clusters")

# Display the map (last expression of the cell)
kepler_map

In [None]:
# Export the Kepler map to an HTML file with the given name
kepler_map.save_to_html(file_name='kepler_weather_map_final.html')