In [13]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis

# Locations are written relative to the notebook's working directory.
DATA_PATH = "../data/Global Temperature.csv"  # input CSV read by the loading cell
FIGURES_DIR = "../figures"                    # folder that receives the saved plots

# Create the output folder up front; exist_ok=True keeps re-runs from failing
# when the folder is already there.
os.makedirs(name=FIGURES_DIR, exist_ok=True)

In [14]:
# Read the raw CSV into a DataFrame, then report where it came from and its layout.
df = pd.read_csv(DATA_PATH)

for label, value in (
    ("Loaded dataset:", DATA_PATH),
    ("Shape:", df.shape),
    ("Columns:", df.columns.tolist()),
):
    print(label, value)

Loaded dataset: ../data/Global Temperature.csv
Shape: (2077, 12)
Columns: ['Year', ' Month', 'Monthly Anomaly', ' Monthly Unc.', '  Annual Anomaly', ' Annual Unc', 'Five-Year Anomaly', ' Five-Year Unc.', ' Ten-Year Anomaly', ' Ten-Year Unc.', '  Twenty-Year Anomaly', ' Twenty-year Unc.']


In [15]:
# Data cleaning: make every measurement column numeric.
# A column is treated as a measurement when its header contains 'Anomaly' or 'Unc'.
# Substring matching is used, so headers with stray surrounding spaces still match.
measurement_cols = [
    name for name in df.columns
    if any(tag in name for tag in ('Anomaly', 'Unc'))
]

for name in measurement_cols:
    # errors='coerce' turns entries that cannot be parsed into NaN instead of raising
    df[name] = pd.to_numeric(df[name], errors='coerce')

print("Columns:", df.columns.tolist())


Columns: ['Year', ' Month', 'Monthly Anomaly', ' Monthly Unc.', '  Annual Anomaly', ' Annual Unc', 'Five-Year Anomaly', ' Five-Year Unc.', ' Ten-Year Anomaly', ' Ten-Year Unc.', '  Twenty-Year Anomaly', ' Twenty-year Unc.']


In [16]:
# Find the columns in which every single value is NaN and announce them
all_nan_mask = df.isna().all()
full_empty_cols = df.columns[all_nan_mask].tolist()
if len(full_empty_cols) > 0:
    print(f"Columns found that are 100% NaN and will be dropped: {full_empty_cols}")

# Remove those columns; any column holding at least one value is kept
df = df.dropna(axis=1, how='all')


In [17]:
# List every column that still has at least one missing value
has_missing = df.isna().any()
ffill_affected_cols = has_missing[has_missing].index.tolist()
if ffill_affected_cols:
    print(f"Columns containing NaNs that will be imputed by ffill: {ffill_affected_cols}")

# Forward-fill: each NaN takes the most recent earlier value in its column.
# NaNs that come before a column's first valid value have nothing to copy
# from and therefore stay NaN.
df = df.ffill()

Columns containing NaNs that will be imputed by ffill: [' Monthly Unc.', '  Annual Anomaly', ' Annual Unc', 'Five-Year Anomaly', ' Five-Year Unc.', ' Ten-Year Anomaly', ' Ten-Year Unc.', '  Twenty-Year Anomaly', ' Twenty-year Unc.']


In [18]:
# Normalise the headers: remove leading/trailing whitespace from each column
# name (e.g. ' Month' becomes 'Month') so later cells can use the clean names.
df.columns = [name.strip() for name in df.columns]

In [19]:
# Build a first-of-the-month timestamp for every row from its Year and Month values
year_month = df[['Year', 'Month']]
df['Date'] = pd.to_datetime(year_month.assign(day=1))

# Collect the names of all numeric columns; 'Date' is a datetime column and is
# therefore not part of this list
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns.tolist()
print("\nNumeric columns:", numeric_cols)


Numeric columns: ['Year', 'Month', 'Monthly Anomaly', 'Monthly Unc.', 'Annual Anomaly', 'Annual Unc', 'Five-Year Anomaly', 'Five-Year Unc.', 'Ten-Year Anomaly', 'Ten-Year Unc.', 'Twenty-Year Anomaly', 'Twenty-year Unc.']


In [20]:
# The four statistical moments are computed for the Monthly Anomaly column;
# if that column is absent, the third numeric column is used instead.
if 'Monthly Anomaly' in df.columns:
    target_col = 'Monthly Anomaly'
else:
    target_col = numeric_cols[2]

# Only non-missing observations enter the statistics
series = df[target_col].dropna()

# Mean and population variance (ddof=0) come from pandas; skewness and kurtosis
# come from scipy with default arguments. scipy's kurtosis defaults to Fisher's
# definition (excess kurtosis), under which a normal distribution scores 0.
mean_val, var_val = series.mean(), series.var(ddof=0)
skew_val, kurt_val = skew(series), kurtosis(series)

print(f"\nStatistical Moments for {target_col}:")
for label, value in (
    ("Mean", mean_val),
    ("Variance", var_val),
    ("Skewness", skew_val),
    ("Kurtosis", kurt_val),
):
    print(f"  {label}: {value:.4f}")


Statistical Moments for Monthly Anomaly:
  Mean: 0.0396
  Variance: 0.1752
  Skewness: 0.7384
  Kurtosis: 0.1817


In [22]:
# Relational plot: the target series drawn against time
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df, x='Date', y=target_col, color='#A52A2A', ax=ax)
ax.set_title(f"{target_col} Over Time")
ax.set_xlabel("Year")
ax.set_ylabel(target_col)
fig.tight_layout()

# Write the figure to disk and close it instead of showing it inline
line_plot_path = os.path.join(FIGURES_DIR, "relational_line.png")
fig.savefig(line_plot_path)
plt.close(fig)
print("Saved: relational_line.png")

Saved: relational_line.png


In [27]:
# Categorical plot: mean of the target column for each calendar month.
# as_index=False keeps 'Month' as a regular column in the aggregated frame.
monthly_avg = df.groupby('Month', as_index=False)[target_col].mean()

fig, ax = plt.subplots(figsize=(11, 5))
# hue='Month' gives each bar its own palette colour; the legend is switched off
sns.barplot(
    data=monthly_avg,
    x='Month',
    y=target_col,
    hue='Month',
    palette="coolwarm",
    legend=False,
    ax=ax,
)
ax.set_title(f"Average {target_col} by Month")
ax.set_xlabel("Month")
ax.set_ylabel(f"Average {target_col}")
fig.tight_layout()

# Write the figure to disk and close it instead of showing it inline
bar_plot_path = os.path.join(FIGURES_DIR, "categorical_bar.png")
fig.savefig(bar_plot_path)
plt.close(fig)
print("Saved: categorical_bar.png")

Saved: categorical_bar.png


In [29]:
# Statistical plot: pairwise correlation between all numeric columns
corr = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
# annot/fmt print each coefficient, rounded to two decimals, inside its cell
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", linewidths=.5, ax=ax)
ax.set_title("Correlation Heatmap (Numeric Columns)")
fig.tight_layout()

# Write the figure to disk and close it instead of showing it inline
heatmap_path = os.path.join(FIGURES_DIR, "correlation_heatmap.png")
fig.savefig(heatmap_path)
plt.close(fig)
print("Saved: correlation_heatmap.png")

Saved: correlation_heatmap.png


In [31]:
# Final recap: row count, analysed column, the four moments, and where the plots went.
# The lines are collected first and emitted with a single print call.
summary_lines = [
    "\n--- SUMMARY ---",
    f"Data rows: {len(df)}",
    f"Analyzed column: {target_col}",
    f"Mean={mean_val:.3f}, Variance={var_val:.3f}, Skew={skew_val:.3f}, Kurtosis={kurt_val:.3f}",
    f"Plots saved in: {FIGURES_DIR}",
    "Done.",
]
print("\n".join(summary_lines))


--- SUMMARY ---
Data rows: 2077
Analyzed column: Monthly Anomaly
Mean=0.040, Variance=0.175, Skew=0.738, Kurtosis=0.182
Plots saved in: ../figures
Done.
