In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import numpy as np
import datetime as dt
from matplotlib.colors import Normalize
from pathlib import Path

In [2]:
# Resolve every project location relative to the notebook's working directory
notebook_path = Path.cwd()  # MLT/code/data_processing/
base_path = notebook_path.parent.parent  # two levels up: MLT/

# Shared helper notebook sits one level above this notebook's folder
functions_path = notebook_path.parent.joinpath("000_Functions.ipynb")

# Data folders under the project root
data_path = base_path.joinpath("data", "raw")
bronze_path = base_path.joinpath("data", "bronze")

In [None]:
# Load custom functions: execute the shared helper notebook through IPython's
# %run magic so that whatever it defines becomes available in this kernel.
# The path is interpolated in POSIX form (forward slashes) and wrapped in
# quotes — presumably so directories containing spaces are handled; confirm.
%run "{functions_path.as_posix()}"

In [6]:
# Read the CSV stored in the bronze data folder into a DataFrame
df = pd.read_csv(bronze_path.joinpath("bronze_saber_1d_tempoaral_agg.csv"))

In [11]:
# Parse the 'YYYY-MM' strings in year_month into timestamps; the format carries
# no day component, so each value lands on the first day of its month.
df["year_month"] = df["year_month"].pipe(pd.to_datetime, format="%Y-%m")

In [14]:
# Alternative (disabled): represent year_month as a monthly Period instead of a timestamp.
# df['year_month'] = pd.to_datetime(df['year_month']).dt.to_period('M')

In [15]:
# Data-quality overview of `df`: structure, missing values, duplicates,
# summary statistics, per-column checks and the covered date range.

# 1. Quick overview
print("=== Data Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() emits a stray "None" line.
df.info()
print("\n=== Head of Data ===")
print(df.head())

# 2. Check for missing values (count of NaN/NaT per column)
print("\n=== Missing Values ===")
print(df.isna().sum())

# 3. Check for duplicate rows (rows identical to an earlier row in every column)
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")

# 4. Basic descriptive statistics (include='all' also covers non-numeric columns)
print("\n=== Descriptive Statistics ===")
print(df.describe(include='all'))

# 5. Check for unexpected values or outliers in key columns
numeric_cols = df.select_dtypes(include='number').columns
for col in numeric_cols:
    print(f"\nColumn: {col}")
    print(df[col].describe())
    print(f"Number of negative values: {(df[col] < 0).sum()}")  # if negatives are invalid

# 6. Check temporal consistency if you have a time column
if 'year_month' in df.columns:
    # errors='coerce' turns unparseable entries into NaT, so the count below
    # covers both values that were already missing and values that failed to
    # parse. Note that this reassigns the column on `df`.
    df['year_month'] = pd.to_datetime(df['year_month'], errors='coerce')
    print("\n=== Temporal Consistency ===")
    print(f"Invalid dates: {df['year_month'].isna().sum()}")
    print(f"Date range: {df['year_month'].min()} to {df['year_month'].max()}")

=== Data Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283 entries, 0 to 282
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   year_month  283 non-null    datetime64[ns]
 1   ktemp       283 non-null    float64       
 2   year        283 non-null    int64         
 3   month       283 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 9.0 KB
None

=== Head of Data ===
  year_month       ktemp  year  month
0 2002-01-01  213.411947  2002      1
1 2002-02-01  213.660216  2002      2
2 2002-03-01  213.637311  2002      3
3 2002-04-01  214.023157  2002      4
4 2002-05-01  212.659695  2002      5

=== Missing Values ===
year_month    0
ktemp         0
year          0
month         0
dtype: int64

Duplicate rows: 0

=== Descriptive Statistics ===
                          year_month       ktemp         year       month
count                            283  28