In [37]:
import urllib3
import requests
import requests_cache

print(urllib3.__version__)
print(requests.__version__)
print(requests_cache.__version__)


1.26.18
2.31.0
1.2.1


In [38]:
abuja_latitude = 9.0765
abuja_longitude = 7.3986

print(f"Abuja Latitude: {abuja_latitude}")
print(f"Abuja Longitude: {abuja_longitude}")

Abuja Latitude: 9.0765
Abuja Longitude: 7.3986


In [41]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

# 2. Create a requests_cache.CachedSession object to cache API responses
# Cache API requests for an hour to reduce redundant calls
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)

# 3. Initialize the openmeteo_requests client with the cached session
openmeteo = openmeteo_requests.Client(session=retry_session)

# 4. Define the URL for the Open-Meteo Archive API
url = "https://archive-api.open-meteo.com/v1/archive"

# 5. Specify the parameters for the API request
params = {
    "latitude": abuja_latitude,
    "longitude": abuja_longitude,
    "start_date": "2024-01-01",
    "end_date": "2024-01-07",
    "daily": [
        "weather_code",
        "temperature_2m_max",
        "temperature_2m_min",
        "temperature_2m_mean",
        "apparent_temperature_max",
        "apparent_temperature_min",
        "apparent_temperature_mean",
        "sunrise",
        "sunset",
        "precipitation_sum",
        "rain_sum",
        "showers_sum",
        "snowfall_sum",
        "precipitation_hours",
        "wind_speed_10m_max",
        "wind_direction_10m_dominant",
        "shortwave_radiation_sum",
        "et0_fao_evapotranspiration"
    ],
    "timezone": "auto"
}


# 6. Make the API request
responses = openmeteo.weather_api(url, params=params)

# Assuming only one location is queried, get the first response
response = responses[0]

# Process daily data
daily = response.Daily()

daily_data = {
    "date": pd.date_range(
        start=pd.to_datetime(daily.Time(), unit="s"),
        end=pd.to_datetime(daily.TimeEnd(), unit="s"),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left"
    )
}

for i, param in enumerate(params["daily"]):
    daily_data[param] = daily.Variables(i).Values(0)

df_weather = pd.DataFrame(daily_data)


In [42]:
import sys

# Install necessary packages to ensure a fresh environment
# !pip install --upgrade --force-reinstall openmeteo-requests requests-cache retry-requests

import openmeteo_requests
import requests_cache
import pandas as pd
import numpy as np # Import numpy for NaNs
from retry_requests import retry

# 2. Create a requests_cache.CachedSession object to cache API responses
# Cache API requests for an hour to reduce redundant calls
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)

# 3. Initialize the openmeteo_requests client with the cached session
openmeteo = openmeteo_requests.Client(session=retry_session)

# Define Abuja coordinates (re-added for scope)
abuja_latitude = 9.0765
abuja_longitude = 7.3986

# 4. Define the URL for the Open-Meteo Archive API
url = "https://archive-api.open-meteo.com/v1/archive"

# 5. Specify the parameters for the API request
params = {
    "latitude": abuja_latitude,
    "longitude": abuja_longitude,
    "start_date": "2000-01-01", # Example start date, can be adjusted
    "end_date": "2023-12-31",   # Example end date, ideally yesterday's date
    "daily": [
        "weather_code",
        "temperature_2m_max",
        "temperature_2m_min",
        "temperature_2m_mean",
        "apparent_temperature_max",
        "apparent_temperature_min",
        "apparent_temperature_mean",
        "sunrise",
        "sunset",
        "precipitation_sum",
        "rain_sum",
        "showers_sum",
        "snowfall_sum",
        "precipitation_hours",
        "wind_speed_10m_max",
        # Removed 'wind_gust_speed_max' due to previous error
        "wind_direction_10m_dominant",
        "shortwave_radiation_sum",
        "et0_fao_evapotranspiration"
    ],
    "timezone": "auto"
}

# 6. Make the API request using the client's weather_api method directly
responses = openmeteo.weather_api(url, params=params)

# Assuming only one location is queried, get the first response
response = responses[0]

# Process daily data
daily = response.Daily()

# Check if daily data is actually available
if daily is None:
    print("No daily data found in the API response.")
    df_weather = pd.DataFrame() # Create empty DataFrame
elif daily.VariablesLength() == 0:
    print("No daily variables found in the API response. Creating empty DataFrame.")
    df_weather = pd.DataFrame()
else:
    # Determine the number of days from the first variable's data length
    # This is more robust if daily.Time() is misbehaving
    num_days = len(daily.Variables(0).ValuesAsNumpy())

    # Extract date/time information
    times = daily.Time()
    if isinstance(times, (int, float)):
        # If it's a scalar, assume it's the start time and generate a range
        date_range = pd.date_range(start=pd.to_datetime(times, unit='s'), periods=num_days, freq='D')
        daily_data = {"date": date_range}
    elif times is not None and len(times) == num_days:
        daily_data = {"date": pd.to_datetime(times, unit='s')}
    else:
        # Fallback if times is not int/float, not None, and length mismatch
        print(f"Warning: Time data length mismatch. Expected {num_days}, got {len(times) if times is not None else 'None'}. Generating date range.")
        start_time_unix = daily.Time()[0] if times is not None and hasattr(times, '__getitem__') else pd.Timestamp(params['start_date']).timestamp()
        date_range = pd.date_range(start=pd.to_datetime(start_time_unix, unit='s'), periods=num_days, freq='D')
        daily_data = {"date": date_range}

    # Initialize daily_data for all requested parameters with NaN arrays
    # This ensures all requested columns are present even if data is missing
    for param in params["daily"]:
        daily_data[param] = np.full(num_days, np.nan)

    # Extract daily variables using ValuesAsNumpy()
    # Rely on the order of `daily.Variables(i)` matching `params["daily"]`
    for i, param_name_from_request in enumerate(params["daily"]):
        if i < daily.VariablesLength():
            variable = daily.Variables(i)
            try:
                values = variable.ValuesAsNumpy()
                if len(values) == num_days:
                    daily_data[param_name_from_request] = values
                else:
                    print(f"Warning: Data length mismatch for '{param_name_from_request}'. Expected {num_days}, got {len(values)}. Filling with NaNs.")
            except Exception as e:
                print(f"Error extracting '{param_name_from_request}' values: {e}. Filling with NaNs.")
        else:
            print(f"Warning: Parameter '{param_name_from_request}' requested but no corresponding variable found in API response. Filling with NaNs.")


    # 7. Convert to Pandas DataFrame
    df_weather = pd.DataFrame(daily_data)

    # Set 'date' as index
    df_weather = df_weather.set_index('date')

# 8. Display the first few rows of the DataFrame
print("Historical Weather Data for Abuja:")
if not df_weather.empty:
    print(df_weather.head())
    print(f"\nDataFrame shape: {df_weather.shape}")
else:
    print("DataFrame is empty.")

Error extracting 'sunrise' values: object of type 'int' has no len(). Filling with NaNs.
Error extracting 'sunset' values: object of type 'int' has no len(). Filling with NaNs.
Historical Weather Data for Abuja:
                     weather_code  temperature_2m_max  temperature_2m_min  \
date                                                                        
1999-12-31 23:00:00           2.0           34.332001           22.882000   
2000-01-01 23:00:00           3.0           34.632000           22.431999   
2000-01-02 23:00:00           3.0           35.181999           23.431999   
2000-01-03 23:00:00           3.0           34.281998           23.581999   
2000-01-04 23:00:00           3.0           34.931999           22.382000   

                     temperature_2m_mean  apparent_temperature_max  \
date                                                                 
1999-12-31 23:00:00            28.802834                 34.801472   
2000-01-01 23:00:00            28.5632

In [44]:
import numpy as np

# 1. Define the target variable 'is_rain'
# Set 'is_rain' to 1 if precipitation_sum > 0.0, and 0 otherwise
df_weather['is_rain'] = (df_weather['precipitation_sum'] > 0.0).astype(int)

# 2. Extract temporal features from the DataFrame's index
df_weather['year'] = df_weather.index.year
df_weather['month'] = df_weather.index.month
df_weather['day'] = df_weather.index.day

# 3. Select and prepare features for the model
# Exclude 'date' (which is the index now) and the target variable 'is_rain'
# Also exclude 'sunrise' and 'sunset' as they consistently have NaNs from the API response and are less direct for rain prediction
# 'weather_code' is categorical and should be handled separately or one-hot encoded for some models, for simplicity we'll keep it numerical for now.
# Remove 'precipitation_sum' as it's directly used to create 'is_rain' and would cause data leakage.
feature_columns = [
    'weather_code',
    'temperature_2m_max',
    'temperature_2m_min',
    'temperature_2m_mean',
    'apparent_temperature_max',
    'apparent_temperature_min',
    'apparent_temperature_mean',
    'rain_sum',
    'showers_sum',
    'snowfall_sum',
    'precipitation_hours',
    'wind_speed_10m_max',
    'wind_direction_10m_dominant',
    'shortwave_radiation_sum',
    'et0_fao_evapotranspiration',
    'year',
    'month',
    'day'
]

# Filter df_weather to include only feature columns and the target variable for further processing
df_model = df_weather[feature_columns + ['is_rain']].copy()

# 4. Handle missing values
# Fill NaN values in feature columns with the mean of their respective columns
for col in feature_columns:
    if df_model[col].isnull().any():
        df_model[col] = df_model[col].fillna(df_model[col].mean())

# Check for any remaining NaNs in the target variable and fill with 0 if any (assuming no rain if data is missing)
if df_model['is_rain'].isnull().any():
    df_model['is_rain'] = df_model['is_rain'].fillna(0).astype(int)

# Display the first few rows of the preprocessed DataFrame and its info to verify
print("Preprocessed DataFrame for ML Model:")
print(df_model.head())
print(f"\nDataFrame shape: {df_model.shape}")
print("\nMissing values after preprocessing:")
print(df_model.isnull().sum())


Preprocessed DataFrame for ML Model:
                     weather_code  temperature_2m_max  temperature_2m_min  \
date                                                                        
1999-12-31 23:00:00           2.0           34.332001           22.882000   
2000-01-01 23:00:00           3.0           34.632000           22.431999   
2000-01-02 23:00:00           3.0           35.181999           23.431999   
2000-01-03 23:00:00           3.0           34.281998           23.581999   
2000-01-04 23:00:00           3.0           34.931999           22.382000   

                     temperature_2m_mean  apparent_temperature_max  \
date                                                                 
1999-12-31 23:00:00            28.802834                 34.801472   
2000-01-01 23:00:00            28.563242                 35.943035   
2000-01-02 23:00:00            29.129919                 35.945816   
2000-01-03 23:00:00            28.823664                 34.895729   
200

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 1. Define features (X) and target (y)
X = df_model.drop('is_rain', axis=1)
y = df_model['is_rain']

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print(f"X_train shape: {X_train.shape}")
# print(f"X_test shape: {X_test.shape}")
# print(f"y_train shape: {y_train.shape}")
# print(f"y_test shape: {y_test.shape}")

# 3. Instantiate a RandomForestClassifier model
# Using n_estimators=100 and random_state=42 for consistent results
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# 4. Train the RandomForestClassifier model
model.fit(X_train, y_train)

# 5. Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of the positive class (rain)

