# Data Preprocessing

In [1]:
import pandas as pd
from haversine import haversine

# Historical environmental readings; `timestamp` is parsed as a datetime column.
env_data = pd.read_csv(
    '../Data/historical_environmental_data.csv',
    parse_dates=['timestamp'],
)

# Historical wildfire records; both time columns are parsed as datetimes.
fire_data = pd.read_csv(
    '../Data/historical_wildfiredata.csv',
    parse_dates=['timestamp', 'fire_start_time'],
)

# Feature Engineering

In [2]:
# Define parameters
time_window_hours = 24 * 7  # look-back window before each fire start: 1 week
max_distance_km = 50  # readings farther than this from the fire are ignored

# The window length is identical for every fire, so build the Timedelta once.
lookback = pd.Timedelta(hours=time_window_hours)

# Environmental columns that are averaged per timestamp to form a sample row.
feature_columns = [
    'temperature',
    'humidity',
    'wind_speed',
    'precipitation',
    'vegetation_index',
    'human_activity_index',
]

# One aggregated frame is collected per fire that has nearby readings.
samples = []

for _, fire in fire_data.iterrows():
    fire_time = fire['fire_start_time']
    fire_loc = (fire['latitude'], fire['longitude'])

    # Keep readings in the half-open window [fire_time - lookback, fire_time).
    in_window = (env_data['timestamp'] >= fire_time - lookback) & \
                (env_data['timestamp'] < fire_time)
    relevant_env = env_data[in_window].copy()
    if relevant_env.empty:
        # No readings before this fire: skip it explicitly instead of relying
        # on how DataFrame.apply infers its result type for an empty frame.
        continue

    # Distance from every reading to the fire. Iterating the two coordinate
    # columns calls the same `haversine` function as a row-wise
    # `apply(axis=1)` would, without building a Series object per row.
    relevant_env['distance'] = [
        haversine((lat, lon), fire_loc)
        for lat, lon in zip(relevant_env['latitude'], relevant_env['longitude'])
    ]

    # Filter by distance
    nearby_env = relevant_env[relevant_env['distance'] <= max_distance_km]
    if nearby_env.empty:
        continue

    # Average all nearby readings that share a timestamp -> one row per timestamp.
    aggregated = (
        nearby_env
        .groupby('timestamp')
        .agg({column: 'mean' for column in feature_columns})
        .reset_index()
    )

    # Every row derived from this fire carries the fire's severity as its target.
    aggregated['severity'] = fire['severity']
    samples.append(aggregated)

# Combine into a single DataFrame
if not samples:
    raise ValueError("No overlapping environmental data found for fires.")

# ignore_index gives the combined frame a clean 0..n-1 index instead of
# repeating each per-fire frame's own 0..k labels.
fire_features = pd.concat(samples, ignore_index=True)

# Handle Negative Samples

In [3]:
# Generate negative samples (no fires).
# Target ten negatives per fire record, but never ask for more rows than exist:
# DataFrame.sample raises ValueError when n exceeds the population and
# replace=False (the default).
n_negative = min(len(fire_data) * 10, len(env_data))

# A fixed seed keeps the sampled rows (and every downstream metric) the same
# across "Restart & Run All"; 42 matches the seed used for the split and model.
# NOTE(review): rows are drawn from all of env_data, so readings that also fed
# the positive fire samples can be picked here and labelled 'none' - confirm
# whether those should be excluded.
non_fire_samples = (
    env_data
    .sample(n=n_negative, random_state=42)
    .assign(severity='none')  # Indicate no fire
)

# Combine positive and negative samples
all_data = pd.concat([fire_features, non_fire_samples], ignore_index=True)

# Feature Selection and Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

# Map the severity labels to integer class ids; `le` is kept so predictions
# can be converted back to label strings later.
le = LabelEncoder()
severity_codes = le.fit_transform(all_data['severity'])
all_data['severity_encoded'] = severity_codes

# Model inputs (X) and target (y)
features = [
    'temperature',
    'humidity',
    'wind_speed',
    'precipitation',
    'vegetation_index',
    'human_activity_index',
]
X = all_data.loc[:, features]
y = all_data['severity_encoded']

# Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Model Training

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest with class weights balanced against class frequency; seeded
# so repeated runs build the same forest.
model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1,
# using the original severity strings as row labels.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

              precision    recall  f1-score   support

        high       0.16      0.05      0.07       171
         low       0.35      0.56      0.43       362
      medium       0.28      0.20      0.23       239
        none       0.28      0.25      0.27       266

    accuracy                           0.32      1038
   macro avg       0.27      0.27      0.25      1038
weighted avg       0.29      0.32      0.29      1038



# Prediction

In [None]:
# Single hand-written observation, with the same columns the model was trained on.
example_conditions = {
    'temperature': 30.0,
    'humidity': 40,
    'wind_speed': 20,
    'precipitation': 0.5,
    'vegetation_index': 50,
    'human_activity_index': 30,
}
new_data = pd.DataFrame([example_conditions])

# Predict the encoded class, then translate it back to its severity label.
prediction = model.predict(new_data)
predicted_label = le.inverse_transform(prediction)[0]
print('Predicted severity:', predicted_label)

# Predictions for 2025 (unlabeled future data)

In [7]:
# Read the future environmental readings, parsing `timestamp` as a datetime.
future_env = pd.read_csv(
    '../Data/future_environmental_data.csv',
    parse_dates=['timestamp'],
)

# Same feature columns, in the same order, as the training cell.
features = [
    'temperature',
    'humidity',
    'wind_speed',
    'precipitation',
    'vegetation_index',
    'human_activity_index',
]

# Predict an encoded class for every row.
X_future = future_env.loc[:, features]
predictions = model.predict(X_future)

# Store the decoded severity label next to each reading.
future_env['fire_risk'] = le.inverse_transform(predictions)

# Show only the rows predicted as 'high'.
print(future_env.loc[future_env['fire_risk'].eq('high')])

               timestamp  temperature  humidity  wind_speed  precipitation  \
21   2025-01-01 21:00:00         37.1        46          38            4.1   
45   2025-01-02 21:00:00         36.0        78          40            1.4   
71   2025-01-03 23:00:00         20.0        65          29            5.0   
106  2025-01-05 10:00:00         19.9        30          39            0.1   
146  2025-01-07 02:00:00         36.8        69           0            3.0   
...                  ...          ...       ...         ...            ...   
8658 2025-12-27 18:00:00         37.4        77           9            2.6   
8670 2025-12-28 06:00:00         18.0        28          19            1.8   
8703 2025-12-29 15:00:00         37.1        85           8            2.2   
8749 2025-12-31 13:00:00         25.1        79          32            2.9   
8752 2025-12-31 16:00:00         27.3        87          38            1.4   

      vegetation_index  human_activity_index  latitude  longitu

# Save Trained Model Assets

In [None]:
import joblib
# Persist the fitted classifier and its label encoder side by side; both are
# needed to turn raw feature rows back into severity labels.
for artifact, destination in (
    (model, '../../MlService/Models/wildfire_model.pkl'),
    (le, '../../MlService/Models/label_encoder.pkl'),
):
    joblib.dump(artifact, destination)