# Step 0: Import Libraries

In [None]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql import SparkSession
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.arima.model import ARIMA

warnings.filterwarnings('ignore')


# Step 1: Load and Preprocess Data

In [None]:
# Location of the cleaned, merged dataset; edit here to point the notebook at another copy.
DATA_PATH = 'hdfs://hdfs-container:8020/cleaned/final_merged_data.csv'

# SparkSession is provided by pyspark.sql (imported in the imports cell); without that
# import this cell fails with NameError on a fresh kernel.
spark = SparkSession.builder.appName('WildfirePredictionModel').getOrCreate()
spark_df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

# Pull the whole dataset into a pandas DataFrame; everything below works on `df`.
df = spark_df.toPandas()
# Parse acq_date into datetimes so the .dt accessor can be used for monthly grouping later.
df['acq_date'] = pd.to_datetime(df['acq_date'])
df.head()


#### Replace invalid values

In [None]:
# Weather columns (current value plus the _7d/_15d/_30d variants) in which -1.0 is
# treated as "no reading".
replace_cols = [
    'temperature', 'temperature_7d', 'temperature_15d', 'temperature_30d',
    'precipitation', 'precipitation_7d', 'precipitation_15d', 'precipitation_30d',
    'wind_speed', 'wind_speed_7d', 'wind_speed_15d', 'wind_speed_30d',
    'humidity', 'humidity_7d', 'humidity_15d', 'humidity_30d'
]
# Columns a row must have to be kept. latitude/longitude/frp are included in the
# sentinel replacement as well so that exactly the same rows are dropped as before.
required_cols = replace_cols + ['latitude', 'longitude', 'frp']

# Swap the -1.0 sentinel for NaN only in the columns that are checked below, instead of
# across the whole frame, so unrelated columns (e.g. confidence) are never rewritten.
# NOTE(review): -1.0 can also be a genuine temperature reading; confirm the sentinel
# convention of the upstream data.
df[required_cols] = df[required_cols].replace(-1.0, np.nan)

# Rebind instead of mutating in place so the cell reads as a plain transformation.
df = df.dropna(subset=required_cols)


# Step 2: Feature Engineering

In [None]:
# A detection whose frp is at or above this value is labelled as a wildfire (1), otherwise 0.
FRP_THRESHOLD = 15

is_wildfire = df['frp'] >= FRP_THRESHOLD
df['wildfire_label'] = np.where(is_wildfire, 1, 0)

# Integer-encode the two categorical columns; the fitted encoders are kept so the codes
# can be mapped back to the original values.
le_daynight = LabelEncoder()
df['daynight_enc'] = le_daynight.fit_transform(df['daynight'])

le_state = LabelEncoder()
df['state_enc'] = le_state.fit_transform(df['state'])

# Step 3: Machine Learning Model

In [None]:
# Model inputs: detection attributes first, then every weather variable in each of its
# four variants. The resulting order is significant and matches the column order used
# for training.
base_features = ['latitude', 'longitude', 'confidence', 'daynight_enc', 'state_enc']
weather_features = [
    f"{variable}{window}"
    for variable in ('temperature', 'precipitation', 'wind_speed', 'humidity')
    for window in ('', '_7d', '_15d', '_30d')
]
features = base_features + weather_features
target = 'wildfire_label'

X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest and predict the held-out rows.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Step 4: Model Evaluation

In [None]:
# Confusion matrix is computed here and reused by the heatmap in the visualization step.
cm = confusion_matrix(y_test, y_pred)

# Overall accuracy on the held-out rows.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}\n")

# Per-class precision / recall / F1 as a text table.
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)

# Step 5: Visualization of Predictions

In [None]:

# Rank the model inputs by the importance the fitted forest assigned to each of them.
importances = model.feature_importances_
feature_df = pd.DataFrame({"feature": features, "importance": importances})
feature_df = feature_df.sort_values("importance", ascending=False)

fig_feat_imp = px.bar(
    feature_df,
    x='importance',
    y='feature',
    orientation='h',
    title="Feature Importance for Wildfire Prediction",
)
fig_feat_imp.write_html("feature_importance.html")

# Heatmap of the confusion matrix computed in the evaluation step, with cell counts shown.
class_names = ["No Fire", "Fire"]
fig_cm = px.imshow(
    cm,
    text_auto=True,
    labels=dict(x="Predicted", y="Actual", color="Count"),
    x=class_names,
    y=class_names,
    title="Confusion Matrix",
)
fig_cm.write_html("confusion_matrix.html")

#### Add Predictions back for visualization

In [None]:
# Assemble one frame per held-out row: the model inputs, the prediction, the true label,
# and the two descriptive columns that are not model features (acq_date, state).
# latitude, longitude and confidence are already feature columns of X_test, so they do
# not need to be copied over again.
df_test = (
    X_test
    .assign(predicted_fire=y_pred, actual_fire=y_test)
    .join(df.loc[y_test.index, ['acq_date', 'state']])
)

# Plot every held-out detection on a map, coloured by the predicted class.
map_options = dict(
    lat="latitude",
    lon="longitude",
    color="predicted_fire",
    hover_data=["state", "confidence", "actual_fire"],
    title="Wildfire Prediction Across US",
    color_continuous_scale=['blue', 'red'],
    zoom=3,
)
fig_map = px.scatter_map(df_test, **map_options)
fig_map.write_html("wildfire_predictions_map.html")

#### Temperature vs Confidence colored by fire prediction

In [None]:
# Scatter of temperature against detection confidence for the held-out rows,
# coloured by the predicted class; the state is shown on hover.
fig_temp = px.scatter(
    data_frame=df_test,
    x="temperature",
    y="confidence",
    color="predicted_fire",
    title="Temperature vs Confidence (Predicted Fire)",
    hover_data=["state"],
)
fig_temp.write_html("temperature_vs_fire.html")

#### Confidence histogram

In [None]:
# Overlaid histograms of detection confidence, one per predicted class.
fig_conf = px.histogram(
    data_frame=df_test,
    x="confidence",
    color="predicted_fire",
    title="Confidence Distribution by Prediction",
    barmode="overlay",
)
fig_conf.write_html("confidence_distribution.html")

#### Pie chart: Day vs Night fire prediction

In [None]:
# The chart is about *predicted* wildfires, so keep only the held-out rows the model
# classified as fire (y_pred == 1). Previously every test row was counted, which showed
# the day/night make-up of the test set rather than of the predictions.
predicted_fire_rows = df.loc[y_test.index[y_pred == 1]]

# Share of predicted fires detected by day vs by night.
fig_pie = px.pie(
    predicted_fire_rows,
    names="daynight",
    title="Day vs Night Wildfire Predictions"
)
fig_pie.write_html("daynight_predictions.html")

# Step 6: Future Trend Prediction using ARIMA (Monthly forecast 2025-2030)

In [None]:
# Tag every detection with its calendar month (stored as a column on df).
df['month'] = df['acq_date'].dt.to_period('M')

# Monthly count of wildfire-labelled detections, indexed by month-start timestamps.
fires_monthly = df[df['wildfire_label'] == 1].groupby('month').size()
fires_monthly.index = fires_monthly.index.to_timestamp()
# groupby only emits months that contain at least one wildfire row, so a fire-free month
# would be missing and ARIMA would treat its neighbours as consecutive observations.
# Reindex onto a complete month-start calendar and count the missing months as 0.
fires_monthly = fires_monthly.asfreq('MS', fill_value=0)

model_arima = ARIMA(fires_monthly, order=(2, 1, 2))
model_fit = model_arima.fit()
future_steps = 72  # 6 years
# A fire count cannot be negative, and a negative value would also be an invalid marker
# size in the forecast map, so floor the forecast at zero.
forecast = model_fit.forecast(steps=future_steps).clip(lower=0)

# Forecast step k lies k months after the last observed month, so date the horizon from
# the end of the series instead of a fixed calendar date (identical to starting at
# 2025-01 when the history ends in 2024-12).
future_months = pd.date_range(
    start=fires_monthly.index[-1] + pd.offsets.MonthBegin(1),
    periods=future_steps,
    freq='MS',
)

#### Build the forecast table (forecasted counts with sampled historical fire locations)

In [None]:
# Draw one historical wildfire location (with replacement, fixed seed) per forecast month.
# The coordinates only give each forecast point a place on the map; they are random
# samples, not location forecasts.
fire_locations = df.loc[df['wildfire_label'] == 1, ['latitude', 'longitude']]
sampled_locs = (
    fire_locations
    .sample(n=future_steps, replace=True, random_state=42)
    .reset_index(drop=True)
)

# One row per forecast month: the month, the forecasted count and the sampled coordinates.
forecast_columns = {
    "Month": future_months,
    "Forecasted_Fires": forecast.values,
    "Latitude": sampled_locs['latitude'],
    "Longitude": sampled_locs['longitude'],
}
forecast_df = pd.DataFrame(forecast_columns)

#### Generated Future Forecasted Data

In [None]:
# Save the forecast table next to the notebook, without the row index.
forecast_csv_path = "future_wildfire_forecast.csv"
forecast_df.to_csv(forecast_csv_path, index=False)

#### wildfire future trend forecast

In [None]:
# Line for the observed monthly wildfire counts.
actual_trace = go.Scatter(
    x=fires_monthly.index,
    y=fires_monthly.values,
    mode='lines',
    name='Actual Fires',
)
# Line for the ARIMA forecast.
forecast_trace = go.Scatter(
    x=forecast_df['Month'],
    y=forecast_df['Forecasted_Fires'],
    mode='lines',
    name='Forecasted Fires',
)

fig_forecast = go.Figure(data=[actual_trace, forecast_trace])
fig_forecast.update_layout(
    title="Wildfire Monthly Forecast (2025-2030)",
    xaxis_title="Date",
    yaxis_title="Wildfires",
)
fig_forecast.write_html("wildfire_future_trend_forecast.html")

#### US Choropleth Map of Historical Wildfires (By State)

In [None]:
# Total number of wildfire-labelled detections per state (wildfire_label is 0/1, so the
# sum is a count).
state_summary = df.groupby('state', as_index=False)['wildfire_label'].sum()

# NOTE(review): locationmode "USA-states" matches on state abbreviations; this assumes
# df['state'] holds two-letter codes. TODO confirm against the source data.
fig_state_map = px.choropleth(
    data_frame=state_summary,
    locations='state',
    locationmode="USA-states",
    scope="usa",
    color='wildfire_label',
    title="Total Wildfires per US State (Historical Data)",
)
fig_state_map.write_html("wildfire_us_state_map.html")

#### Map for Future Forecasted Wildfires

In [5]:
# Map of the forecast table: one marker per forecast month at its sampled location,
# with colour and size both driven by the forecasted fire count.
# Uses px.scatter_map (the same API as the prediction map earlier in the notebook)
# instead of the deprecated px.scatter_mapbox; the tile style is therefore passed as
# map_style rather than via layout.mapbox_style.
fig_future_map = px.scatter_map(
    forecast_df,
    lat="Latitude",
    lon="Longitude",
    color="Forecasted_Fires",
    size="Forecasted_Fires",
    color_continuous_scale="OrRd",
    hover_data=["Month"],
    zoom=3,
    map_style="open-street-map",
    title="Forecasted Wildfires (2025–2030)"
)
fig_future_map.write_html("wildfire_future_map.html")

print("✅ Completed all steps including forecasted wildfire map with lat/lon!")

Accuracy: 0.8370

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.87      0.85     24115
           1       0.85      0.80      0.83     22538

    accuracy                           0.84     46653
   macro avg       0.84      0.84      0.84     46653
weighted avg       0.84      0.84      0.84     46653

✅ Completed all steps including forecasted wildfire map with lat/lon!
