In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('final_merged_weather_data.csv')

# Display the first few rows
print(df.head())


                  time   lat    lon    TMP_sfc     RH_2m  APCP_sfc
0  2002-01-01 09:00:00  21.0  88.00  297.15700  58.58463       NaN
1  2002-01-01 09:00:00  21.0  88.25  297.08203  56.35963       NaN
2  2002-01-01 09:00:00  21.0  88.50  297.03200  54.65963       NaN
3  2002-01-01 09:00:00  21.0  88.75  296.98203  53.65963       NaN
4  2002-01-01 09:00:00  21.0  89.00  297.03200  53.23463       NaN


In [2]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('final_file_point_2003-2024.csv')

# Display the first few rows
print(df.head())

     firedate sourcetype  longitude  latitude    state  district
0  2003-02-03      MODIS     93.411    23.655  MIZORAM  CHAMPHAI
1  2003-02-03      MODIS     92.847    23.695  MIZORAM    AIZAWL
2  2003-02-03      MODIS     92.856    23.696  MIZORAM    AIZAWL
3  2003-02-03      MODIS     92.845    23.704  MIZORAM    AIZAWL
4  2003-02-03      MODIS     92.845    23.704  MIZORAM    AIZAWL


In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load weather data
weather_df = pd.read_csv("final_merged_weather_data.csv")
fire_df = pd.read_csv("final_file_point_2003-2024.csv")

# Convert timestamp to datetime for merging
weather_df['time'] = pd.to_datetime(weather_df['time'])
fire_df['firedate'] = pd.to_datetime(fire_df['firedate'])

# Convert Kelvin to Celsius for temperature
weather_df['TMP_sfc'] = weather_df['TMP_sfc'] - 273.15

# Handling missing precipitation (APCP_sfc) by filling with 0 (assuming no rain)
weather_df['APCP_sfc'] = weather_df['APCP_sfc'].fillna(0)

# Round latitude and longitude to match precision
weather_df['lat'] = weather_df['lat'].round(2)
weather_df['lon'] = weather_df['lon'].round(2)
fire_df['latitude'] = fire_df['latitude'].round(2)
fire_df['longitude'] = fire_df['longitude'].round(2)

# Add fire risk label (1 = Fire, 0 = No Fire)
fire_df['fire_risk'] = 1

# Merge datasets on latitude, longitude, and nearest timestamp
merged_df = pd.merge(weather_df, fire_df, left_on=['lat', 'lon'], right_on=['latitude', 'longitude'], how='left')

# Fill missing fire_risk values (no fire cases)
merged_df['fire_risk'] = merged_df['fire_risk'].fillna(0)

# Select relevant features
merged_df = merged_df[['APCP_sfc', 'RH_2m', 'TMP_sfc', 'fire_risk']]


In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1️⃣ Load the large datasets in chunks (to handle memory issues)
weather_file = "final_merged_weather_data.csv"
fire_file = "final_file_point_2003-2024.csv"

# Use `low_memory=False` to prevent dtype guessing issues
weather_df = pd.read_csv(weather_file, low_memory=False)
fire_df = pd.read_csv(fire_file, low_memory=False)

# 2️⃣ Preprocess: Merge datasets based on latitude, longitude, and date
fire_df['firedate'] = pd.to_datetime(fire_df['firedate'])
weather_df['time'] = pd.to_datetime(weather_df['time'])

# Round lat/lon to match (if necessary)
weather_df["lat"] = weather_df["lat"].round(2)
weather_df["lon"] = weather_df["lon"].round(2)
fire_df["latitude"] = fire_df["latitude"].round(2)
fire_df["longitude"] = fire_df["longitude"].round(2)

# Merge datasets on lat/lon & time
merged_df = fire_df.merge(
    weather_df, 
    left_on=['firedate', 'latitude', 'longitude'], 
    right_on=['time', 'lat', 'lon'], 
    how='left'
)

# Drop NaNs (optional: consider imputation if needed)
merged_df.dropna(subset=['APCP_sfc', 'RH_2m', 'TMP_sfc'], inplace=True)

# Create target column (binary fire risk: 1 if fire occurred, 0 otherwise)
merged_df['fire_risk'] = 1

# 3️⃣ Select features and target
X = merged_df[['APCP_sfc', 'RH_2m', 'TMP_sfc']]
y = merged_df['fire_risk']

# 4️⃣ Train-test split (efficiently load in batches)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5️⃣ Train Random Forest Model (optimized for large datasets)
model = RandomForestClassifier(n_estimators=200, max_depth=15, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# 6️⃣ Evaluate Model Performance
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"🔥 Model Accuracy: {accuracy:.2f}")

# 7️⃣ Save Model
with open("fire_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("✅ Model saved as fire_model.pkl")


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [3]:
print("Merged DF Shape:", merged_df.shape)
print("Merged DF Preview:\n", merged_df.head())


Merged DF Shape: (0, 13)
Merged DF Preview:
 Empty DataFrame
Columns: [firedate, sourcetype, longitude, latitude, state, district, time, lat, lon, TMP_sfc, RH_2m, APCP_sfc, fire_risk]
Index: []


In [4]:
print("🔥 Fire DataFrame Info:")
print(fire_df.info())  # Check column types and missing values

print("🌧️ Weather DataFrame Info:")
print(weather_df.info())  # Check column types and missing values

print("\n📌 Fire Data Sample:")
print(fire_df.head())

print("\n🌍 Weather Data Sample:")
print(weather_df.head())


🔥 Fire DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 757563 entries, 0 to 757562
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   firedate    757563 non-null  datetime64[ns]
 1   sourcetype  757563 non-null  object        
 2   longitude   757563 non-null  float64       
 3   latitude    757563 non-null  float64       
 4   state       757563 non-null  object        
 5   district    757563 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 34.7+ MB
None
🌧️ Weather DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13230016 entries, 0 to 13230015
Data columns (total 6 columns):
 #   Column    Dtype         
---  ------    -----         
 0   time      datetime64[ns]
 1   lat       float64       
 2   lon       float64       
 3   TMP_sfc   float64       
 4   RH_2m     float64       
 5   APCP_sfc  float64       
dtypes: datetime6

In [5]:
import pandas as pd

# 🔹 Load data (if not already loaded)
fire_df = pd.read_csv("final_file_point_2003-2024.csv", parse_dates=['firedate'])
weather_df = pd.read_csv("final_merged_weather_data.csv", parse_dates=['time'])

# ✅ Step 1: Convert timestamps to date format for matching
fire_df['firedate'] = fire_df['firedate'].dt.date  # Remove time part
weather_df['time'] = weather_df['time'].dt.date    # Remove time part

# ✅ Step 2: Keep only weather data from 2003 onwards
weather_df = weather_df[weather_df['time'] >= fire_df['firedate'].min()]

# ✅ Step 3: Round lat/lon for better matching (1 km precision)
fire_df['latitude'] = fire_df['latitude'].round(2)
fire_df['longitude'] = fire_df['longitude'].round(2)
weather_df['lat'] = weather_df['lat'].round(2)
weather_df['lon'] = weather_df['lon'].round(2)

# ✅ Step 4: Drop rows where precipitation is missing
weather_df = weather_df.dropna(subset=['APCP_sfc'])

# ✅ Step 5: Merge datasets (matching date, lat, lon)
merged_df = fire_df.merge(
    weather_df,
    left_on=['firedate', 'latitude', 'longitude'],
    right_on=['time', 'lat', 'lon'],
    how='inner'
)

# ✅ Step 6: Add target column (fire risk = 1 since fire occurred)
merged_df['fire_risk'] = 1

print(f"✅ Merged Data Shape: {merged_df.shape}")
print(merged_df.head())


✅ Merged Data Shape: (127, 13)
     firedate sourcetype  longitude  latitude    state district        time  \
0  2004-04-30      MODIS      92.50      23.0  MIZORAM  LUNGLEI  2004-04-30   
1  2004-04-30      MODIS      92.50      23.0  MIZORAM  LUNGLEI  2004-04-30   
2  2004-04-30      MODIS      92.50      23.0  MIZORAM  LUNGLEI  2004-04-30   
3  2004-04-30      MODIS      92.50      23.0  MIZORAM  LUNGLEI  2004-04-30   
4  2007-02-28      MODIS      92.75      24.0  MIZORAM   AIZAWL  2007-02-28   

    lat    lon  TMP_sfc  RH_2m  APCP_sfc  fire_risk  
0  23.0  92.50      NaN    NaN    2.5000          1  
1  23.0  92.50      NaN    NaN    0.6875          1  
2  23.0  92.50      NaN    NaN    2.0000          1  
3  23.0  92.50      NaN    NaN    0.5625          1  
4  24.0  92.75      NaN    NaN    0.8125          1  


In [9]:
merged_df['TMP_sfc'] = merged_df['TMP_sfc'].fillna(merged_df['TMP_sfc'].mean())
merged_df['RH_2m'] = merged_df['RH_2m'].fillna(merged_df['RH_2m'].mean())
# Save the cleaned dataset
merged_df.to_csv("cleaned_merged_data.csv", index=False)
print("✅ Cleaned dataset saved as cleaned_merged_data.csv")


✅ Cleaned dataset saved as cleaned_merged_data.csv


In [10]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the cleaned merged dataset
merged_df = pd.read_csv("cleaned_merged_data.csv")  # Ensure the cleaned dataset is saved

# Selecting features and target variable
X = merged_df[['TMP_sfc', 'RH_2m', 'APCP_sfc']]
y = merged_df['fire_risk']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model with tuned hyperparameters
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Evaluate model accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"🔥 Model Accuracy: {accuracy:.2f}")

# Save the trained model
with open("fire_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("✅ Model saved as fire_model.pkl")


🔥 Model Accuracy: 1.00
✅ Model saved as fire_model.pkl


In [11]:
import pickle

# Assuming your trained model is named `model`
with open("fire_model.pkl", "wb") as file:
    pickle.dump(model, file)
