In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
df = pd.read_csv('/content/drive/MyDrive/404 STUFF/full_month_data.csv')
df['SqMiles'] = df['Acres'] / 640
df['IG_DATE'] = pd.to_datetime(df['IG_DATE'])

# Set the threshold for longitude
threshold = -110

# Create 'region' column based on 'longitude' values
df['Region'] = np.where(df['Longitude'] > threshold, 'East', 'West')
east_df = df[df['Region'] == 'East']
west_df = df[df['Region'] == 'West']
east_df = east_df[east_df['SqMiles'] < 50]
west_df = west_df[west_df['SqMiles'] < 100]

df = pd.concat([west_df, east_df])

df['State'] = df['Fire_ID']
df['State'] = df['State'].str.replace('\d+', '', regex=True)
df.drop(columns='Fire_ID', inplace=True)
df = df[df.State != 'AK']
df = df[df.State != 'HI']
df = df[df.State != 'PR']

df['Day'] = df['IG_DATE'].dt.day_of_year

df.head()

Unnamed: 0,IG_DATE,AvgTemp,AvgDew,AvgHumidity,TotalPrecip,TotalPrecipCover,AvgWindspeed,AvgPressure,Latitude,Longitude,Acres,SqMiles,Region,State,Day
1428,2012-05-08,69.0,19.158333,17.625,0.0,0.0,18.908333,1009.9,31.362,-110.417,8374,13.084375,West,AZ,129
1429,2011-05-23,66.258333,25.8,24.483333,0.068,4.17,17.6,1008.95,31.379,-110.489,10586,16.540625,West,AZ,143
1430,2011-02-14,43.95,7.691667,25.9,0.0,0.0,13.816667,1017.741667,31.34,-110.739,1961,3.064062,West,AZ,45
1431,2011-04-29,68.841667,23.183333,20.3,0.209,8.33,17.841667,1009.258333,31.335,-111.071,16339,25.529688,West,AZ,119
1432,2002-06-12,78.608333,30.241667,18.116667,0.0,0.0,14.083333,1013.916667,31.346,-111.067,17438,27.246875,West,AZ,163


In [3]:
df = pd.get_dummies(data=df, columns=['Region'])

df.head()

Unnamed: 0,IG_DATE,AvgTemp,AvgDew,AvgHumidity,TotalPrecip,TotalPrecipCover,AvgWindspeed,AvgPressure,Latitude,Longitude,Acres,SqMiles,State,Day,Region_East,Region_West
1428,2012-05-08,69.0,19.158333,17.625,0.0,0.0,18.908333,1009.9,31.362,-110.417,8374,13.084375,AZ,129,0,1
1429,2011-05-23,66.258333,25.8,24.483333,0.068,4.17,17.6,1008.95,31.379,-110.489,10586,16.540625,AZ,143,0,1
1430,2011-02-14,43.95,7.691667,25.9,0.0,0.0,13.816667,1017.741667,31.34,-110.739,1961,3.064062,AZ,45,0,1
1431,2011-04-29,68.841667,23.183333,20.3,0.209,8.33,17.841667,1009.258333,31.335,-111.071,16339,25.529688,AZ,119,0,1
1432,2002-06-12,78.608333,30.241667,18.116667,0.0,0.0,14.083333,1013.916667,31.346,-111.067,17438,27.246875,AZ,163,0,1


In [4]:
def categorize_acres(acres):
    if acres < 1000:
        return 1  # Small
    elif 1000 <= acres <= 10000:
        return 2  # Medium
    elif 10000 < acres <= 50000:
        return 3  # Large
    else:
        return 4  # Very Large

df['Category'] = df['Acres'].apply(categorize_acres)

df.head()

Unnamed: 0,IG_DATE,AvgTemp,AvgDew,AvgHumidity,TotalPrecip,TotalPrecipCover,AvgWindspeed,AvgPressure,Latitude,Longitude,Acres,SqMiles,State,Day,Region_East,Region_West,Category
1428,2012-05-08,69.0,19.158333,17.625,0.0,0.0,18.908333,1009.9,31.362,-110.417,8374,13.084375,AZ,129,0,1,2
1429,2011-05-23,66.258333,25.8,24.483333,0.068,4.17,17.6,1008.95,31.379,-110.489,10586,16.540625,AZ,143,0,1,3
1430,2011-02-14,43.95,7.691667,25.9,0.0,0.0,13.816667,1017.741667,31.34,-110.739,1961,3.064062,AZ,45,0,1,2
1431,2011-04-29,68.841667,23.183333,20.3,0.209,8.33,17.841667,1009.258333,31.335,-111.071,16339,25.529688,AZ,119,0,1,3
1432,2002-06-12,78.608333,30.241667,18.116667,0.0,0.0,14.083333,1013.916667,31.346,-111.067,17438,27.246875,AZ,163,0,1,3


In [5]:
df.dropna(inplace=True)

X = df.drop(columns=['IG_DATE', 'Acres', 'SqMiles', 'State', 'Category'])
y = df.Category

In [6]:
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()

# Split X and y into training and test set in 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Defining the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42, verbose=1)
clf.fit(X_train_scaled, y_train)

# Checking the training and testing accuracy again
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.3s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [9]:
train_accuracy, test_accuracy

(0.7606740411454564, 0.7928896238114923)