# Modeling

## Import Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Raw CSV hosted on GitHub. The URL targets a branch head ("refs/heads/..."),
# so the file it resolves to can change whenever that branch is updated.
url = "https://raw.githubusercontent.com/AsmitC/Austin_Trees/refs/heads/victournguyen/purpleair-pull-data/full_data.csv"
# index_col=0 uses the first CSV column as the row index instead of a data column.
full_data = pd.read_csv(url, index_col=0)

In [3]:
# Work on an independent copy: a plain `data = full_data` only creates a second name
# for the same DataFrame, so the column assignments made later in the notebook
# (parsed 'time_stamp', new 'season') would silently modify `full_data` as well.
data = full_data.copy()
data.shape

(139802, 17)

In [4]:
# Preview the first five rows to sanity-check the loaded columns and values.
data.head()

Unnamed: 0,time_stamp,humidity,temperature,pressure,pm2.5_alt,pm2.5_atm,sensor_index,latitude,longitude,altitude,Oak,Elm,Pecan,Crape Myrtle,Ashe Juniper,other,total_diameter
0,2024-10-24T18:00:00Z,35.447,92.044,981.676,2.5,3.2195,2862,30.366156,-97.76529,899.0,1192,426,2,49,10,130,22595.5
1,2024-10-24T06:00:00Z,61.95,78.405,984.603,3.6,5.647,2862,30.366156,-97.76529,899.0,1192,426,2,49,10,130,22595.5
2,2024-10-24T00:00:00Z,45.133,83.672,985.485,3.1,4.76,2862,30.366156,-97.76529,899.0,1192,426,2,49,10,130,22595.5
3,2024-10-23T00:00:00Z,48.216,83.123,986.001,2.6,3.9515,2862,30.366156,-97.76529,899.0,1192,426,2,49,10,130,22595.5
4,2024-10-23T18:00:00Z,33.578,93.095,985.318,2.5,3.2565,2862,30.366156,-97.76529,899.0,1192,426,2,49,10,130,22595.5


## Make Season Variable

In [5]:
# Convert the 'time_stamp' strings (e.g. "2024-10-24T18:00:00Z") to pandas datetimes
# so the month and day can be read when assigning seasons below.
# NOTE(review): the values parse as UTC, so the season is derived from the UTC
# calendar date rather than local time — confirm this is intended.
data['time_stamp'] = pd.to_datetime(data['time_stamp'])

# Define a function to assign seasons based on specific start dates
def get_accurate_season(date):
    """Return the season name for a date, using fixed calendar boundaries.

    Boundaries (start dates, inclusive):
        Spring - March 20
        Summer - June 21
        Fall   - September 22
        Winter - December 21 (runs through March 19 of the next year)

    Parameters
    ----------
    date : object exposing ``.month`` and ``.day`` (e.g. ``pd.Timestamp``)
        The date to classify. Missing values (NaN / NaT) are accepted.

    Returns
    -------
    str or float
        'Winter', 'Spring', 'Summer' or 'Fall'; ``np.nan`` when ``date`` is missing.
    """
    # Missing timestamps cannot be classified
    if pd.isna(date):
        return np.nan

    # Tuples compare element by element, so (month, day) pairs order chronologically
    # within a calendar year.
    month_day = (date.month, date.day)

    if (3, 20) <= month_day < (6, 21):
        return 'Spring'
    if (6, 21) <= month_day < (9, 22):
        return 'Summer'
    if (9, 22) <= month_day < (12, 21):
        return 'Fall'

    # Everything left lies in Dec 21 - Dec 31 or Jan 1 - Mar 19
    return 'Winter'

# Label every reading with its season (NaN where the timestamp is missing)
data['season'] = data['time_stamp'].map(get_accurate_season)

# Preview the timestamps next to their assigned seasons
data.loc[:, ['time_stamp', 'season']].head()

Unnamed: 0,time_stamp,season
0,2024-10-24 18:00:00+00:00,Fall
1,2024-10-24 06:00:00+00:00,Fall
2,2024-10-24 00:00:00+00:00,Fall
3,2024-10-23 00:00:00+00:00,Fall
4,2024-10-23 18:00:00+00:00,Fall


### Encode Season Variable

In [6]:
# Rows without a season label (missing timestamp) cannot be encoded; drop them first.
data = data.dropna(subset=['season'])

# One-hot encode the 'season' column.
# The four categories are declared explicitly so that season_Fall, season_Spring,
# season_Summer and season_Winter are always created, in this order, even when the
# data covers only part of the year. Encoding the raw strings would emit columns only
# for seasons that actually occur, and the fixed feature list used for modeling would
# then fail with a KeyError.
data = pd.get_dummies(
    data.assign(
        season=pd.Categorical(
            data['season'],
            categories=['Fall', 'Spring', 'Summer', 'Winter'],
        )
    ),
    columns=['season'],
    prefix='season',
)

# Display the first few rows
data.head()

Unnamed: 0,time_stamp,humidity,temperature,pressure,pm2.5_alt,pm2.5_atm,sensor_index,latitude,longitude,altitude,...,Elm,Pecan,Crape Myrtle,Ashe Juniper,other,total_diameter,season_Fall,season_Spring,season_Summer,season_Winter
0,2024-10-24 18:00:00+00:00,35.447,92.044,981.676,2.5,3.2195,2862,30.366156,-97.76529,899.0,...,426,2,49,10,130,22595.5,1,0,0,0
1,2024-10-24 06:00:00+00:00,61.95,78.405,984.603,3.6,5.647,2862,30.366156,-97.76529,899.0,...,426,2,49,10,130,22595.5,1,0,0,0
2,2024-10-24 00:00:00+00:00,45.133,83.672,985.485,3.1,4.76,2862,30.366156,-97.76529,899.0,...,426,2,49,10,130,22595.5,1,0,0,0
3,2024-10-23 00:00:00+00:00,48.216,83.123,986.001,2.6,3.9515,2862,30.366156,-97.76529,899.0,...,426,2,49,10,130,22595.5,1,0,0,0
4,2024-10-23 18:00:00+00:00,33.578,93.095,985.318,2.5,3.2565,2862,30.366156,-97.76529,899.0,...,426,2,49,10,130,22595.5,1,0,0,0


## Random Forest

In [7]:
# List every available column (including the season_* dummies) before choosing features.
data.columns

Index(['time_stamp', 'humidity', 'temperature', 'pressure', 'pm2.5_alt',
       'pm2.5_atm', 'sensor_index', 'latitude', 'longitude', 'altitude', 'Oak',
       'Elm', 'Pecan', 'Crape Myrtle', 'Ashe Juniper', 'other',
       'total_diameter', 'season_Fall', 'season_Spring', 'season_Summer',
       'season_Winter'],
      dtype='object')

In [9]:
# Predictor columns, grouped by origin; the resulting order is
# weather/site readings, then tree inventory, then season indicators.
features = (
    ['humidity', 'temperature', 'pressure', 'altitude']
    + ['Oak', 'Elm', 'Pecan', 'Crape Myrtle', 'Ashe Juniper', 'other', 'total_diameter']
    + ['season_Fall', 'season_Spring', 'season_Summer', 'season_Winter']
)

# Column the model is trained to predict
target = 'pm2.5_alt'

# Keep only rows that are complete for every predictor and for the target
data = data.dropna(subset=[*features, target])

In [10]:
# Feature matrix and target vector for the model
X, y = data.loc[:, features], data.loc[:, target]

In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Configure the forest and fit it on the training rows
# (fit() hands back the fitted estimator, so the two steps can be chained)
rf_regressor = RandomForestRegressor(
    n_estimators=100,      # trees in the forest
    max_depth=None,        # no depth limit is imposed on the trees
    min_samples_split=2,   # smallest node size that may still be split
    min_samples_leaf=1,    # smallest allowed leaf size
    random_state=42,       # fixed seed for reproducibility
).fit(X_train, y_train)

# Predict PM2.5 for the held-out rows
y_pred = rf_regressor.predict(X_test)

# Score the predictions against the held-out targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 28.06538195562027
R^2 Score: 0.39880719794064134
