# Machine Learning 1

This notebook samples the data that will be used for model building. Because the full data sets are very large, each file is randomly down-sampled to keep the size manageable.

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

In [2]:
# Folder that holds the monthly parquet files.
# Set the FOREST_DATA_DIR environment variable to point at your own copy of the data;
# the original author's local path is used only as the fallback.
directory_path = os.environ.get(
    'FOREST_DATA_DIR',
    'C:/Users/sixte/University of Toronto/Antoine Pepin - Big Project/Data/Full Final DataFrames',
)

Using 1 % of data means we are still using over 2.7 million rows of data in the model training and validation.

Choose month:

In [3]:
m = 8  # month number to keep; the loading cells compare it with the month parsed from each file name

For train data

In [4]:
# Collect a 1% random sample from every training-year file (2001-2018) of month `m`.
dataframes = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order of the
# collected samples (and therefore any later seeded split) reproducible across machines.
for file_name in sorted(os.listdir(directory_path)):
    if not file_name.endswith('.parquet'):
        continue

    # The first two '_'-separated tokens of the file name are parsed as year and month.
    parts = file_name.split('_')
    year = int(parts[0])
    month = int(parts[1])

    if month != m or not (2001 <= year <= 2018):
        continue

    df = pd.read_parquet(os.path.join(directory_path, file_name))

    # Perform random sampling (1% of the DataFrame, rounded down).
    # NOTE(review): the same random_state is used for every file, so files of equal
    # length presumably yield the same row positions each year — confirm this is intended.
    sample_size = int(0.01 * len(df))
    dataframes.append(df.sample(n=sample_size, random_state=42))

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:/Users/sixte/University of Toronto/Antoine Pepin - Big Project/Data/Full Final DataFrames'

Since the data may contain a trend over the years (i.e. we are working with a time series), the test set is taken as the last three of the 21 years. This is roughly 14 % of the data (3/21), held out and not touched during model training and validation.

Same for test data

In [None]:
# Collect a 1% random sample from every held-out test-year file (2019-2021) of month `m`.
testframes = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order of the
# collected samples reproducible across machines.
for file_name in sorted(os.listdir(directory_path)):
    if not file_name.endswith('.parquet'):
        continue

    # The first two '_'-separated tokens of the file name are parsed as year and month.
    parts = file_name.split('_')
    year = int(parts[0])
    month = int(parts[1])

    if month != m or not (2019 <= year <= 2021):
        continue

    df = pd.read_parquet(os.path.join(directory_path, file_name))

    # Perform random sampling (1% of the DataFrame, rounded down).
    # NOTE(review): the same random_state is used for every file, so files of equal
    # length presumably yield the same row positions each year — confirm this is intended.
    sample_size = int(0.01 * len(df))
    testframes.append(df.sample(n=sample_size, random_state=42))

For train data

In [None]:
# Too many of the values are NaN to be useful in a simple model
columns_drop = ['harvested', 'BS', 'BS%']
numericals = ['age', 'vol', 'lon', 'lat', 'Tm', 'Tx', 'Tn', 'P', 'P%N', 'Pd']

for df in dataframes:
    df[numericals] = df[numericals].astype('float32')
    df.drop(columns=columns_drop, inplace=True)

Same for test data

In [None]:
# Same cleaning as for the training frames: drop the mostly-NaN columns and
# store the numeric columns as float32.
columns_drop = ['harvested', 'BS', 'BS%']
numericals = ['age', 'vol', 'lon', 'lat', 'Tm', 'Tx', 'Tn', 'P', 'P%N', 'Pd']

# Build new frames instead of mutating in place so the cell can be re-run safely:
# errors='ignore' makes the drop a no-op when the columns are already gone
# (the in-place version raised KeyError on a second run).
testframes = [
    df.drop(columns=columns_drop, errors='ignore')
      .astype({col: 'float32' for col in numericals})
    for df in testframes
]

Concat to one

In [None]:
# Stack the per-year samples into one training frame and one test frame.
data, test = (pd.concat(frames) for frames in (dataframes, testframes))

1 % of 15123785 * 18

In [14]:
# Number of sampled training rows
print(data.shape[0])

2722266


1 % of 15123785 * 3

In [15]:
# Number of sampled test rows
print(test.shape[0])

453711


In [16]:
# Preview the first five rows of the training sample
data.head(n=5)

Unnamed: 0_level_0,age,vol,burned,dist_weight,lon,lat,Tm,Tx,Tn,P,P%N,Pd
pixel_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3666906,38.916668,39.54644,0.0,,-91.465622,55.19109,16.4,32.099998,2.0,80.5,,13.0
11952703,97.916664,131.356522,0.0,,-83.761139,49.126392,17.6,33.599998,0.3,,,
12214921,101.916664,134.552734,0.0,,-84.541862,47.475681,16.5,29.299999,5.0,92.400002,100.0,7.0
4125732,84.25,117.746391,0.0,,-92.524483,53.474644,17.799999,32.5,3.5,100.800003,129.0,10.0
3305484,66.083336,92.079544,0.0,,-94.950882,51.561138,18.6,33.200001,2.6,82.5,93.0,12.0


In [None]:
def feature_engineering(data):
    """Select the feature columns used by the model.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns 'age', 'vol', 'Tm', 'Tx', 'Tn' and 'P'.

    Returns
    -------
    pandas.DataFrame
        The six selected columns, in the order listed above.
    """
    feature_selected = ['age', 'vol', 'Tm', 'Tx', 'Tn', 'P']
    return data[feature_selected]


X_raw = feature_engineering(data)
y = data['burned']

# Split BEFORE imputing: fitting the imputer on all rows would let validation rows
# influence the medians used for training (data leakage). The split itself is
# unchanged, since it depends only on the row count and random_state.
X_train, X_val, y_train, y_val = train_test_split(X_raw, y, train_size=0.8, random_state=42)

# Some values are NaN, even in temperature and precipitation, so fill them with the
# per-column median learned from the training rows only.
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

# Keep `X` available as the imputed full feature matrix, as before, now using the
# training-only medians.
X = imputer.transform(X_raw)

In [None]:
# Fix the seed so bootstrap sampling and feature selection are repeatable;
# without it the grid-search ranking and the reported score change on every run.
model = RandomForestClassifier(random_state=42)

In [11]:
# Hyperparameter candidates: 2 depths x 2 leaf sizes = 4 combinations.
param_grid = dict(
    n_estimators=[150],
    max_depth=[10, 20],
    min_samples_split=[4],
    min_samples_leaf=[1, 2],
    max_features=['sqrt'],
    bootstrap=[True],
)

# Exhaustive search with 3-fold cross-validation, scored by F1, using all cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep both the winning settings and the estimator selected by the search.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Hyperparameters:", best_params)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [None]:
# NOTE(review): this fits the untuned `model`, not `best_model`. The prediction cell uses
# `best_model`, which the grid-search cell assigns from `grid_search.best_estimator_`.
# Presumably kept as a baseline — confirm it is still needed, since it is not evaluated below.
model.fit(X_train, y_train)

In [None]:
# Predict validation labels with the estimator selected by the grid search
y_pred = best_model.predict(X_val)

In [None]:
# F1 score of the predictions on the validation split
print(f1_score(y_true=y_val, y_pred=y_pred))