In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split



In [2]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')

data.head(5)
#data.info()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
# mapping "month" and "day" columns into integers
month_mapping = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 
                 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 
                 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
day_mapping = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 
               'fri': 5, 'sat': 6, 'sun': 7 }
data = data.replace({'month': month_mapping, 'day': day_mapping});


data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,3,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,10,2,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,10,6,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,3,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,3,7,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [4]:
data.corr()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
X,1.0,0.539548,-0.065003,-0.024922,-0.021039,-0.048384,-0.085916,0.00621,-0.051258,0.085223,0.018798,0.065387,0.063385
Y,0.539548,1.0,-0.066292,-0.005453,-0.046308,0.007782,-0.101178,-0.024488,-0.024103,0.062221,-0.020341,0.033234,0.044873
month,-0.065003,-0.066292,1.0,-0.050837,0.291477,0.466645,0.868698,0.186597,0.368842,-0.09528,-0.086368,0.013438,0.056496
day,-0.024922,-0.005453,-0.050837,1.0,-0.041068,0.06287,0.000105,0.032909,0.05219,0.092151,0.032478,-0.04834,0.023226
FFMC,-0.021039,-0.046308,0.291477,-0.041068,1.0,0.382619,0.330512,0.531805,0.431532,-0.300995,-0.028485,0.056702,0.040122
DMC,-0.048384,0.007782,0.466645,0.06287,0.382619,1.0,0.682192,0.305128,0.469594,0.073795,-0.105342,0.07479,0.072994
DC,-0.085916,-0.101178,0.868698,0.000105,0.330512,0.682192,1.0,0.229154,0.496208,-0.039192,-0.203466,0.035861,0.049383
ISI,0.00621,-0.024488,0.186597,0.032909,0.531805,0.305128,0.229154,1.0,0.394287,-0.132517,0.106826,0.067668,0.008258
temp,-0.051258,-0.024103,0.368842,0.05219,0.431532,0.469594,0.496208,0.394287,1.0,-0.52739,-0.227116,0.069491,0.097844
RH,0.085223,0.062221,-0.09528,0.092151,-0.300995,0.073795,-0.039192,-0.132517,-0.52739,1.0,0.06941,0.099751,-0.075519


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

# Normalizing the data
scaler = preprocessing.Normalizer()
data[['X','Y','FFMC','DMC','DC','ISI','temp','RH','wind','rain','area']] = scaler.fit_transform(data[['X','Y','FFMC','DMC','DC','ISI','temp','RH','wind','rain','area']])
area = data[["area"]].copy()


#area = np.log(area['area']+1) by taking the logarithm, the r2_score went down
data = data.drop("area", axis=1)

data.head(5)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
0,0.049719,0.035513,3,5,0.612249,0.18609,0.66978,0.036224,0.058242,0.362235,0.047588,0.0
1,0.010336,0.005906,10,2,0.133774,0.05227,0.987953,0.009893,0.026578,0.048726,0.001329,0.0
2,0.010068,0.005753,10,6,0.130314,0.062856,0.987999,0.009637,0.021,0.047465,0.00187,0.0
3,0.050396,0.037797,3,5,0.577668,0.209775,0.488214,0.056696,0.052286,0.611055,0.025198,0.00126
4,0.045308,0.033981,3,7,0.505745,0.290534,0.578803,0.054369,0.064563,0.56068,0.010194,0.0


In [6]:
area.head()

Unnamed: 0,area
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [7]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(data,area, test_size=0.3, random_state=1 )

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(361, 12)
(361, 1)
(156, 12)
(156, 1)


In [8]:
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Using Linear Regression and calculating errors and r2_score
lin = LinearRegression()
lin.fit(X_train, y_train)
lin_pred = lin.predict(X_test)

r2 = r2_score(y_test, lin_pred)

# Evaluate the model
score = explained_variance_score(y_test, lin_pred)
mae = mean_absolute_error(y_test, lin_pred)
mse = mean_squared_error(y_test, lin_pred)


print('r2_score: ', r2)
print('mean absolute error: ', mae)
print('mean squared error: ', mse)
print('Root mean squared error: ', np.sqrt(mse))

r2_score:  0.27305643945314273
mean absolute error:  0.03643686075274298
mean squared error:  0.004610892338252906
Root mean squared error:  0.06790355173518471


In [9]:
# Grid Search and Cross Validation with Linear Regression
model = LinearRegression()
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(model,parameters, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
grid.best_params_

{'copy_X': True, 'fit_intercept': True, 'normalize': False}

In [10]:
# Model evaluation using GridSearchCV with Linear Regression
y_pred = grid.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('r2_score: ', r2)
print('mean absolute error: ', mae)
print('mean squared error: ', mse)

r2_score:  0.27305643945314273
mean absolute error:  0.03643686075274298
mean squared error:  0.004610892338252906


In [11]:
from sklearn.tree import DecisionTreeRegressor

# Using Decision Tree Regressor and calculating errors and r2_score
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

r2 = r2_score(y_test, tree_pred)

# Evaluate the model
score = explained_variance_score(y_test, tree_pred)
mae = mean_absolute_error(y_test, tree_pred)
mse = mean_squared_error(y_test, tree_pred)


print('r2_score: ', r2)
print('mean absolute error: ', mae)
print('mean squared error: ', mse)
print('Root mean squared error: ', np.sqrt(mse))

r2_score:  0.20479234734010576
mean absolute error:  0.03178311694515081
mean squared error:  0.005043881082337811
Root mean squared error:  0.07102028641407898


In [12]:
# Using GridSearchCV with Decision Tree Regressor
tuned_parameters = [{'max_features': range(4,13),
                             'max_depth': range(1,20),
                             }
                            ]

reg = GridSearchCV(tree, tuned_parameters, cv=5, scoring='neg_mean_squared_error')
reg.fit(X_train, y_train)



print("Best parameters set found on development set:\n")
reg.best_params_

Best parameters set found on development set:



{'max_depth': 3, 'max_features': 6}

In [13]:
# Model evaluation using GridSearchCV with Decision Tree Regressor
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('r2_score: ', r2)
print('mean absolute error: ', mae)
print('mean squared error: ', mse)

r2_score:  0.13320868022819365
mean absolute error:  0.03059641561346532
mean squared error:  0.0054979253852848855


In [14]:
from sklearn.neighbors import KNeighborsRegressor

# Using KNeighbors Regressor and calculating errors and r2_score
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

r2 = r2_score(y_test, knn_pred)

# Evaluate the model
score = explained_variance_score(y_test, knn_pred)
mae = mean_absolute_error(y_test, knn_pred)
mse = mean_squared_error(y_test, knn_pred)


print('r2_score: ', r2)
print('mean absolute error: ', mae)
print('mean squared error: ', mse)
print('Root mean squared error: ', np.sqrt(mse))

r2_score:  -0.05626005732583117
mean absolute error:  0.03146906771191448
mean squared error:  0.00669969674380563
Root mean squared error:  0.08185167526572458


In [15]:
from sklearn import neighbors

# Using GridSearchCV with KNeighbors Regressor
params = {'n_neighbors':range(1,20)}

knn = neighbors.KNeighborsRegressor()

knng = GridSearchCV(knn, params, cv=5, scoring='neg_mean_squared_error')
knng.fit(X_train,y_train)
knng.best_params_

{'n_neighbors': 19}

In [16]:
# Model evaluation using GridSearchCV with KNeighbors Regressor
y_pred = knng.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('r2_score: ', r2)
print('mean absolute error: ', mae)
print('mean squared error: ', mse)

r2_score:  -0.02031893121985906
mean absolute error:  0.03168705946464952
mean squared error:  0.006471727652414901


In [17]:
area2 = area.values

In [18]:
from sklearn.ensemble import RandomForestClassifier

min_threshold = np.Inf

lin_max_r2_score = np.Inf
lin_mae_value = np.Inf
lin_mse_value = np.Inf 


tree_max_r2_score = np.Inf
tree_mae_value = np.Inf
tree_mse_value = np.Inf 

knn_max_r2_score = np.Inf
knn_mae_value = np.Inf
knn_mse_value = np.Inf 

# try thresholds from 0 to 1 by incrementing 0.01 each time
for t in np.arange(0, 1.01, 0.01) :
    #create new column for small fires (1 it is a small fire, 0 it is not a small fire)
    small_fire  = []
    for i in range(0,len(data)) :
        if (area2[i] < t) :
            small_fire.append(1)
        else:
            small_fire.append(0)

    # Use RandomForestClassifier for predicting whether it is a small fire or not 
    r_forest = RandomForestClassifier(n_estimators=10)
    r_forest.fit(data, small_fire)
    r_forest_pred = r_forest.predict(data) 
    
    # Add new column to the dataset, this column has the predictions result of a small fire
    data['random_forest'] =  r_forest_pred
    X_train, X_test, y_train, y_test = train_test_split(data,area, test_size=0.3, random_state=1 )
    
    # Trying the regression algorithms again 
    lin = LinearRegression()
    lin.fit(X_train, y_train)
    lin_pred = lin.predict(X_test)

    lin_r2 = r2_score(y_test, lin_pred)
    lin_mae = mean_absolute_error(y_test, lin_pred)
    lin_mse = mean_squared_error(y_test, lin_pred)
    
    
    tree = DecisionTreeRegressor()
    tree.fit(X_train, y_train)
    tree_pred = tree.predict(X_test)

    tree_r2 = r2_score(y_test, tree_pred)
    tree_mae = mean_absolute_error(y_test, tree_pred)
    tree_mse = mean_squared_error(y_test, tree_pred)

    
    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)
    knn_pred = knn.predict(X_test)

    knn_r2 = r2_score(y_test, knn_pred)
    knn_mae = mean_absolute_error(y_test, knn_pred)
    knn_mse = mean_squared_error(y_test, knn_pred)

    #change the values of defined parameters from infinity to numbers 
    if(t==0):
        min_threshold = t
        
        lin_max_r2_score = lin_r2 
        lin_mae_value = lin_mae
        lin_mse_value = lin_mse

        tree_max_r2_score = tree_r2
        tree_mae_value = tree_mae
        tree_mse_value = tree_mse 

        knn_max_r2_score = knn_r2
        knn_mae_value = knn_mae
        knn_mse_value = knn_mse  
        
    # if we have bigger r2_score for the run, then change the values of the parameters
    if(lin_r2 > lin_max_r2_score):
        min_threshold = t
        
        lin_max_r2_score = lin_r2 
        lin_mae_value = lin_mae
        lin_mse_value = lin_mse

        tree_max_r2_score = tree_r2
        tree_mae_value = tree_mae
        tree_mse_value = tree_mse 

        knn_max_r2_score = knn_r2
        knn_mae_value = knn_mae
        knn_mse_value = knn_mse
    
    #drop "random_forest" column from the data
    data = data.drop("random_forest", axis=1)

print('min_threshold: ', min_threshold)
print() 
print('linear r2_score: ', lin_max_r2_score)
print('linear mean absolute error: ', lin_mae_value)
print('linear mean squared error: ', lin_mse_value)
print('linear Root mean squared error: ', np.sqrt(lin_mse_value))
print()   
print('tree r2_score: ', tree_max_r2_score)
print('tree mean absolute error: ', tree_mae_value)
print('tree mean squared error: ', tree_mse_value)
print('tree Root mean squared error: ', np.sqrt(tree_mse_value))
print() 
print('knn r2_score: ', knn_max_r2_score)
print('knn mean absolute error: ', knn_mae_value)
print('knn mean squared error: ', knn_mse_value)
print('knn Root mean squared error: ', np.sqrt(knn_mse_value))
print() 


min_threshold:  0.21

linear r2_score:  0.7335149520808134
linear mean absolute error:  0.02233441103649689
linear mean squared error:  0.0016902740905844155
linear Root mean squared error:  0.04111294310292582

tree r2_score:  0.5079416684824366
tree mean absolute error:  0.024604691116797535
tree mean squared error:  0.003121051088286788
tree Root mean squared error:  0.055866368132238456

knn r2_score:  0.02736588988954569
knn mean absolute error:  0.02691810504854509
knn mean squared error:  0.006169270091419501
knn Root mean squared error:  0.07854470123069729

