## Imports

In [114]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics

rainTree_file = '../models/output/avg_rain_tree.csv'

# Read the data from the csv file
df1 = pd.read_csv(rainTree_file)
df1.rename(columns = {'Unnamed: 0': 'ID'}, inplace = True)
df1['avg'] = df1['avg'].apply(lambda x: round(x, 4))
df1.loc[df1["DBH-2"] != 1, 'DBH'] = 0
df1.loc[df1["DBH-2"] == 1, 'DBH'] = 1
df1 = df1.drop(['COUNTRY', 'SPECIES NAME', "DBH-1", 'NFI', 'FF', 'BS', "DBH-2", "Latitude", 'Longitude'], axis = 1)

# Show first five rows
df1.reset_index()
display(df1)

Unnamed: 0,ID,avg,DBH
0,0,12.8411,0.0
1,1,12.2536,1.0
2,2,15.3262,1.0
3,3,12.8411,1.0
4,4,16.0190,1.0
...,...,...,...
9378,9378,11.3339,1.0
9379,9379,15.1417,1.0
9380,9380,12.4464,1.0
9381,9381,11.3339,1.0


In [115]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)
df1 = clean_dataset(df1)

## Regression Model

In [116]:
# Initialize Features
X = df1.drop(['DBH', 'ID'], axis = 1)
# Initialize Target
y = df1['DBH']
# Create a model
lr = LinearRegression()
# Fit the model
lr.fit(X, y)
# make predictions
pred = lr.predict(X)

In [117]:
print('Intercept:', lr.intercept_)


Intercept: 1.185680384735373


In [118]:
# Initialize Coefficient
coeff_df = pd.DataFrame(lr.coef_, X.columns,columns=['Coefficient'])
coeff_df

Unnamed: 0,Coefficient
avg,-0.020369


In [119]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y, pred))
print('Mean Squared Error:', metrics.mean_squared_error(y, pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, pred)))
print('R2:', np.sqrt(metrics.r2_score(y, pred)))

Mean Absolute Error: 0.1881248065768289
Mean Squared Error: 0.0940494002391139
Root Mean Squared Error: 0.30667474666022615
R2: 0.09783490350269836


In [120]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [121]:
import statsmodels.api as sm

X = sm.add_constant(X)
reg = sm.OLS(y, X).fit()
reg.summary()

0,1,2,3
Dep. Variable:,DBH,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,90.49
Date:,"Tue, 17 May 2022",Prob (F-statistic):,2.33e-21
Time:,18:03:50,Log-Likelihood:,-2219.2
No. Observations:,9365,AIC:,4442.0
Df Residuals:,9363,BIC:,4457.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.1857,0.031,38.431,0.000,1.125,1.246
avg,-0.0204,0.002,-9.512,0.000,-0.025,-0.016

0,1,2,3
Omnibus:,4494.221,Durbin-Watson:,1.76
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17629.2
Skew:,-2.52,Prob(JB):,0.0
Kurtosis:,7.448,Cond. No.,141.0
