<a href="https://colab.research.google.com/github/david-garza/final_project/blob/ml_refinement/machine_learning/ml_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# to import log function to transform y variable
import numpy as np

# Added SQLalchemy
import sqlalchemy as db
from config import password

# Setup Database Connection

In [2]:
# create the connection to the PostgreSQL database.
db_string = f"postgresql://postgres1:{password}@final-project-database.crwsgvv9ibw0.us-east-1.rds.amazonaws.com:5432/final_project_db"
con = db.create_engine(db_string).connect()

  """)


# Import Database Table

In [3]:
data_df = pd.read_sql_table("galveston_bacteria_data",con)
data_df.head()

Unnamed: 0,beach_id,beach_name,start_lat,start_long,end_lat,end_long,waterbody_type,station_id,station_name,bacteria_count,date1,avg_temp1,max_temp1,min_temp1,precipitation1,precipitation54,precipitation18
0,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,40.0,2007-01-22,52.0,53.0,50.0,0.0,,
1,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,38.0,2007-01-22,52.0,53.0,50.0,0.0,,
2,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,58.0,2007-01-22,52.0,53.0,50.0,0.0,,
3,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,48.0,2007-01-22,52.0,53.0,50.0,0.0,,
4,TX974690,Jamaica Beach,29.182981,-94.969426,29.176498,-94.980493,Open Coast,GAL014,Jamaica Beach South,64.0,2007-01-22,52.0,53.0,50.0,0.0,,


In [5]:
# Create DF of bactiera counts and basic weather station 1 data only
columns=["bacteria_count","avg_temp1","max_temp1","min_temp1","precipitation1"]
basic_df = data_df[columns]
basic_df.head()

Unnamed: 0,bacteria_count,avg_temp1,max_temp1,min_temp1,precipitation1
0,40.0,52.0,53.0,50.0,0.0
1,38.0,52.0,53.0,50.0,0.0
2,58.0,52.0,53.0,50.0,0.0
3,48.0,52.0,53.0,50.0,0.0
4,64.0,52.0,53.0,50.0,0.0


In [6]:
basic_df.shape

(29743, 5)

In [7]:
basic_df.dropna(inplace=True)
basic_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


(29248, 5)

# Preprocessing 
## View Data Types


In [8]:
basic_df.dtypes

bacteria_count    float64
avg_temp1         float64
max_temp1         float64
min_temp1         float64
precipitation1    float64
dtype: object

## Seperate data into Training and Features

In [9]:
y=basic_df["bacteria_count"]
X=basic_df.drop("bacteria_count",1)
print(y.shape)
print(X.shape)

(29248,)
(29248, 4)


  


## Split Data Into Training and Testing

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

## Scale Data

### Scale Features Like Normal

In [11]:
# Start an instance of Standard Scaler()
scale=StandardScaler()

In [12]:
# Fit scaler data
scale.fit(X_train)



StandardScaler()

In [13]:
# Apply scaling to freature data
X_train_scale = scale.transform(X_train)
X_test_scale = scale.transform(X_test)



In [14]:
# Convert scaled array back to df to get feature names back
X_train_scale_df = pd.DataFrame(X_train_scale,columns=X.columns)
X_test_scale_df = pd.DataFrame(X_test_scale,columns=X.columns)

### Scale the y variable

In [15]:
y_train.describe()

count    21936.000000
mean        52.318988
std        452.673105
min          1.000000
25%          5.000000
50%         10.000000
75%         25.840000
max      24200.000000
Name: bacteria_count, dtype: float64

In [16]:
y_train_log = np.log(y_train+1)

In [17]:
y_train_log.describe()

count    21936.000000
mean         2.606150
std          1.233294
min          0.693147
25%          1.791759
50%          2.397895
75%          3.289893
max         10.094149
Name: bacteria_count, dtype: float64

In [18]:
y_test_log = np.log(y_test+1)

# Model 1

Basic model only uses bacteria counts and one weather station

In [19]:
# Setup the instance of the linear regression model, find intercept is false since X values are normalized
lr_model=LinearRegression()

In [20]:
# Fit the lr_model with the scaled features and y-variable
lr_model.fit(X_train_scale_df,y_train_log)



LinearRegression()

## Model Coefficients

In [21]:
# Return the coefficeints of the linear model
base_coef = lr_model.coef_
pd.DataFrame(base_coef.reshape(1,4),columns=X.columns)

Unnamed: 0,avg_temp1,max_temp1,min_temp1,precipitation1
0,0.435268,-0.637812,0.331752,0.342874


## R-squared scores

In [22]:
# Measure the R-squred value for the model using the training data, test to see if any realtionship was detected
lr_model.score(X_train_scale_df,y_train_log)



0.10778724992425082

In [23]:
lr_model.score(X_test_scale_df,y_test_log)



0.1012132278348995

## Reisduals

In [24]:
# First predict the values y_hat values for both the trained and test sets
y_hat_train = np.expm1(lr_model.predict(X_train_scale_df))
y_hat_test = np.expm1(lr_model.predict(X_test_scale_df))



In [25]:
# compute the residuals
residual_train = y_hat_train-y_train
residual_test = y_hat_test - y_test

In [26]:
# Import Plotly
import plotly.express as px

In [27]:
# Plot residuals
fig = px.scatter(x=y_train, y=residual_train,labels=dict(x="Actual",y="Residual"),title="Residuals of Training Data")
fig.show()

In [28]:
fig = px.scatter(x=y_test, y=residual_test,labels=dict(x="Actual",y="Residual"),title="Residuals of Testing Data")
fig.show()

# Model 2

Include the beach testing station as a feature

In [48]:
# Create DF of bactiera counts and basic weather station 1 data only and the test station_id
columns=["bacteria_count","avg_temp1","max_temp1","min_temp1","precipitation1","station_id"]
basic_sta_df = data_df[columns]
basic_sta_df.dropna(inplace=True)
basic_sta_df.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,bacteria_count,avg_temp1,max_temp1,min_temp1,precipitation1,station_id
0,40.0,52.0,53.0,50.0,0.0,GAL005
1,38.0,52.0,53.0,50.0,0.0,GAL005
2,58.0,52.0,53.0,50.0,0.0,GAL007
3,48.0,52.0,53.0,50.0,0.0,GAL007
4,64.0,52.0,53.0,50.0,0.0,GAL014


## Preprocessing

In [49]:
basic_sta_df_encode = pd.get_dummies(basic_sta_df)
basic_sta_df_encode.head()

Unnamed: 0,bacteria_count,avg_temp1,max_temp1,min_temp1,precipitation1,station_id_GAL001,station_id_GAL003,station_id_GAL005,station_id_GAL007,station_id_GAL013,...,station_id_GAL042,station_id_GAL044,station_id_GAL045,station_id_GAL046,station_id_GAL047,station_id_GAL048,station_id_GAL049,station_id_GAL050,station_id_GAL053,station_id_GAL055
0,40.0,52.0,53.0,50.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,38.0,52.0,53.0,50.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,58.0,52.0,53.0,50.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,48.0,52.0,53.0,50.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,64.0,52.0,53.0,50.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Seperate data into Training and Features

In [50]:
y=basic_sta_df_encode["bacteria_count"]
X=basic_sta_df_encode.drop("bacteria_count",1)
print(y.shape)
print(X.shape)

(29248,)
(29248, 40)



In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



## Split Data Into Training and Testing

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

## Scale Data

### Scale Features Like Normal

In [52]:
# Fit scaler data
scale.fit(X_train)


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name', 'str']. An error will be raised in 1.2.



StandardScaler()

In [53]:
# Apply scaling to freature data
X_train_scale = scale.transform(X_train)
X_test_scale = scale.transform(X_test)


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name', 'str']. An error will be raised in 1.2.


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name', 'str']. An error will be raised in 1.2.



In [54]:
# Convert scaled array back to df to get feature names back
X_train_scale_df = pd.DataFrame(X_train_scale,columns=X.columns)
X_test_scale_df = pd.DataFrame(X_test_scale,columns=X.columns)

### Scale the y variable

In [55]:
y_train.describe()

count    21936.000000
mean        52.318988
std        452.673105
min          1.000000
25%          5.000000
50%         10.000000
75%         25.840000
max      24200.000000
Name: bacteria_count, dtype: float64

In [56]:
y_train_log = np.log(y_train+1)

In [57]:
y_train_log.describe()

count    21936.000000
mean         2.606150
std          1.233294
min          0.693147
25%          1.791759
50%          2.397895
75%          3.289893
max         10.094149
Name: bacteria_count, dtype: float64

In [58]:
y_test_log = np.log(y_test+1)

## Fit the model

In [59]:
# Fit the lr_model with the scaled features and y-variable
lr_model.fit(X_train_scale_df,y_train_log)


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name', 'str']. An error will be raised in 1.2.



LinearRegression()

## Model Coefficients

In [61]:
# Return the coefficeints of the linear model
base_coef = lr_model.coef_
pd.DataFrame(base_coef.reshape(1,40),columns=X.columns)

Unnamed: 0,avg_temp1,max_temp1,min_temp1,precipitation1,station_id_GAL001,station_id_GAL003,station_id_GAL005,station_id_GAL007,station_id_GAL013,station_id_GAL014,...,station_id_GAL042,station_id_GAL044,station_id_GAL045,station_id_GAL046,station_id_GAL047,station_id_GAL048,station_id_GAL049,station_id_GAL050,station_id_GAL053,station_id_GAL055
0,0.433181,-0.626688,0.322637,0.330179,-0.039339,-0.020449,-0.036716,-0.023857,-0.025867,-0.016822,...,0.012967,0.021519,0.024169,0.048535,0.031098,0.019964,0.01528,0.011745,-0.024085,-0.028871


## R-squared scores

In [62]:
# Measure the R-squred value for the model using the training data, test to see if any realtionship was detected
lr_model.score(X_train_scale_df,y_train_log)


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name', 'str']. An error will be raised in 1.2.



0.13010401960172757

In [63]:
lr_model.score(X_test_scale_df,y_test_log)


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name', 'str']. An error will be raised in 1.2.



0.11626411436319606

## Reisduals

In [64]:
# First predict the values y_hat values for both the trained and test sets
y_hat_train = np.expm1(lr_model.predict(X_train_scale_df))
y_hat_test = np.expm1(lr_model.predict(X_test_scale_df))


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name', 'str']. An error will be raised in 1.2.


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name', 'str']. An error will be raised in 1.2.



In [65]:
# compute the residuals
residual_train = y_hat_train-y_train
residual_test = y_hat_test - y_test

In [66]:
# Plot residuals
fig = px.scatter(x=y_train, y=residual_train,labels=dict(x="Actual",y="Residual"),title="Residuals of Training Data")
fig.show()

In [67]:
fig = px.scatter(x=y_test, y=residual_test,labels=dict(x="Actual",y="Residual"),title="Residuals of Testing Data")
fig.show()