In [31]:
# Import dependencies
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Wrapper
from sklearn.compose import TransformedTargetRegressor

# Added SQLalchemy
import sqlalchemy as db
from config import password

# Setup Database Connection

In [5]:
# Open a connection to the project's PostgreSQL database (hosted on AWS RDS).
# The password comes from the local `config` module so it is not stored in the notebook.
# NOTE(review): a password containing URL-reserved characters (e.g. "@", "/") would
# need percent-encoding before being embedded in this URL -- confirm it has none.
db_string = f"postgresql://postgres1:{password}@final-project-database.crwsgvv9ibw0.us-east-1.rds.amazonaws.com:5432/final_project_db"
engine = db.create_engine(db_string)
con = engine.connect()

  """)


# Import Database Table

In [6]:
# Load the whole "beach_attributes" table through the open connection
# and preview its first five rows.
beach_attributes_df = pd.read_sql_table(table_name="beach_attributes", con=con)
beach_attributes_df.head(n=5)

Unnamed: 0,beach_id,beach_name,tier,start_lat,start_long,end_lat,end_long,waterbody_type
0,TX710697,25th St.,1,29.298146,-94.777565,29.284662,-94.794776,Open Coast
1,TX214299,45th St.,1,29.284667,-94.79477,29.271917,-94.815865,Open Coast
2,TX486021,61st St.,1,29.271922,-94.815859,29.264091,-94.830244,Open Coast
3,TX327206,Appfel Park,1,29.337451,-94.73301,29.32425,-94.739129,Open Coast
4,TX940700,Caplen,2,29.503046,-94.510477,29.494188,-94.532478,Open Coast


In [7]:
# Load the whole "water_quality" table (one row per bacteria sample result)
# and preview its first five rows.
water_quality_df = pd.read_sql_table(table_name="water_quality", con=con)
water_quality_df.head(n=5)

Unnamed: 0,date,year,beach_id,beach_name,station_id,station_name,identifier,start_time,zone_code,bacteria_count,result_measure_unit,result_analytical_method_identifier,result_analytical_method_name
0,2021-12-27,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211227_90752,09:00:00 AM,CDT,7.94,MPN/100ml,19299,ENTEROLERT
1,2021-12-27,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211227_90752,09:00:00 AM,CDT,20.0,MPN/100ml,19299,ENTEROLERT
2,2021-12-15,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211215_90599,09:00:00 AM,CDT,6.3,MPN/100ml,19299,ENTEROLERT
3,2021-12-15,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211215_90599,09:00:00 AM,CDT,5.0,MPN/100ml,19299,ENTEROLERT
4,2021-12-01,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211201_90435,09:00:00 AM,CDT,7.07,MPN/100ml,19299,ENTEROLERT


In [8]:
# Load the whole "weather_station1" table (daily temperature / precipitation
# records keyed by `date1`) and display it.
weather_station1_df = pd.read_sql_table(table_name="weather_station1", con=con)
weather_station1_df

Unnamed: 0,date1,avg_temp1,max_temp1,min_temp1,precipitation1,snowfall1,snow_depth1
0,1946-08-01,,86.0,77.0,0.00,0.0,0.0
1,1946-08-02,,80.0,78.0,0.00,0.0,0.0
2,1946-08-03,,90.0,80.0,0.00,0.0,0.0
3,1946-08-04,,91.0,81.0,0.00,0.0,0.0
4,1946-08-05,,91.0,80.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...
16978,2022-05-31,86.0,91.0,83.0,0.00,0.0,0.0
16979,2022-06-01,86.0,90.0,81.0,0.00,0.0,0.0
16980,2022-06-02,85.0,93.0,77.0,0.59,0.0,0.0
16981,2022-06-03,83.0,90.0,78.0,0.00,0.0,0.0


# Merge Water Quality and Weather Station 1 on Date

In [10]:
# Left-join every water-quality sample to the weather record of the same day
# (`date` in water quality matches `date1` in the weather table). Samples with
# no matching weather record are kept and get NaN in the weather columns.
bacteria_wx_df = pd.merge(
    water_quality_df,
    weather_station1_df,
    how="left",
    left_on="date",
    right_on="date1",
)
bacteria_wx_df.head(n=5)

Unnamed: 0,date,year,beach_id,beach_name,station_id,station_name,identifier,start_time,zone_code,bacteria_count,result_measure_unit,result_analytical_method_identifier,result_analytical_method_name,date1,avg_temp1,max_temp1,min_temp1,precipitation1,snowfall1,snow_depth1
0,2021-12-27,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211227_90752,09:00:00 AM,CDT,7.94,MPN/100ml,19299,ENTEROLERT,2021-12-27,73.0,77.0,71.0,0.0,0.0,0.0
1,2021-12-27,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211227_90752,09:00:00 AM,CDT,20.0,MPN/100ml,19299,ENTEROLERT,2021-12-27,73.0,77.0,71.0,0.0,0.0,0.0
2,2021-12-15,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211215_90599,09:00:00 AM,CDT,6.3,MPN/100ml,19299,ENTEROLERT,2021-12-15,74.0,79.0,70.0,0.0,0.0,0.0
3,2021-12-15,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211215_90599,09:00:00 AM,CDT,5.0,MPN/100ml,19299,ENTEROLERT,2021-12-15,74.0,79.0,70.0,0.0,0.0,0.0
4,2021-12-01,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211201_90435,09:00:00 AM,CDT,7.07,MPN/100ml,19299,ENTEROLERT,2021-12-01,67.0,77.0,59.0,0.0,0.0,0.0


In [12]:
# Create a DataFrame holding only the bacteria counts and the basic weather data.
# `.copy()` makes basic_df an independent frame instead of a slice of
# bacteria_wx_df, so later clean-up of basic_df cannot trigger pandas'
# SettingWithCopyWarning.
columns=["bacteria_count","avg_temp1","max_temp1","min_temp1","precipitation1"]
basic_df = bacteria_wx_df[columns].copy()
basic_df.head()

Unnamed: 0,bacteria_count,avg_temp1,max_temp1,min_temp1,precipitation1
0,7.94,73.0,77.0,71.0,0.0
1,20.0,73.0,77.0,71.0,0.0
2,6.3,74.0,79.0,70.0,0.0
3,5.0,74.0,79.0,70.0,0.0
4,7.07,67.0,77.0,59.0,0.0


In [13]:
# (rows, columns) of the modelling frame before rows with missing values are removed
basic_df.shape

(42540, 5)

In [35]:
# Remove every row that has a missing value in any of the selected columns.
# Reassigning the result (rather than `inplace=True`) avoids the
# SettingWithCopyWarning pandas raises when a column slice of another frame is
# mutated in place, and keeps this cell safe to re-run.
basic_df = basic_df.dropna()
basic_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


(41967, 5)

# Preprocessing 
## View Data Types


In [36]:
# Check the dtype of each column before modelling
basic_df.dtypes

bacteria_count    float64
avg_temp1         float64
max_temp1         float64
min_temp1         float64
precipitation1    float64
dtype: object

## Separate Data into Target and Features

In [37]:
# Target: the measured bacteria count.
y = basic_df["bacteria_count"]
# Features: every remaining (weather) column. Use the `columns=` keyword --
# passing the axis positionally (`drop(label, 1)`) is deprecated since
# pandas 1.3 and raises a TypeError in pandas 2.x.
X = basic_df.drop(columns="bacteria_count")
print(y.shape)
print(X.shape)

(41967,)
(41967, 4)


  


## Split Data Into Training and Testing

In [38]:
# Hold out part of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Scale Data

### Scale Features the Usual Way (StandardScaler)

In [39]:
# Create the StandardScaler instance used to standardize the feature columns
scale=StandardScaler()

In [40]:
# Fit the scaler on the training features only, so the scaling statistics
# are not computed from the held-out test rows
scale.fit(X_train)



StandardScaler()

In [41]:
# Apply the scaler fitted on the training features to both splits
# (training split first, then test split).
X_train_scale, X_test_scale = (scale.transform(split) for split in (X_train, X_test))



In [42]:
# Convert the scaled arrays back to DataFrames to get the feature names back.
# The original row index is re-attached as well: without it the new frames get
# a fresh 0..n-1 index and no longer line up with y_train / y_test, which would
# silently misalign rows in any later index-based operation (e.g. residuals).
X_train_scale_df = pd.DataFrame(X_train_scale, columns=X.columns, index=X_train.index)
X_test_scale_df = pd.DataFrame(X_test_scale, columns=X.columns, index=X_test.index)

# Modeling

In [43]:
# Set up the linear regression model. No intercept is fitted (fit_intercept=False)
# because the features are standardized before fitting and the target is
# standardized by the TransformedTargetRegressor wrapper, so the training data
# is already centered.
lr_model=LinearRegression(fit_intercept=False)

In [44]:
# Wrap the regressor so the target is standardized before fitting and
# predictions are transformed back to the original bacteria-count scale.
wrapped_model = TransformedTargetRegressor(
    regressor=lr_model,
    transformer=StandardScaler(),
)

In [45]:
# Fit the wrapped model on the scaled training features and the raw training target
# (the wrapper applies the target scaling itself)
wrapped_model.fit(X_train_scale_df,y_train)



TransformedTargetRegressor(regressor=LinearRegression(fit_intercept=False),
                           transformer=StandardScaler())

# Evaluate

## Model Coefficients

In [58]:
# Return the coefficients of the fitted linear model as a one-row table,
# one column per feature. `reshape(1, -1)` infers the number of features from
# the coefficient array instead of hard-coding it, so the cell keeps working
# if the feature list changes.
base_coef = wrapped_model.regressor_.coef_
pd.DataFrame(base_coef.reshape(1, -1), columns=X.columns)

Unnamed: 0,avg_temp1,max_temp1,min_temp1,precipitation1
0,-0.06258,-0.03458,0.111303,0.178431


## R-squared scores

In [46]:
# Measure the R-squared value of the model on the training data,
# to see whether any relationship was detected
wrapped_model.score(X_train_scale_df,y_train)



0.031715102303935594

In [47]:
# Measure the R-squared value of the model on the held-out test data
wrapped_model.score(X_test_scale_df,y_test)



0.016662125623697954

## Residuals