In [1]:
import plotly.express as pe
from plotly.figure_factory import create_distplot
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report


hatchery = 'WALLACE R HATCHERY'
species = 'Coho'
river = 'sky.txt'
def df(hatchery, species, river):
    #Create df from hatchery file
    esc = pd.read_csv('wdfwHatchery6_18.csv', low_memory=False)
    esc['Date'] = pd.to_datetime(esc['Date'])
    esc = esc[['Date', 'Facility', 'Adult Count', 'Species', 'Event']]
    esc = esc[esc.Facility == hatchery]
    esc = esc[esc.Event == 'Trap Estimate']
    esc = esc[esc.Species == species]
    esc = esc.sort_values(by=['Date'])
    esc = esc[esc['Adult Count'] != 0]

    #Creat df from weather file
    wdf = pd.read_csv('weatherMonroe.csv', low_memory=False)
    wdf = wdf.rename(columns={'DATE':'Date'})
    wdf['Date'] = wdf['Date'].astype('datetime64[ns]')

    #Creat df from rivers file
    df = pd.read_csv(river, sep='\t', header=None)
    df = df.rename(columns ={2:"Date",3:"cfs"})
    df = df.drop(columns=[0,1,4],)
    df['Date'] = df['Date'].astype('datetime64[ns]')
    df['cfsDelta1'] = df.cfs.diff(1)
    df['cfsDelta3'] = df.cfs.diff(3)
    df['cfsDelta7'] = df.cfs.diff(7)
    df['cfsDelta14'] = df.cfs.diff(14)

    #Assing binary code for increase/decrease in river levels
    deltas = ['cfsDelta1','cfsDelta3','cfsDelta7','cfsDelta14']
    for delta in deltas:
        df.loc[df[delta] < 0, delta] = 0
        df.loc[df[delta] > 0, delta] = 1

    #Get entire time period for species
    df_time = esc
    first = df_time.iloc[0].Date
    last = df_time.iloc[-1].Date

    #Merge data frames and trim for species date range
    plt_tst = pd.merge(df, esc, on=['Date'], how='outer')
    plt_tst = pd.merge(plt_tst, wdf, on=['Date'], how='outer')
    rt = (plt_tst['Date'] >= first) & (plt_tst['Date'] <= last)
    plt_tst = plt_tst.loc[rt]

    #Some data formatting
    plt_tst['DOY'] = plt_tst['Date'].dt.strftime('%j')
    plt_tst['Month'] = plt_tst['Date'].dt.strftime('%b')
    plt_tst['Month#'] = plt_tst['Date'].dt.strftime('%m')
    plt_tst['Year'] = plt_tst['Date'].dt.strftime('%Y')
    plt_tst['Adult Count'] = plt_tst['Adult Count'].fillna(0)

    plt_tst['returnBinary'] = plt_tst['Adult Count'].astype(np.int64)
    plt_tst.returnBinary[plt_tst.returnBinary > 0] = 1
    plt_tst['Facility'] = hatchery
    plt_tst['Species'] = species
    #Define season for fish
    mask = plt_tst.returnBinary == 1
    season = plt_tst.loc[mask, 'DOY'].tolist()
    season = list(set(season))
    m = plt_tst['DOY'].isin(season)
    plt_tst = plt_tst[m]
    

    plt_tst = plt_tst.dropna(subset=['cfsDelta1','cfsDelta3','cfsDelta7','cfsDelta14','PRCP','TMAX','TMIN'])

    df_out = plt_tst[['Date','returnBinary','cfs','cfsDelta1','cfsDelta3','cfsDelta7','cfsDelta14','PRCP','TMAX','TMIN','Year']]
    return(df_out)

In [2]:
af = df(hatchery,species,river)
af



Unnamed: 0,Date,returnBinary,cfs,cfsDelta1,cfsDelta3,cfsDelta7,cfsDelta14,PRCP,TMAX,TMIN,Year
90,1997-10-01,0,3510.0,1.0,0.0,1.0,0.0,0.46,70.0,52.0,1997
91,1997-10-02,1,4990.0,1.0,1.0,1.0,0.0,0.23,63.0,51.0,1997
92,1997-10-03,0,7390.0,1.0,1.0,1.0,1.0,0.58,61.0,50.0,1997
93,1997-10-04,0,24400.0,1.0,1.0,1.0,1.0,0.73,58.0,54.0,1997
94,1997-10-05,0,18000.0,0.0,1.0,1.0,1.0,0.13,63.0,50.0,1997
...,...,...,...,...,...,...,...,...,...,...,...
8579,2020-11-09,1,4510.0,0.0,0.0,1.0,1.0,0.12,46.0,25.0,2020
8580,2020-11-10,1,4040.0,0.0,0.0,1.0,1.0,0.45,41.0,35.0,2020
8581,2020-11-10,1,4040.0,0.0,0.0,1.0,1.0,0.45,41.0,35.0,2020
8582,2020-11-11,1,3560.0,0.0,0.0,0.0,1.0,0.00,41.0,32.0,2020


In [77]:
df_in = af
# feature_cols = ['cfs','cfsDelta1','cfsDelta3','cfsDelta7','cfsDelta14','PRCP','TMAX','TMIN']
feature_cols = ['cfs','PRCP','TMAX','TMIN'] #independent variables
X = df_in[feature_cols] 
y = df_in.returnBinary #target variable
X_train = df_in[df_in.Year < '2017']
X_train = X_train[feature_cols]
X_test = df_in[df_in.Year > '2017']
X_test = X_test[feature_cols]
y_train = df_in[df_in.Year < '2017']
y_train = y_train.returnBinary
y_test = df_in[df_in.Year > '2017']
y_test = y_test.returnBinary

In [78]:
logreg = LogisticRegression()

logreg.fit(X_train,y_train)

y_pred=logreg.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)



score = logreg.score(X_test, y_test)
print(classification_report(y_test, y_pred))

[[155   0]
 [114   0]]
              precision    recall  f1-score   support

           0       0.58      1.00      0.73       155
           1       0.00      0.00      0.00       114

    accuracy                           0.58       269
   macro avg       0.29      0.50      0.37       269
weighted avg       0.33      0.58      0.42       269



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
