# Streamflow Project


## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import matplotlib
%matplotlib inline
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from datascience import Table
from sklearn.tree import DecisionTreeRegressor
import math
from sklearn.neighbors import KNeighborsClassifier
df = pd.read_csv('ColoradoRiverData.csv')

## Data cleaning

In [2]:
df['datetime'] = df['Date'].apply(lambda x: datetime.strptime(str(x),"%Y-%m"))
df['year'] = df['datetime'].apply(lambda x: int(x.year))
df

Unnamed: 0,Date,ppt (inches),tmin (degrees F),tmean (degrees F),tmax (degrees F),Flowrate (cfs),datetime,year
0,1988-03,0.63,37.7,54.6,71.5,2.680,1988-03-01,1988
1,1988-04,0.69,45.5,62.2,78.8,2.330,1988-04-01,1988
2,1988-05,2.50,57.2,70.4,83.6,4.560,1988-05-01,1988
3,1988-06,2.28,65.9,78.7,91.5,18.200,1988-06-01,1988
4,1988-07,3.62,68.9,79.9,90.8,76.100,1988-07-01,1988
...,...,...,...,...,...,...,...,...
394,2021-01,0.97,32.1,42.4,52.7,0.004,2021-01-01,2021
395,2021-02,0.07,26.6,38.5,50.5,0.000,2021-02-01,2021
396,2021-03,0.56,43.4,57.3,71.2,1.160,2021-03-01,2021
397,2021-04,1.02,47.7,60.1,72.4,0.410,2021-04-01,2021


In [3]:
df['Flowstate'] = (df['Flowrate (cfs)']>0).astype('uint8')

In [4]:
df.fillna(0, inplace=True)

## Exploratory Data Analysis

In [5]:
df90 = df.loc[(df["year"]>=1990)&(df["year"]<2000)]
print("Max flowrate for 90's:", max(df90["Flowrate (cfs)"]))
print("Number of no flow days in the 90's:",(df90["Flowrate (cfs)"] == 0).sum().sum())

Max flowrate for 90's: 263.2
Number of no flow days in the 90's: 43


In [6]:
df00 = df.loc[(df["year"]>=2000)&(df["year"]<2010)]
print("Max flowrate for 2000's:", max(df00["Flowrate (cfs)"]))
print("Number of no flow days in the 2000's:",(df00["Flowrate (cfs)"] == 0).sum().sum())

Max flowrate for 2000's: 179.3
Number of no flow days in the 2000's: 23


In [7]:
df10 = df.loc[(df["year"]>=2010)&(df["year"]<2020)]
print("Max flowrate for 2010's:", max(df10["Flowrate (cfs)"]))
print("Number of no flow days in the 2010's:",(df10["Flowrate (cfs)"] == 0).sum().sum())

Max flowrate for 2010's: 709.9
Number of no flow days in the 2010's: 52


In [8]:
plt.scatter(df['ppt (inches)'],df['Flowrate (cfs)'])
plt.xlabel("Precipitation by inches")
plt.ylabel("Flowrate")
plt.show()
plt.scatter(df['tmean (degrees F)'],df['Flowrate (cfs)'])
plt.xlabel("Temperature (F)")
plt.ylabel("Flowrate")
plt.show()

  plt.show()
  plt.show()


## Flowrate prediction model 1

In [9]:
df['LastMonthFlowrate'] = df['Flowrate (cfs)'].shift(1).bfill()

In [10]:
columns = []
for i,j in zip(df.columns,df.dtypes):
    print(i,j)
    if "float" in str(j) or "int" in str(j):
            columns.append(i)
print(columns)

Date object
ppt (inches) float64
tmin (degrees F) float64
tmean (degrees F) float64
tmax (degrees F) float64
Flowrate (cfs) float64
datetime datetime64[ns]
year int64
Flowstate uint8
LastMonthFlowrate float64
['ppt (inches)', 'tmin (degrees F)', 'tmean (degrees F)', 'tmax (degrees F)', 'Flowrate (cfs)', 'year', 'Flowstate', 'LastMonthFlowrate']


In [11]:
x = df[columns]
y = df['Flowrate (cfs)']
x.drop(['Flowrate (cfs)','Flowstate','year'], axis='columns', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.75,random_state = 0)
print(len(x_train))
print(len(y_train))
print(len(x_test))
print(len(y_test))

299
299
100
100


In [13]:
model1 = DecisionTreeRegressor( max_depth=5, min_samples_leaf=1,)
model1.fit(x_train,y_train)
predictions = model1.predict(x_test)
print("Accuracy:" , model1.score(x_test, y_test))

Accuracy: 0.6069662758333196


In [14]:
i = input("Enter ppt, tmin, tmean, tmax, last month's flowrate - seperated by commas: ").split(",")
r = model1.predict([(i)])
print("Estimated Flowrate:", r)

Enter ppt, tmin, tmean, tmax, last month's flowrate - seperated by commas:  3,4,56,9,65


Estimated Flowrate: [17.30346939]


### Testing flowrate model

In [15]:
testnumbers = pd.read_csv('Testnumbers.csv')
testnumbers

Unnamed: 0,ppt,tmin (degrees F),tmean (degrees F),tmax (degrees F),LastMonthFlowrate
0,0.0,85,99,113,0.0
1,4.5,75,85,95,74.5
2,2.2,0,10,20,55.0
3,1.0,40,60,80,36.3
4,0.0,40,60,80,12.0


In [16]:
for i in range(0,5):
    prediction = model1.predict([testnumbers.iloc[i]])
    print(i,prediction)

0 [1.08282723]
1 [5.967]
2 [17.30346939]
3 [5.35552]
4 [1.08282723]


### Flowrate prediction model 2

In [17]:
model2 = linear_model.LinearRegression( fit_intercept=True)
model2.fit(x_train, y_train)
print(model2.score(x_test,y_test))

0.39266971804642825


## Flowstate prediction model 1

In [18]:
x = df[columns]
y = df['Flowstate']
x.drop(['Flowrate (cfs)', 'Flowstate','year','LastMonthFlowrate'], axis='columns', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [19]:
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.75,random_state=0)

In [20]:
model3 = LogisticRegression()
model3.fit(x_train,y_train)
predict = model3.predict(x_test)
print("Accuracy:",model3.score(x_test, y_test))

Accuracy: 0.79


In [21]:
i = input("Enter ppt, tmin, tmean, tmax, - seperated by commas: ").split(",")
r = model3.predict([(i)])
print("Estimated Flowstate:", r)

Enter ppt, tmin, tmean, tmax, - seperated by commas:  3,3,4,5


Estimated Flowstate: [1]


  return f(*args, **kwargs)


### Testing flowstate model

In [22]:
testnumbers.drop('LastMonthFlowrate', axis='columns', inplace=True)
for i in range(0,5):
    prediction = model3.predict([testnumbers.iloc[i]])
    print(i,prediction)

0 [0]
1 [1]
2 [1]
3 [0]
4 [0]


### Flowstate prediction model's 2 & 3

In [23]:
model4 = RandomForestClassifier(n_estimators=100)
model4.fit(x_train, y_train)
predictions = model4.predict(x_test)
print(model4.score(x_test, y_test))

0.77


In [25]:
model5 = KNeighborsClassifier(n_neighbors=10)
model5.fit(x_train, y_train)
predictions = model5.predict(x_test)
print(model5.score(x_test,y_test))

0.73
