## Imports 

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import tree, svm
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

## Data

In [2]:
bottle1 = pd.read_csv("bottle_data_2010-14.csv")
bottle2 = pd.read_csv("bottle_Data_2015to19.csv", header=None)
cast = pd.read_csv("cast_Data_2015to19.csv")
cc = pd.read_csv('CC_Bottle_Cast.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
bottle = pd.read_csv("bottle.csv", encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
print(bottle1.columns)
bottle1.head()
bottle1.info()

In [None]:
#extracting features we will use
bottle1['nitrogen'] = bottle1['NO3uM'] + bottle1['NO2uM'] + bottle1['NH3uM']
temp = bottle1[['Depthm', 'T_degC', 'SiO3uM', 'PO4uM', 'nitrogen', 'ChlorA']]
temp.dropna(inplace=True)
X = temp[['Depthm', 'T_degC', 'SiO3uM', 'PO4uM', 'nitrogen']]
y = temp['ChlorA']


X.info()


In [16]:
display(X) #inputs
display(y) #targets

Unnamed: 0,Depthm,T_degC,SiO3uM,PO4uM,nitrogen
792576,0,12.140,9.60,0.88,8.780
792577,2,12.140,9.60,0.88,8.780
792578,6,12.140,9.50,0.87,8.770
792579,10,12.150,9.50,0.87,8.790
792580,20,12.150,9.50,0.88,8.720
...,...,...,...,...,...
887002,200,9.989,17.02,1.31,17.450
887013,0,17.790,2.22,0.16,0.122
887014,2,17.790,2.22,0.16,0.100
887015,6,17.278,2.40,0.21,0.240


792576    1.000
792577    1.000
792578    1.060
792579    1.000
792580    0.960
          ...  
887002    0.017
887013    2.123
887014    2.123
887015    2.664
887017    3.306
Name: ChlorA, Length: 58927, dtype: float64

In [None]:
y.isna().sum()

## Trying out different algorithms from sci-kit learn on Bottle 2010-2014

In [None]:
# goal: predict chlorA amount using inputs, and check performance by target - unscaled 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

reg = LinearRegression().fit(X_train, y_train)
print('Linear regression:')
print(reg.score(X_train, y_train))
print(reg.score(X_test, y_test))

tree = tree.DecisionTreeRegressor().fit(X_train,y_train)
print('Decision tree:')
print(tree.score(X_train, y_train))
print(tree.score(X_test, y_test))


svm = svm.SVR().fit(X_train,y_train)
print('SVM:')
print(svm.score(X_train,y_train))
print(svm.score(X_test,y_test))

In [None]:
nn = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)
y_pred = nn.predict(X_test)
print('NN:')
print(nn.score(X_train,y_train))
print(nn.score(X_test, y_test))
print(r2_score(y_test,y_pred))


In [None]:
rf = RandomForestRegressor().fit(X_train, y_train)
print('RF:')
print(rf.score(X_train,y_train))
print(rf.score(X_test, y_test))

In [None]:
rf = RandomForestRegressor().fit(X_train, y_train)
print('RF:')
print(rf.score(X_train,y_train))
print(rf.score(X_test, y_test))

In [None]:
knn = KNeighborsRegressor().fit(X_train, y_train)
print('RF:')
print(knn.score(X_train,y_train))
print(knn.score(X_test, y_test))

In [None]:
# scaled
X_train_t= StandardScaler().fit_transform(X_train)
X_test_t = StandardScaler().fit_transform(X_test)

reg = LinearRegression().fit(X_train_t, y_train)
print('Linear regression:')
print(reg.score(X_train_t, y_train))
print(reg.score(X_test_t, y_test))

tree = tree.DecisionTreeRegressor().fit(X_train_t,y_train)
print('Decision tree:')
print(tree.score(X_train_t, y_train))
print(tree.score(X_test_t, y_test))


svm = svm.SVR().fit(X_train_t,y_train)
print('SVM:')
print(svm.score(X_train_t,y_train))
print(svm.score(X_test_t,y_test))

nn = MLPRegressor(random_state=1, max_iter=500).fit(X_train_t, y_train)
y_pred = nn.predict(X_test) # this is for r2_score
print('NN:')
print(nn.score(X_train_t,y_train))
print(nn.score(X_test_t, y_test))
print(r2_score(y_test_t,y_pred)) #same as above line


## Same algos but Bottle 2010-2019

In [4]:
#extracting features we will use
bottle['nitrogen'] = bottle['NO3uM'] + bottle['NO2uM'] + bottle['NH3uM']
temp = bottle[['Depthm', 'T_degC', 'SiO3uM', 'PO4uM', 'nitrogen', 'ChlorA']]
temp.dropna(inplace=True)
X = temp[['Depthm', 'T_degC', 'SiO3uM', 'PO4uM', 'nitrogen']]
y = temp['ChlorA']


X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58927 entries, 792576 to 887017
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Depthm    58927 non-null  int64  
 1   T_degC    58927 non-null  float64
 2   SiO3uM    58927 non-null  float64
 3   PO4uM     58927 non-null  float64
 4   nitrogen  58927 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 2.7 MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [5]:
# goal: predict chlorA amount using inputs, and check performance by target - unscaled 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [6]:
reg = LinearRegression().fit(X_train, y_train)
print('Linear regression:')
print(reg.score(X_train, y_train))
print(reg.score(X_test, y_test))

tree = tree.DecisionTreeRegressor().fit(X_train,y_train)
print('Decision tree:')
print(tree.score(X_train, y_train))
print(tree.score(X_test, y_test))


svm = svm.SVR().fit(X_train,y_train)
print('SVM:')
print(svm.score(X_train,y_train))
print(svm.score(X_test,y_test))

nn = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)
y_pred = nn.predict(X_test)
print('NN:')
print(nn.score(X_train,y_train))
print(nn.score(X_test, y_test))
print(r2_score(y_test,y_pred))

rf = RandomForestRegressor().fit(X_train, y_train)
print('RF:')
print(rf.score(X_train,y_train))
print(rf.score(X_test, y_test))

knn = KNeighborsRegressor().fit(X_train, y_train)
print('KNN:')
print(knn.score(X_train,y_train))
print(knn.score(X_test, y_test))

Linear regression:
0.17027485904710193
0.16695400863129872
Decision tree:
0.9999999754969974
0.38053934711366033
SVM:
0.1789310928744765
0.16648054769683873
NN:
0.36501000120319094
0.363351226922043
0.363351226922043
RF:
0.94601213994346
0.616950192561204
RF:
0.5821620076922642
0.3299325546804426


In [11]:
# scaled
X_train_t= StandardScaler().fit_transform(X_train)
X_test_t = StandardScaler().fit_transform(X_test)

reg = LinearRegression().fit(X_train_t, y_train)
print('Linear regression:')
print(reg.score(X_train_t, y_train))
print(reg.score(X_test_t, y_test))

#tree = tree.DecisionTreeRegressor().fit(X_train_t,y_train)
#print('Decision tree:')
#print(tree.score(X_train_t, y_train))
#print(tree.score(X_test_t, y_test))


svm = svm.SVR().fit(X_train_t,y_train)
print('SVM:')
print(svm.score(X_train_t,y_train))
print(svm.score(X_test_t,y_test))

nn = MLPRegressor(random_state=1, max_iter=500).fit(X_train_t, y_train)
y_pred = nn.predict(X_test) # this is for r2_score
print('NN:')
print(nn.score(X_train_t,y_train))
print(nn.score(X_test_t, y_test))
print(r2_score(y_test_t,y_pred)) #same as above line

rf = RandomForestRegressor().fit(X_train_t, y_train)
print('RF:')
print(rf.score(X_train_t,y_train))
print(rf.score(X_test_t, y_test))

knn = KNeighborsRegressor().fit(X_train_t, y_train)
print('KNN:')
print(knn.score(X_train_t,y_train))
print(knn.score(X_test_t, y_test))

Linear regression:
0.17027485904710193
0.16696932414268473
SVM:
0.22465970573688987
0.22219723724761295
NN:
0.4881825725856932
0.46367574334096534


NameError: name 'y_test_t' is not defined

In [10]:
et = ExtraTreesRegressor().fit( X_train, y_train)
print('extra trees regressor:')
print(et.score(X_train, y_train))
print(et.score(X_test, y_test))

et = ExtraTreesRegressor().fit( X_train_t, y_train)
print('extra trees regressor, standardized:')
print(et.score(X_train_t, y_train))
print(et.score(X_test_t, y_test))

extra trees regressor:
0.9999999755927401
0.6130708082726
extra trees regressor, standardized:
0.9999999755926477
0.5958338630989947


### Updates so far...
- The best algorithms are: Random Forest Regressor and Extra Trees Regressor
- Unstandarzied data seems better for those algorithms(we wont use the standard scaler)
- They only reach about .61 in test scoring, which we want to be as close to 1 as possible 
- Linear regression , SVM, NN and KNN give trash scores

## Using CC_Bottle data
The hope here is that with this cleaner, larger data with more predictive inputs we can make better predictions and get higher scores in testing. 

In [29]:
cc.head()
cc.shape

(221588, 16)

In [30]:
temp2 = cc[['Depth', 'Temp', 'Silicate', 'Phosphate', 'Nitrogen', 'ChlorA', 
            'Month', 'Year', 'Lat_Dec', 'Lon_Dec']]
temp2.dropna(inplace=True)
X2 = temp2[['Depth', 'Temp', 'Silicate', 'Phosphate', 'Nitrogen', 'Month', 'Year', 'Lat_Dec', 'Lon_Dec']]
y2 = temp2['ChlorA']

print(temp2.shape)

(221588, 10)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.20, random_state=42)

In [28]:
rf = RandomForestRegressor().fit(X_train2, y_train2)
print('random forest regressor training & testing scores:')
print(rf.score(X_train2,y_train2))
print(rf.score(X_test2, y_test2))

et = ExtraTreesRegressor().fit( X_train2, y_train2)
print('extra trees regressor trianing & testing scores:')
print(et.score(X_train2, y_train2))
print(et.score(X_test2, y_test2))

random forest regressor training & testing scores:
0.9682409401904101
0.8066752091338605
extra trees regressor trianing & testing scores:
0.9999999276094511
0.846105616888695


In [32]:
# first attempt to play with parameters, notice that the scores went down.
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=100, max_depth=5, random_state=1).fit( X_train2, y_train2)
print('extra trees regressor trianing & testing scores:')
print(et.score(X_train2, y_train2))
print(et.score(X_test2, y_test2))

extra trees regressor trianing & testing scores:
0.2746054747027029
0.28453982333697936
