In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
old_df = pd.read_csv(
    Path('1_summer_turbidity.csv')   
)

# Review the DataFrame
old_df.head()

Unnamed: 0.1,Unnamed: 0,Date,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,LKSBAWQ_Turb,Turbidity_Range,Month,Day,Year
0,0,06/01/2018,7.012903,0.0,16.662366,0.1,1.673441,7.896774,14.473118,</=20,6,1,2018
1,1,06/01/2019,12.39375,0.0,15.069792,0.1,2.066562,7.795833,8.875,</=10,6,1,2019
2,2,06/01/2022,14.470833,0.0,14.061458,0.1,1.229271,7.5,13.21875,</=20,6,1,2022
3,3,06/02/2018,5.244792,0.217708,14.355208,0.1,1.607708,7.89375,10.9375,</=10,6,2,2018
4,4,06/02/2019,10.332292,0.0,15.451042,0.1,2.1125,7.760417,7.916667,</=10,6,2,2019


In [3]:
df = old_df[['Month', 'Day', 'Year',
                    'LKSPOMET_ATemp', 
                    'LKSPOMET_TotPrcp',
                    'LKSBAWQ_Temp', 
                    'LKSBAWQ_Sal', 
                    'LKSBAWQ_Depth',
                    'LKSBAWQ_pH',
                     'Turbidity_Range',
                   ]]
df.head(100)

Unnamed: 0,Month,Day,Year,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,Turbidity_Range
0,6,1,2018,7.012903,0.000000,16.662366,0.1,1.673441,7.896774,</=20
1,6,1,2019,12.393750,0.000000,15.069792,0.1,2.066562,7.795833,</=10
2,6,1,2022,14.470833,0.000000,14.061458,0.1,1.229271,7.500000,</=20
3,6,2,2018,5.244792,0.217708,14.355208,0.1,1.607708,7.893750,</=10
4,6,2,2019,10.332292,0.000000,15.451042,0.1,2.112500,7.760417,</=10
...,...,...,...,...,...,...,...,...,...,...
95,6,30,2022,22.101042,0.000000,19.505208,0.1,1.246458,7.666667,</=20
96,7,1,2018,17.076842,0.371579,19.856842,0.1,1.545158,7.514737,</=40
97,7,1,2019,21.730526,0.015789,19.623158,0.1,2.114632,7.657895,</=20
98,7,1,2022,18.694737,0.000000,19.341053,0.1,1.287474,7.701053,</=20


In [4]:
#print(df['Month'].to_list())

### 2. Separate the features `X` from the target `y`

In [5]:
# Seperate the features, X,  from the target variable, y
y = df['Turbidity_Range']
X = df.drop(columns='Turbidity_Range')

In [6]:
# Preview the features data
X.head()

Unnamed: 0,Month,Day,Year,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH
0,6,1,2018,7.012903,0.0,16.662366,0.1,1.673441,7.896774
1,6,1,2019,12.39375,0.0,15.069792,0.1,2.066562,7.795833
2,6,1,2022,14.470833,0.0,14.061458,0.1,1.229271,7.5
3,6,2,2018,5.244792,0.217708,14.355208,0.1,1.607708,7.89375
4,6,2,2019,10.332292,0.0,15.451042,0.1,2.1125,7.760417


In [7]:
# Preview the first five entries for the target variable
y[:5]

0    </=20
1    </=10
2    </=20
3    </=10
4    </=10
Name: Turbidity_Range, dtype: object

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [8]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [9]:
# Review the features data
X.head()

Unnamed: 0,Month,Day,Year,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH
0,6,1,2018,7.012903,0.0,16.662366,0.1,1.673441,7.896774
1,6,1,2019,12.39375,0.0,15.069792,0.1,2.066562,7.795833
2,6,1,2022,14.470833,0.0,14.061458,0.1,1.229271,7.5
3,6,2,2018,5.244792,0.217708,14.355208,0.1,1.607708,7.89375
4,6,2,2019,10.332292,0.0,15.451042,0.1,2.1125,7.760417


### 4. Separate the data into training and testing subsets.

In [10]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### 5. Scale the data using `StandardScaler`

In [11]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

### 6. Instantiate an K Nearest Neighbor Classifier instance.

In [12]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

### 7. Fit the model using the training data.

In [13]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

### 8. Make predictions using the testing data.

In [14]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

### 9. Generate the classification report for the test data.

In [15]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

       </=10       0.93      0.78      0.85        55
       </=20       0.68      0.90      0.77        30
       </=30       0.60      1.00      0.75         3
       </=40       1.00      0.00      0.00         3

    accuracy                           0.80        91
   macro avg       0.80      0.67      0.59        91
weighted avg       0.84      0.80      0.79        91

