In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
old_df = pd.read_csv(
    Path('6_turbidity.csv')   
)

# Review the DataFrame
old_df.head()

Unnamed: 0,Date,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,LKSBAWQ_Turb,Turbidity_Range
0,05/11/2022,8.36875,0.36875,12.458333,0.1,1.21625,7.70625,11.8125,</=15
1,05/12/2022,10.997917,0.180208,11.485417,0.1,1.15375,7.6125,19.645833,</=20
2,05/13/2022,11.973958,0.060417,11.833333,0.1,1.139271,7.602083,18.802083,</=20
3,05/14/2019,16.433333,0.014815,12.425926,0.1,2.018704,7.616667,8.555556,</=10
4,05/14/2022,15.015625,0.0,13.057292,0.1,1.129479,7.572917,16.416667,</=20


In [3]:
df = old_df[['Date',
                    'LKSPOMET_ATemp', 
                    'LKSPOMET_TotPrcp',
                    'LKSBAWQ_Temp', 
                    'LKSBAWQ_Sal', 
                    'LKSBAWQ_Depth',
                    'LKSBAWQ_pH',
                     'Turbidity_Range',
                   ]]
df.head(100)

Unnamed: 0,Date,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,Turbidity_Range
0,05/11/2022,8.368750,0.368750,12.458333,0.1,1.216250,7.706250,</=15
1,05/12/2022,10.997917,0.180208,11.485417,0.1,1.153750,7.612500,</=20
2,05/13/2022,11.973958,0.060417,11.833333,0.1,1.139271,7.602083,</=20
3,05/14/2019,16.433333,0.014815,12.425926,0.1,2.018704,7.616667,</=10
4,05/14/2022,15.015625,0.000000,13.057292,0.1,1.129479,7.572917,</=20
...,...,...,...,...,...,...,...,...
95,06/22/2022,20.564130,0.000000,19.893478,0.1,1.284130,7.471739,</=15
96,06/23/2019,9.622917,0.092708,17.177083,0.1,2.121979,7.923958,</=10
97,06/23/2021,20.188542,0.000000,18.267708,0.1,1.350729,8.023958,</=10
98,06/23/2022,23.675000,0.000000,20.007292,0.1,1.264687,7.484375,</=15


In [4]:
#print(df['Month'].to_list())

### 2. Separate the features `X` from the target `y`

In [5]:
# Seperate the features, X,  from the target variable, y
y = df['Turbidity_Range']
X = df.drop(columns='Turbidity_Range')

In [6]:
# Preview the features data
X.head()

Unnamed: 0,Date,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH
0,05/11/2022,8.36875,0.36875,12.458333,0.1,1.21625,7.70625
1,05/12/2022,10.997917,0.180208,11.485417,0.1,1.15375,7.6125
2,05/13/2022,11.973958,0.060417,11.833333,0.1,1.139271,7.602083
3,05/14/2019,16.433333,0.014815,12.425926,0.1,2.018704,7.616667
4,05/14/2022,15.015625,0.0,13.057292,0.1,1.129479,7.572917


In [7]:
# Preview the first five entries for the target variable
y[:5]

0    </=15
1    </=20
2    </=20
3    </=10
4    </=20
Name: Turbidity_Range, dtype: object

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [8]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [9]:
# Review the features data
X.head()

Unnamed: 0,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,Date_05/11/2022,Date_05/12/2022,Date_05/13/2022,Date_05/14/2019,...,Date_11/05/2021,Date_11/06/2018,Date_11/06/2021,Date_11/07/2018,Date_11/07/2021,Date_11/08/2021,Date_11/09/2021,Date_11/10/2021,Date_11/11/2021,Date_11/12/2021
0,8.36875,0.36875,12.458333,0.1,1.21625,7.70625,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10.997917,0.180208,11.485417,0.1,1.15375,7.6125,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11.973958,0.060417,11.833333,0.1,1.139271,7.602083,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,16.433333,0.014815,12.425926,0.1,2.018704,7.616667,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,15.015625,0.0,13.057292,0.1,1.129479,7.572917,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4. Separate the data into training and testing subsets.

In [10]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### 5. Scale the data using `StandardScaler`

In [11]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

### 6. Instantiate an K Nearest Neighbor Classifier instance.

In [12]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

### 7. Fit the model using the training data.

In [13]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

### 8. Make predictions using the testing data.

In [14]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

### 9. Generate the classification report for the test data.

In [15]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

       </=10       0.61      0.77      0.68        70
       </=15       0.41      0.37      0.39        30
       </=20       0.50      0.17      0.25         6
       </=25       0.00      0.00      0.00         5
       </=30       0.00      0.00      0.00         2
       </=35       0.00      0.00      0.00         3
        </=5       0.73      0.57      0.64        42

    accuracy                           0.57       158
   macro avg       0.32      0.27      0.28       158
weighted avg       0.56      0.57      0.56       158

