In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
# To run for the different dataframes: 

old_df = pd.read_csv(
    Path('2_turbidity.csv')   
)

# Review the DataFrame
old_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Time,LKSPOMET_ATemp,LKSPOMET_F_ATemp,LKSPOMET_TotPrcp,LKSPOMET_F_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_F_Temp,LKSBAWQ_Sal,LKSBAWQ_F_Sal,LKSBAWQ_Depth,LKSBAWQ_F_Depth,LKSBAWQ_pH,LKSBAWQ_F_pH,LKSBAWQ_Turb,LKSBAWQ_F_Turb,Turbidity_Range
0,14454,05/31/2018,13:00,23.9,<0>,0.0,<0>,17.2,<0>,0.1,<0>,1.49,<0>,7.8,<0>,15.0,<0>,</=20
1,14455,05/31/2018,13:15,24.0,<0>,0.0,<0>,17.2,<0>,0.1,<0>,1.51,<0>,7.8,<0>,15.0,<0>,</=20
2,14456,05/31/2018,13:30,23.9,<0>,0.0,<0>,17.3,<0>,0.1,<0>,1.53,<0>,7.8,<0>,15.0,<0>,</=20
3,14457,05/31/2018,13:45,24.2,<0>,0.0,<0>,17.4,<0>,0.1,<0>,1.55,<0>,7.8,<0>,14.0,<0>,</=20
4,14458,05/31/2018,14:00,24.4,<0>,0.0,<0>,17.4,<0>,0.1,<0>,1.58,<0>,7.8,<0>,15.0,<0>,</=20


In [3]:
df = old_df[['Date','Time',
                    'LKSPOMET_ATemp', 
                    'LKSPOMET_TotPrcp',
                    'LKSBAWQ_Temp', 
                    'LKSBAWQ_Sal', 
                    'LKSBAWQ_Depth',
                    'LKSBAWQ_pH',
                     'Turbidity_Range',
                   ]]
df.head(100)

Unnamed: 0,Date,Time,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,Turbidity_Range
0,05/31/2018,13:00,23.9,0.0,17.2,0.1,1.49,7.8,</=20
1,05/31/2018,13:15,24.0,0.0,17.2,0.1,1.51,7.8,</=20
2,05/31/2018,13:30,23.9,0.0,17.3,0.1,1.53,7.8,</=20
3,05/31/2018,13:45,24.2,0.0,17.4,0.1,1.55,7.8,</=20
4,05/31/2018,14:00,24.4,0.0,17.4,0.1,1.58,7.8,</=20
...,...,...,...,...,...,...,...,...,...
95,06/01/2018,12:45,7.7,0.0,16.7,0.1,1.76,8.0,</=20
96,06/01/2018,13:00,7.5,0.0,16.9,0.1,1.77,7.9,</=20
97,06/01/2018,13:15,7.2,0.0,16.9,0.1,1.80,8.0,</=20
98,06/01/2018,13:30,7.5,0.0,16.9,0.1,1.79,7.9,</=20


In [4]:
#print(df['Month'].to_list())

### 2. Separate the features `X` from the target `y`

In [5]:
# Seperate the features, X,  from the target variable, y
y = df['Turbidity_Range']
X = df.drop(columns='Turbidity_Range')

In [6]:
# Preview the features data
X.head()

Unnamed: 0,Date,Time,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH
0,05/31/2018,13:00,23.9,0.0,17.2,0.1,1.49,7.8
1,05/31/2018,13:15,24.0,0.0,17.2,0.1,1.51,7.8
2,05/31/2018,13:30,23.9,0.0,17.3,0.1,1.53,7.8
3,05/31/2018,13:45,24.2,0.0,17.4,0.1,1.55,7.8
4,05/31/2018,14:00,24.4,0.0,17.4,0.1,1.58,7.8


In [7]:
# Preview the first five entries for the target variable
y[:5]

0    </=20
1    </=20
2    </=20
3    </=20
4    </=20
Name: Turbidity_Range, dtype: object

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [8]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [9]:
# Review the features data
X.head()

Unnamed: 0,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,Date_05/11/2022,Date_05/12/2022,Date_05/13/2022,Date_05/14/2019,...,Time_7:30,Time_7:45,Time_8:00,Time_8:15,Time_8:30,Time_8:45,Time_9:00,Time_9:15,Time_9:30,Time_9:45
0,23.9,0.0,17.2,0.1,1.49,7.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24.0,0.0,17.2,0.1,1.51,7.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,23.9,0.0,17.3,0.1,1.53,7.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,24.2,0.0,17.4,0.1,1.55,7.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,24.4,0.0,17.4,0.1,1.58,7.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4. Separate the data into training and testing subsets.

In [10]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### 5. Scale the data using `StandardScaler`

In [11]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

### 6. Instantiate an K Nearest Neighbor Classifier instance.

In [12]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

### 7. Fit the model using the training data.

In [13]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

### 8. Make predictions using the testing data.

In [14]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

### 9. Generate the classification report for the test data.

In [15]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.00      1.00      0.00         0
       </=10       0.93      0.94      0.93      9365
       </=20       0.84      0.83      0.83      4353
       </=30       0.81      0.75      0.78       830
       </=40       0.73      0.65      0.69       230
       </=50       1.00      0.00      0.00        21
       </=60       1.00      0.00      0.00         4
       </=70       1.00      0.00      0.00         2
       </=80       1.00      0.00      0.00         2
       </=90       1.00      0.00      0.00         1

    accuracy                           0.89     14808
   macro avg       0.83      0.42      0.32     14808
weighted avg       0.89      0.89      0.89     14808

