In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
old_df = pd.read_csv(
    Path('turbidity_df2.csv')   
)

# Review the DataFrame
old_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Time,LKSPOMET_ATemp,LKSPOMET_F_ATemp,LKSPOMET_TotPrcp,LKSPOMET_F_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_F_Temp,LKSBAWQ_Sal,LKSBAWQ_F_Sal,LKSBAWQ_Depth,LKSBAWQ_F_Depth,LKSBAWQ_pH,LKSBAWQ_F_pH,LKSBAWQ_Turb,LKSBAWQ_F_Turb,Turbidity_Range
0,14454,05/31/2018,13:00,23.9,<0>,0.0,<0>,17.2,<0>,0.1,<0>,1.49,<0>,7.8,<0>,15.0,<0>,</=15
1,14455,05/31/2018,13:15,24.0,<0>,0.0,<0>,17.2,<0>,0.1,<0>,1.51,<0>,7.8,<0>,15.0,<0>,</=15
2,14456,05/31/2018,13:30,23.9,<0>,0.0,<0>,17.3,<0>,0.1,<0>,1.53,<0>,7.8,<0>,15.0,<0>,</=15
3,14457,05/31/2018,13:45,24.2,<0>,0.0,<0>,17.4,<0>,0.1,<0>,1.55,<0>,7.8,<0>,14.0,<0>,</=15
4,14458,05/31/2018,14:00,24.4,<0>,0.0,<0>,17.4,<0>,0.1,<0>,1.58,<0>,7.8,<0>,15.0,<0>,</=15


In [3]:
'LKSPOMET_ATemp', 
#'LKSPOMET_TotPrcp',

df = old_df[['LKSPOMET_ATemp', 
                    'LKSPOMET_TotPrcp',
                    'LKSBAWQ_Temp', 
                    'LKSBAWQ_Sal', 
                    'LKSBAWQ_Depth',
                    'LKSBAWQ_pH',
                     'Turbidity_Range'
                   ]]
df.head()

Unnamed: 0,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,Turbidity_Range
0,23.9,0.0,17.2,0.1,1.49,7.8,</=15
1,24.0,0.0,17.2,0.1,1.51,7.8,</=15
2,23.9,0.0,17.3,0.1,1.53,7.8,</=15
3,24.2,0.0,17.4,0.1,1.55,7.8,</=15
4,24.4,0.0,17.4,0.1,1.58,7.8,</=15


### 2. Separate the features `X` from the target `y`

In [4]:
# Seperate the features, X,  from the target variable, y
y = df['Turbidity_Range']
X = df.drop(columns='Turbidity_Range')

In [5]:
# Preview the features data
X.head()

Unnamed: 0,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH
0,23.9,0.0,17.2,0.1,1.49,7.8
1,24.0,0.0,17.2,0.1,1.51,7.8
2,23.9,0.0,17.3,0.1,1.53,7.8
3,24.2,0.0,17.4,0.1,1.55,7.8
4,24.4,0.0,17.4,0.1,1.58,7.8


In [6]:
# Preview the first five entries for the target variable
y[:5]

0    </=15
1    </=15
2    </=15
3    </=15
4    </=15
Name: Turbidity_Range, dtype: object

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [7]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [8]:
# Review the features data
X.head()

Unnamed: 0,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH
0,23.9,0.0,17.2,0.1,1.49,7.8
1,24.0,0.0,17.2,0.1,1.51,7.8
2,23.9,0.0,17.3,0.1,1.53,7.8
3,24.2,0.0,17.4,0.1,1.55,7.8
4,24.4,0.0,17.4,0.1,1.58,7.8


### 4. Separate the data into training and testing subsets.

In [9]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### 5. Scale the data using `StandardScaler`

In [10]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

### 6. Instantiate an K Nearest Neighbor Classifier instance.

In [11]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

### 7. Fit the model using the training data.

In [12]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

### 8. Make predictions using the testing data.

In [13]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

### 9. Generate the classification report for the test data.

In [14]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.00      1.00      0.00         0
       </=10       0.76      0.82      0.79      5801
       </=15       0.69      0.70      0.69      3656
       </=20       0.59      0.49      0.54       697
       </=25       0.50      0.46      0.48       415
       </=30       0.58      0.58      0.58       415
       </=35       0.45      0.27      0.34       167
       </=40       0.38      0.19      0.25        63
       </=45       0.33      0.12      0.18        16
        </=5       0.78      0.72      0.75      3564
       </=50       1.00      0.00      0.00         5
       </=55       1.00      0.00      0.00         4
       </=65       1.00      0.00      0.00         1
       </=70       1.00      0.00      0.00         1
       </=75       1.00      0.00      0.00         2
       </=85       1.00      0.00      0.00         1

    accuracy                           0.72     14808
   macro avg       0.69   