In [1]:
# Import modules
from pathlib import Path
from warnings import simplefilter

# Silence FutureWarning messages before the third-party libraries are imported
simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the turbidity CSV (path is relative to the working directory)
old_df = pd.read_csv(Path('3_summer_turbidity.csv'))

# Show the first five rows as a quick check that the load worked
old_df.head()

Unnamed: 0.1,Unnamed: 0,_c0,Date,Time,LKSPOMET_ATemp,LKSPOMET_F_ATemp,LKSPOMET_TotPrcp,LKSPOMET_F_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_F_Temp,...,LKSBAWQ_Depth,LKSBAWQ_F_Depth,LKSBAWQ_pH,LKSBAWQ_F_pH,LKSBAWQ_Turb,LKSBAWQ_F_Turb,Turbidity_Range,Month,Day,Year
0,0,14498,06/01/2018,2023-06-04 00:00:00,10.3,<0>,0.0,<0>,18.0,<0>,...,1.67,<0>,7.8,<0>,12.0,<0>,</=20,6,1,2018
1,1,14499,06/01/2018,2023-06-04 00:15:00,10.0,<0>,0.0,<0>,17.9,<0>,...,1.69,<0>,7.9,<0>,16.0,<0>,</=20,6,1,2018
2,2,14500,06/01/2018,2023-06-04 00:30:00,9.5,<0>,0.0,<0>,17.8,<0>,...,1.71,<0>,7.9,<0>,17.0,<0>,</=20,6,1,2018
3,3,14501,06/01/2018,2023-06-04 00:45:00,9.2,<0>,0.0,<0>,17.7,<0>,...,1.7,<0>,7.9,<0>,18.0,<0>,</=20,6,1,2018
4,4,14502,06/01/2018,2023-06-04 01:00:00,9.1,<0>,0.0,<0>,17.7,<0>,...,1.68,<0>,7.9,<0>,25.0,<0>,</=30,6,1,2018


In [3]:
# Narrow the raw frame to the date parts, the measurement columns,
# and the Turbidity_Range column
df = old_df[[
    'Month', 'Day', 'Year',
    'LKSPOMET_ATemp',
    'LKSPOMET_TotPrcp',
    'LKSBAWQ_Temp',
    'LKSBAWQ_Sal',
    'LKSBAWQ_Depth',
    'LKSBAWQ_pH',
    'Turbidity_Range',
]]

# Display the leading rows of the reduced frame
df.head(100)

Unnamed: 0,Month,Day,Year,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH,Turbidity_Range
0,6,1,2018,10.3,0.0,18.0,0.1,1.67,7.8,</=20
1,6,1,2018,10.0,0.0,17.9,0.1,1.69,7.9,</=20
2,6,1,2018,9.5,0.0,17.8,0.1,1.71,7.9,</=20
3,6,1,2018,9.2,0.0,17.7,0.1,1.70,7.9,</=20
4,6,1,2018,9.1,0.0,17.7,0.1,1.68,7.9,</=30
...,...,...,...,...,...,...,...,...,...,...
95,6,2,2018,5.8,0.0,15.6,0.1,1.59,7.9,</=20
96,6,2,2018,5.9,0.0,15.6,0.1,1.59,7.9,</=20
97,6,2,2018,6.1,0.0,15.5,0.1,1.62,7.9,</=20
98,6,2,2018,6.1,0.0,15.4,0.1,1.57,7.9,</=20


In [4]:
#print(df['Month'].to_list())

### 2. Separate the features `X` from the target `y`

In [5]:
# Separate the features, X, from the target variable, y
X = df.drop(columns=['Turbidity_Range'])
y = df['Turbidity_Range']

In [6]:
# Show the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,Month,Day,Year,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH
0,6,1,2018,10.3,0.0,18.0,0.1,1.67,7.8
1,6,1,2018,10.0,0.0,17.9,0.1,1.69,7.9
2,6,1,2018,9.5,0.0,17.8,0.1,1.71,7.9
3,6,1,2018,9.2,0.0,17.7,0.1,1.7,7.9
4,6,1,2018,9.1,0.0,17.7,0.1,1.68,7.9


In [7]:
# Show the first five target labels
y.head(5)

0    </=20
1    </=20
2    </=20
3    </=20
4    </=30
Name: Turbidity_Range, dtype: object

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [8]:
# One-hot encode any categorical feature columns with get_dummies
X = pd.get_dummies(data=X)

In [9]:
# Inspect the feature matrix after encoding
X.head(n=5)

Unnamed: 0,Month,Day,Year,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSBAWQ_Sal,LKSBAWQ_Depth,LKSBAWQ_pH
0,6,1,2018,10.3,0.0,18.0,0.1,1.67,7.8
1,6,1,2018,10.0,0.0,17.9,0.1,1.69,7.9
2,6,1,2018,9.5,0.0,17.8,0.1,1.71,7.9
3,6,1,2018,9.2,0.0,17.7,0.1,1.7,7.9
4,6,1,2018,9.1,0.0,17.7,0.1,1.68,7.9


### 4. Separate the data into training and testing subsets.

In [10]:
# Split features and target into training and testing subsets.
# No test_size is passed, so scikit-learn's default proportion applies;
# random_state=1 makes the shuffle reproducible.
# NOTE(review): the split is neither stratified nor time-aware — confirm that
# is acceptable given the uneven class sizes and the sequential readings.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

### 5. Scale the data using `StandardScaler`

In [11]:
# Create the scaler and fit it on the training split only, so the
# testing split does not influence the learned scaling parameters
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split, then to the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### 6. Instantiate a K-Nearest Neighbors classifier.

In [12]:
# Number of neighbors that vote on each prediction; change it here to tune the model
N_NEIGHBORS = 3

# Instantiate the KNeighborsClassifier model
# (the class is imported in the first cell together with the other dependencies)
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

### 7. Fit the model using the training data.

In [13]:
# Fit the classifier on the scaled training features and their labels
knn.fit(X=X_train_scaled, y=y_train)

### 8. Make predictions using the testing data.

In [14]:
# Predict a label for every row of the scaled testing features
y_pred = knn.predict(X=X_test_scaled)

### 9. Generate the classification report for the test data.

In [15]:
# Print per-class precision, recall and F1 for the test labels versus the predictions.
# zero_division=1 reports any undefined metric as 1 instead of emitting a warning.
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=1))

              precision    recall  f1-score   support

       </=10       0.93      0.94      0.93      4767
       </=20       0.89      0.89      0.89      3281
       </=30       0.88      0.81      0.84       361
       </=40       0.90      0.84      0.87       153
       </=50       0.75      0.30      0.43        10

    accuracy                           0.91      8572
   macro avg       0.87      0.76      0.79      8572
weighted avg       0.91      0.91      0.91      8572

