In [96]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report

# df = pd.read_csv('training_data_fall2024.csv')
df = pd.read_csv('modified_training.csv')

# Step 1: Split the data into features (X) and target (y)
X = df.drop(columns=['increase_stock'])  # Replace 'target' with the actual target column name
y = df['increase_stock']
random_state = 43

In [None]:
# Step 2: Split the dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)


# Step 3.1: Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Feature Selection using SelectKBest (ANOVA F-test)
selector = SelectKBest(f_classif, k=9)  # Select top 9 features based on ANOVA F-test
X_train_kbest = selector.fit_transform(X_train, y_train)
X_test_kbest = selector.transform(X_test)

# Step 5: Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=21, weights='distance', metric='minkowski', p=3)
knn.fit(X_train_kbest, y_train)

# Using the best model from GridSearchCV
y_pred = knn.predict(X_test_kbest)

# Evaluate the model performance
accuracy_test = accuracy_score(y_test, y_pred)  # Accuracy on test set
f1 = f1_score(y_test, y_pred, average='weighted')  # F1 score (weighted average for imbalanced classes)

# Train accuracy
y_train_pred = knn.predict(X_train_kbest)  # Predictions on the training set
accuracy_train = accuracy_score(y_train, y_train_pred)  # Accuracy on training set

print(f'Accuracy of KNN classifier on training set: {accuracy_train:.2f}')
print(f'Accuracy of KNN classifier on test set: {accuracy_test:.2f}')
print(f'F1 Score (weighted) on test set: {f1:.2f}')
print("\nClassification Report on test set:")
print(classification_report(y_test, y_pred))
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
cv_scores = cross_val_score(knn, X_train, y_train, cv=stratified_kfold)
print(f'\nCross-validation scores (5 folds): {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean():.2f}')

Accuracy of KNN classifier on training set: 1.00
Accuracy of KNN classifier on test set: 0.92
F1 Score (weighted) on test set: 0.92

Classification Report on test set:
                  precision    recall  f1-score   support

high_bike_demand       0.81      0.74      0.77        58
 low_bike_demand       0.94      0.96      0.95       262

        accuracy                           0.92       320
       macro avg       0.88      0.85      0.86       320
    weighted avg       0.92      0.92      0.92       320


Cross-validation scores (5 folds): [0.8671875  0.84765625 0.84375    0.87109375 0.859375  ]
Mean cross-validation score: 0.86
