In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer

train_data = pd.read_csv("iith_foml_2023_train.csv")
#train_data = train_data.fillna(train_data.mean())
columns_with_missing = train_data.columns[train_data.isnull().any()].tolist()

imputer = KNNImputer(n_neighbors=1)
train_data_imputed = train_data.copy()
train_data_imputed[columns_with_missing] = imputer.fit_transform(train_data[columns_with_missing])
# changing the data
# train_data.to_csv("train.csv", index=False)
# X_train = train_data.iloc[:, :-1]
# Y_train = train_data.iloc[:, -1]
X_train = train_data_imputed.iloc[:, :-1]
Y_train = train_data_imputed.iloc[:, -1]
X_train_train, X_train_test, Y_train_train, Y_train_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)


In [35]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_train_scaled = scaler.transform(X_train_train)
X_train_test_scaled = scaler.transform(X_train_test)

In [36]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize a dictionary to store accuracy scores for different k values
accuracy_scores = {}
# Test k values from 5 to 19
for k in range(1, 30):
    # Create and train the KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(X_train_train_scaled, Y_train_train)

    # Make predictions on the test set
    Y_train_pred_knn = knn_classifier.predict(X_train_test_scaled)

    # Calculate accuracy and store it in the dictionary
    accuracy = accuracy_score(Y_train_test, Y_train_pred_knn)
    accuracy_scores[k] = accuracy

    print(f"KNN (k={k}) Accuracy: {accuracy * 100:.2f}%")

# Find the k value with the highest accuracy
best_k = max(accuracy_scores, key=accuracy_scores.get)
print(f"Best K value: {best_k} with accuracy {accuracy_scores[best_k] * 100:.2f}%")

KNN (k=1) Accuracy: 80.27%
KNN (k=2) Accuracy: 78.60%
KNN (k=3) Accuracy: 78.93%
KNN (k=4) Accuracy: 76.92%
KNN (k=5) Accuracy: 75.25%
KNN (k=6) Accuracy: 75.25%
KNN (k=7) Accuracy: 75.25%
KNN (k=8) Accuracy: 74.58%
KNN (k=9) Accuracy: 74.92%
KNN (k=10) Accuracy: 73.58%
KNN (k=11) Accuracy: 73.58%
KNN (k=12) Accuracy: 72.58%
KNN (k=13) Accuracy: 72.24%
KNN (k=14) Accuracy: 70.57%
KNN (k=15) Accuracy: 70.23%
KNN (k=16) Accuracy: 69.23%
KNN (k=17) Accuracy: 69.57%
KNN (k=18) Accuracy: 69.23%
KNN (k=19) Accuracy: 71.24%
KNN (k=20) Accuracy: 71.57%
KNN (k=21) Accuracy: 71.24%
KNN (k=22) Accuracy: 71.91%
KNN (k=23) Accuracy: 71.57%
KNN (k=24) Accuracy: 72.24%
KNN (k=25) Accuracy: 71.91%
KNN (k=26) Accuracy: 71.91%
KNN (k=27) Accuracy: 73.24%
KNN (k=28) Accuracy: 72.91%
KNN (k=29) Accuracy: 72.24%
Best K value: 1 with accuracy 80.27%


In [37]:
test_data = pd.read_csv("iith_foml_2023_test.csv")
imputer = KNNImputer(n_neighbors=1)
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)
X_test_scaled = scaler.transform(test_data_imputed)
print(X_test_scaled)

knn_classifier = KNeighborsClassifier(n_neighbors=1)
knn_classifier.fit(X_train_scaled, Y_train)

# Make predictions on the test set
Y_pred_knn = knn_classifier.predict(X_test_scaled)

result_df = pd.DataFrame({
    'Id': test_data.index + 1,
    'Category': Y_pred_knn
})
result_df.to_csv('predictions_wkNN.csv', index=False)


[[-1.38598819  0.80700602 -1.1023188  ... -0.10082205 -1.38358317
   0.45450044]
 [-1.65964207 -0.78038972 -1.38360776 ... -1.26581489 -1.65620604
   1.12184474]
 [ 0.76379727  0.2778741   0.93233802 ... -0.7110564   0.75810279
   0.52673638]
 ...
 [ 0.60601485 -0.38354079  0.80106983 ...  1.59119135  0.60091483
  -1.49282809]
 [ 1.45163    -0.78038972  1.83246269 ... -1.12712526  1.45562436
   0.80410146]
 [ 1.54284796 -0.38354079  1.97310717 ... -0.98843564  1.54649865
   0.90111382]]


In [38]:

# Y_pred_rf = rf_classifier.predict(test_data)

# result_df = pd.DataFrame({
#     'Id': test_data.index + 1,
#     'Category': Y_pred_rf
# })
# result_df.to_csv('predictions.csv', index=False)

In [39]:
file_path1 = "predictions_wkNN.csv"
file_path2 = "1.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_wkNN.csv but not in 1.csv
      Id  Category
16    17         5
21    22         2
23    24         5
25    26         2
27    28         1
..   ...       ...
388  389        14
391  392         0
417  418         1
420  421         0
425  426        17

[104 rows x 2 columns]

Rows in 1.csv but not in predictions_wkNN.csv
      Id  Category
16    17         8
21    22         8
23    24         1
25    26         0
27    28         0
..   ...       ...
388  389         1
391  392        15
417  418         2
420  421         2
425  426         0

[104 rows x 2 columns]


In [40]:
file_path1 = "predictions_wkNN.csv"
file_path2 = "predictions_knn_1.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_wkNN.csv but not in predictions_knn_1.csv
      Id  Category
9     10         0
16    17         5
59    60         0
62    63         0
78    79         1
107  108         5
109  110         3
111  112         5
139  140         0
170  171         5
192  193         1
195  196         0
212  213         1
244  245         1
249  250         1
258  259         2
268  269         0
283  284         1
306  307         1
311  312         2
327  328         0
345  346         1
9     10         1
16    17         2
59    60         2
62    63         1
78    79         3
107  108         2
109  110         1
111  112         2
139  140         2
170  171         0
192  193         3
195  196         6
212  213         0
244  245         6
249  250         5
258  259         0
268  269         2
283  284         2
306  307         0
311  312         1
327  328         3
345  346         2

Rows in predictions_knn_1.csv but not in predictions_wkNN.csv
      Id  Category
9