In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer
import numpy as np

np.random.seed(42)
train_data = pd.read_csv("iith_foml_2023_train.csv")

columns_with_missing = train_data.columns[train_data.isnull().any()].tolist()

imputer = KNNImputer(n_neighbors=1)
train_data_imputed = train_data.copy()
train_data_imputed[columns_with_missing] = imputer.fit_transform(train_data[columns_with_missing])
X_train = train_data_imputed.iloc[:, :-1]
Y_train = train_data_imputed.iloc[:, -1]
X_train_train, X_train_test, Y_train_train, Y_train_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)


In [7]:
from imblearn.over_sampling import ADASYN
adasyn = ADASYN(random_state=42, n_neighbors=2)
X_train, Y_train = adasyn.fit_resample(X_train, Y_train)
X_train_train, Y_train_train = adasyn.fit_resample(X_train_train,Y_train_train)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 2, n_neighbors = 3

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_train_scaled = scaler.transform(X_train_train)
X_train_test_scaled = scaler.transform(X_train_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize a dictionary to store accuracy scores for different k values
accuracy_scores = {}
# Test k values from 5 to 19
for k in range(1, 30):
    # Create and train the KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=k,metric='manhattan')
    knn_classifier.fit(X_train_train_scaled, Y_train_train)

    # Make predictions on the test set
    Y_train_pred_knn = knn_classifier.predict(X_train_test_scaled)

    # Calculate accuracy and store it in the dictionary
    accuracy = accuracy_score(Y_train_test, Y_train_pred_knn)
    accuracy_scores[k] = accuracy

    print(f"KNN (k={k}) Accuracy: {accuracy * 100:.2f}%")

# Find the k value with the highest accuracy
best_k = max(accuracy_scores, key=accuracy_scores.get)
print(f"Best K value: {best_k} with accuracy {accuracy_scores[best_k] * 100:.2f}%")

KNN (k=1) Accuracy: 78.39%
KNN (k=2) Accuracy: 77.39%
KNN (k=3) Accuracy: 75.38%
KNN (k=4) Accuracy: 75.38%
KNN (k=5) Accuracy: 74.87%
KNN (k=6) Accuracy: 74.87%
KNN (k=7) Accuracy: 72.86%
KNN (k=8) Accuracy: 72.86%
KNN (k=9) Accuracy: 71.86%
KNN (k=10) Accuracy: 70.85%
KNN (k=11) Accuracy: 69.85%
KNN (k=12) Accuracy: 70.35%
KNN (k=13) Accuracy: 70.35%
KNN (k=14) Accuracy: 70.35%
KNN (k=15) Accuracy: 68.84%
KNN (k=16) Accuracy: 68.84%
KNN (k=17) Accuracy: 68.84%
KNN (k=18) Accuracy: 68.84%
KNN (k=19) Accuracy: 68.84%
KNN (k=20) Accuracy: 69.35%
KNN (k=21) Accuracy: 68.34%
KNN (k=22) Accuracy: 68.34%
KNN (k=23) Accuracy: 68.34%
KNN (k=24) Accuracy: 67.84%
KNN (k=25) Accuracy: 66.83%
KNN (k=26) Accuracy: 65.83%
KNN (k=27) Accuracy: 65.83%
KNN (k=28) Accuracy: 65.33%
KNN (k=29) Accuracy: 64.82%
Best K value: 1 with accuracy 78.39%


In [None]:
test_data = pd.read_csv("iith_foml_2023_test.csv")
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)
X_test_scaled = scaler.transform(test_data_imputed)
print(X_test_scaled)

knn_classifier = KNeighborsClassifier(n_neighbors=3,metric='manhattan')
knn_classifier.fit(X_train_scaled, Y_train)

# Make predictions on the test set
Y_pred_knn = knn_classifier.predict(X_test_scaled)

result_df = pd.DataFrame({
    'Id': test_data.index + 1,
    'Category': Y_pred_knn
})
result_df.to_csv('predictions_knn_10.csv', index=False)


[[-1.55291357 -0.15759579 -1.30743515 ... -0.98159926 -1.55069439
  -4.28474601]
 [-1.65104202 -0.70367918 -1.4090902  ... -1.32432988 -1.64858909
  -4.03127658]
 [-0.78203065 -0.33962358 -0.57213025 ... -1.16112482 -0.78164769
  -4.2573095 ]
 ...
 [-0.83860921 -0.56715833 -0.61956928 ... -0.48382384 -0.83809149
  -5.0243766 ]
 [-0.53538347 -0.70367918 -0.24683407 ... -1.28352862 -0.53117836
  -4.15196123]
 [-0.50267399 -0.56715833 -0.19600655 ... -1.24272735 -0.49854679
  -4.11511418]]


In [None]:

# Y_pred_rf = rf_classifier.predict(test_data)

# result_df = pd.DataFrame({
#     'Id': test_data.index + 1,
#     'Category': Y_pred_rf
# })
# result_df.to_csv('predictions.csv', index=False)

In [None]:
file_path1 = "predictions_knn_10.csv"
file_path2 = "predictions_man_10.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_knn_10.csv but not in predictions_man_10.csv
      Id  Category
0      1         9
1      2         9
2      3         3
3      4        30
4      5         6
..   ...       ...
421  422         1
422  423         1
423  424         1
424  425         1
425  426         2

[848 rows x 2 columns]

Rows in predictions_man_10.csv but not in predictions_knn_10.csv
      Id  Category
0      1         6
1      2         5
2      3         1
3      4         1
4      5         1
..   ...       ...
421  422         3
422  423         3
423  424        24
424  425         3
425  426        33

[848 rows x 2 columns]


In [None]:
file_path1 = "predictions1.csv"
file_path2 = "predictions_knn_1.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions1.csv but not in predictions_knn_1.csv
      Id  Category
8      9         1
9     10         0
16    17         1
23    24         1
27    28         0
..   ...       ...
328  329         3
353  354         5
386  387         8
391  392        15
420  421         2

[90 rows x 2 columns]

Rows in predictions_knn_1.csv but not in predictions1.csv
      Id  Category
8      9         5
9     10         1
16    17         2
23    24         5
27    28         1
..   ...       ...
328  329         1
353  354         1
386  387         2
391  392         0
420  421         0

[90 rows x 2 columns]
