In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer

train_data = pd.read_csv("iith_foml_2023_train.csv")

class_counts = train_data['Target Variable (Discrete)'].value_counts()

# Identify classes with count less than 10
classes_to_discard = class_counts[class_counts < 10].index

# Discard rows with class labels that have count < 10
train_data = train_data[~train_data['Target Variable (Discrete)'].isin(classes_to_discard)]

#train_data = train_data.fillna(train_data.mean())
columns_with_missing = train_data.columns[train_data.isnull().any()].tolist()

imputer = KNNImputer(n_neighbors=5)
train_data_imputed = train_data.copy()
train_data_imputed[columns_with_missing] = imputer.fit_transform(train_data[columns_with_missing])
# changing the data
# train_data.to_csv("train.csv", index=False)
# X_train = train_data.iloc[:, :-1]
# Y_train = train_data.iloc[:, -1]
X_train = train_data_imputed.iloc[:, :-1]
Y_train = train_data_imputed.iloc[:, -1]
X_train_train, X_train_test, Y_train_train, Y_train_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)


In [2]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_train_scaled = scaler.transform(X_train_train)
X_train_test_scaled = scaler.transform(X_train_test)

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize a dictionary to store accuracy scores for different k values
accuracy_scores = {}
# Test k values from 5 to 19
for k in range(1, 30):
    # Create and train the KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    knn_classifier.fit(X_train_train_scaled, Y_train_train)

    # Make predictions on the test set
    Y_train_pred_knn = knn_classifier.predict(X_train_test_scaled)

    # Calculate accuracy and store it in the dictionary
    accuracy = accuracy_score(Y_train_test, Y_train_pred_knn)
    accuracy_scores[k] = accuracy

    print(f"KNN (k={k}) Accuracy: {accuracy * 100:.2f}%")

# Find the k value with the highest accuracy
best_k = max(accuracy_scores, key=accuracy_scores.get)
print(f"Best K value: {best_k} with accuracy {accuracy_scores[best_k] * 100:.2f}%")

KNN (k=1) Accuracy: 88.19%
KNN (k=2) Accuracy: 86.81%
KNN (k=3) Accuracy: 85.42%
KNN (k=4) Accuracy: 86.81%
KNN (k=5) Accuracy: 84.72%
KNN (k=6) Accuracy: 84.72%
KNN (k=7) Accuracy: 84.03%
KNN (k=8) Accuracy: 82.64%
KNN (k=9) Accuracy: 81.94%
KNN (k=10) Accuracy: 81.25%
KNN (k=11) Accuracy: 82.29%
KNN (k=12) Accuracy: 80.90%
KNN (k=13) Accuracy: 79.51%
KNN (k=14) Accuracy: 80.90%
KNN (k=15) Accuracy: 81.25%
KNN (k=16) Accuracy: 81.60%
KNN (k=17) Accuracy: 80.90%
KNN (k=18) Accuracy: 82.64%
KNN (k=19) Accuracy: 81.60%
KNN (k=20) Accuracy: 82.29%
KNN (k=21) Accuracy: 81.25%
KNN (k=22) Accuracy: 81.94%
KNN (k=23) Accuracy: 81.94%
KNN (k=24) Accuracy: 82.64%
KNN (k=25) Accuracy: 81.60%
KNN (k=26) Accuracy: 80.21%
KNN (k=27) Accuracy: 79.86%
KNN (k=28) Accuracy: 78.47%
KNN (k=29) Accuracy: 78.82%
Best K value: 1 with accuracy 88.19%


In [5]:
test_data = pd.read_csv("iith_foml_2023_test.csv")
imputer = KNNImputer(n_neighbors=5)
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)
X_test_scaled = scaler.transform(test_data_imputed)
print(X_test_scaled)

knn_classifier = KNeighborsClassifier(n_neighbors=1,metric='manhattan')
knn_classifier.fit(X_train_scaled, Y_train)

# Make predictions on the test set
Y_pred_knn = knn_classifier.predict(X_test_scaled)

result_df = pd.DataFrame({
    'Id': test_data.index + 1,
    'Category': Y_pred_knn
})
result_df.to_csv('predictions_knn.csv', index=False)


[[-1.39430703  0.79808147 -1.10940731 ... -0.09545904 -1.39186208
   0.463873  ]
 [-1.66730434 -0.79006055 -1.38962623 ... -1.26631062 -1.66381426
   1.12865727]
 [ 0.75032049  0.2687008   0.91750957 ... -0.70876225  0.7445551
   0.53583183]
 ...
 [ 0.59291664 -0.39302504  0.78674074 ...  1.6050635   0.58775384
  -1.47598527]
 [ 1.43650292 -0.79006055  1.81421012 ... -1.12692353  1.44036069
   0.8121329 ]
 [ 1.52750202 -0.39302504  1.95431958 ... -0.98753644  1.53101142
   0.9087731 ]]


In [15]:

# Y_pred_rf = rf_classifier.predict(test_data)

# result_df = pd.DataFrame({
#     'Id': test_data.index + 1,
#     'Category': Y_pred_rf
# })
# result_df.to_csv('predictions.csv', index=False)

In [6]:
file_path1 = "predictions_knn.csv"
file_path2 = "1.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_knn.csv but not in 1.csv
      Id  Category
16    17         5
21    22         2
27    28         1
45    46         2
50    51         0
..   ...       ...
391  392         0
398  399         1
417  418         1
420  421         0
425  426        17

[98 rows x 2 columns]

Rows in 1.csv but not in predictions_knn.csv
      Id  Category
16    17         8
21    22         8
27    28         0
45    46         4
50    51         4
..   ...       ...
391  392         5
398  399         2
417  418         0
420  421         2
425  426         0

[98 rows x 2 columns]


In [17]:
file_path1 = "predictions_man.csv"
file_path2 = "1.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_man.csv but not in 1.csv
      Id  Category
14    15         1
15    16         5
16    17        13
21    22         2
25    26         2
..   ...       ...
388  389        14
391  392         0
417  418         1
420  421         0
425  426        17

[114 rows x 2 columns]

Rows in 1.csv but not in predictions_man.csv
      Id  Category
14    15         2
15    16         0
16    17         8
21    22         8
25    26         0
..   ...       ...
388  389         1
391  392         2
417  418         0
420  421         2
425  426         0

[114 rows x 2 columns]
