In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer

train_data = pd.read_csv("iith_foml_2023_train.csv")
#train_data = train_data.fillna(train_data.mean())
columns_with_missing = train_data.columns[train_data.isnull().any()].tolist()

imputer = KNNImputer(n_neighbors=7)
train_data_imputed = train_data.copy()
train_data_imputed[columns_with_missing] = imputer.fit_transform(train_data[columns_with_missing])
# changing the data
# train_data.to_csv("train.csv", index=False)
# X_train = train_data.iloc[:, :-1]
# Y_train = train_data.iloc[:, -1]
X_train = train_data_imputed.iloc[:, :-1]
Y_train = train_data_imputed.iloc[:, -1]
X_train_train, X_train_test, Y_train_train, Y_train_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)

In [19]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train, Y_train = ros.fit_resample(X_train, Y_train)
X_train_train, Y_train_train = ros.fit_resample(X_train_train,Y_train_train)

In [20]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_train_scaled = scaler.transform(X_train_train)
X_train_test_scaled = scaler.transform(X_train_test)

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize a dictionary to store accuracy scores for different k values
accuracy_scores = {}
# Test k values from 5 to 19
for k in range(1, 30):
    # Create and train the KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(X_train_train_scaled, Y_train_train)

    # Make predictions on the test set
    Y_train_pred_knn = knn_classifier.predict(X_train_test_scaled)

    # Calculate accuracy and store it in the dictionary
    accuracy = accuracy_score(Y_train_test, Y_train_pred_knn)
    accuracy_scores[k] = accuracy

    print(f"KNN (k={k}) Accuracy: {accuracy * 100:.2f}%")

# Find the k value with the highest accuracy
best_k = max(accuracy_scores, key=accuracy_scores.get)
print(f"Best K value: {best_k} with accuracy {accuracy_scores[best_k] * 100:.2f}%")

KNN (k=1) Accuracy: 79.26%
KNN (k=2) Accuracy: 77.59%
KNN (k=3) Accuracy: 75.92%
KNN (k=4) Accuracy: 75.25%
KNN (k=5) Accuracy: 72.58%
KNN (k=6) Accuracy: 71.91%
KNN (k=7) Accuracy: 71.57%
KNN (k=8) Accuracy: 71.57%
KNN (k=9) Accuracy: 69.23%
KNN (k=10) Accuracy: 68.90%
KNN (k=11) Accuracy: 68.90%
KNN (k=12) Accuracy: 68.23%
KNN (k=13) Accuracy: 67.22%
KNN (k=14) Accuracy: 67.22%
KNN (k=15) Accuracy: 65.89%
KNN (k=16) Accuracy: 65.89%
KNN (k=17) Accuracy: 64.21%
KNN (k=18) Accuracy: 65.22%
KNN (k=19) Accuracy: 62.88%
KNN (k=20) Accuracy: 62.54%
KNN (k=21) Accuracy: 62.88%
KNN (k=22) Accuracy: 63.21%
KNN (k=23) Accuracy: 58.53%
KNN (k=24) Accuracy: 58.19%
KNN (k=25) Accuracy: 58.19%
KNN (k=26) Accuracy: 57.53%
KNN (k=27) Accuracy: 57.19%
KNN (k=28) Accuracy: 58.19%
KNN (k=29) Accuracy: 57.53%
Best K value: 1 with accuracy 79.26%


In [22]:
test_data = pd.read_csv("iith_foml_2023_test.csv")
imputer = KNNImputer(n_neighbors=7)
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)
X_test_scaled = scaler.transform(test_data_imputed)
print(X_test_scaled)

knn_classifier = KNeighborsClassifier(n_neighbors=10)
knn_classifier.fit(X_train_scaled, Y_train)

# Make predictions on the test set
Y_pred_knn = knn_classifier.predict(X_test_scaled)

result_df = pd.DataFrame({
    'Id': test_data.index + 1,
    'Category': Y_pred_knn
})
result_df.to_csv('predictions_E_10.csv', index=False)


[[-1.29477388  0.934571   -1.022801   ... -0.29613803 -1.29316958
   0.3595847 ]
 [-1.58915921 -0.70367918 -1.32776616 ... -1.32432988 -1.58685369
   1.11999299]
 [ 1.0178749   0.3884876   1.1831137  ... -0.83471471  1.01397051
   0.44189424]
 ...
 [ 0.84813921 -0.29411664  1.04079662 ...  1.19718824  0.84463913
  -1.85930705]
 [ 1.75781642 -0.70367918  2.15900223 ... -1.20192609  1.76537852
   0.75793906]
 [ 1.85594487 -0.29411664  2.31148481 ... -1.0795223   1.86327322
   0.8684802 ]]


In [23]:

# Y_pred_rf = rf_classifier.predict(test_data)

# result_df = pd.DataFrame({
#     'Id': test_data.index + 1,
#     'Category': Y_pred_rf
# })
# result_df.to_csv('predictions.csv', index=False)

In [24]:
file_path1 = "predictions_E_10.csv"
file_path2 = "predictions_E_9.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_E_10.csv but not in predictions_E_9.csv
      Id  Category
39    40         2
57    58         0
119  120         1
164  165         2
211  212         2
235  236         0
244  245         6
345  346         1
39    40         0
57    58         2
119  120         0
164  165         1
211  212         0
235  236         2
244  245         1
345  346         0

Rows in predictions_E_9.csv but not in predictions_E_10.csv
      Id  Category
39    40         0
57    58         2
119  120         0
164  165         1
211  212         0
235  236         2
244  245         1
345  346         0
39    40         2
57    58         0
119  120         1
164  165         2
211  212         2
235  236         0
244  245         6
345  346         1


In [25]:
file_path1 = "predictions_man.csv"
file_path2 = "1.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_man.csv but not in 1.csv
      Id  Category
14    15         1
15    16         5
16    17        13
21    22         2
25    26         2
..   ...       ...
388  389        14
391  392         0
417  418         1
420  421         0
425  426        17

[114 rows x 2 columns]

Rows in 1.csv but not in predictions_man.csv
      Id  Category
14    15         2
15    16         0
16    17         8
21    22         8
25    26         0
..   ...       ...
388  389         1
391  392         2
417  418         0
420  421         2
425  426         0

[114 rows x 2 columns]
