In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer

train_data = pd.read_csv("iith_foml_2023_train.csv")
#train_data = train_data.fillna(train_data.mean())
columns_with_missing = train_data.columns[train_data.isnull().any()].tolist()

imputer = KNNImputer(n_neighbors=7)
train_data_imputed = train_data.copy()
train_data_imputed[columns_with_missing] = imputer.fit_transform(train_data[columns_with_missing])
# changing the data
# train_data.to_csv("train.csv", index=False)
# X_train = train_data.iloc[:, :-1]
# Y_train = train_data.iloc[:, -1]
X_train = train_data_imputed.iloc[:, :-1]
Y_train = train_data_imputed.iloc[:, -1]
X_train_train, X_train_test, Y_train_train, Y_train_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)

In [2]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train, Y_train = ros.fit_resample(X_train, Y_train)
X_train_train, Y_train_train = ros.fit_resample(X_train_train,Y_train_train)

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize a dictionary to store accuracy scores for different k values
accuracy_scores = {}
# Test k values from 5 to 19
for k in range(1, 30):
    # Create and train the KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    knn_classifier.fit(X_train_train, Y_train_train)

    # Make predictions on the test set
    Y_train_pred_knn = knn_classifier.predict(X_train_test)

    # Calculate accuracy and store it in the dictionary
    accuracy = accuracy_score(Y_train_test, Y_train_pred_knn)
    accuracy_scores[k] = accuracy

    print(f"KNN (k={k}) Accuracy: {accuracy * 100:.2f}%")

# Find the k value with the highest accuracy
best_k = max(accuracy_scores, key=accuracy_scores.get)
print(f"Best K value: {best_k} with accuracy {accuracy_scores[best_k] * 100:.2f}%")

KNN (k=1) Accuracy: 74.58%
KNN (k=2) Accuracy: 72.58%
KNN (k=3) Accuracy: 72.91%
KNN (k=4) Accuracy: 72.58%
KNN (k=5) Accuracy: 72.24%
KNN (k=6) Accuracy: 70.57%
KNN (k=7) Accuracy: 69.90%
KNN (k=8) Accuracy: 69.90%
KNN (k=9) Accuracy: 68.56%
KNN (k=10) Accuracy: 67.89%
KNN (k=11) Accuracy: 68.90%
KNN (k=12) Accuracy: 69.57%
KNN (k=13) Accuracy: 68.56%
KNN (k=14) Accuracy: 66.22%
KNN (k=15) Accuracy: 65.22%
KNN (k=16) Accuracy: 64.55%
KNN (k=17) Accuracy: 62.88%
KNN (k=18) Accuracy: 62.54%
KNN (k=19) Accuracy: 62.88%
KNN (k=20) Accuracy: 63.55%
KNN (k=21) Accuracy: 62.88%
KNN (k=22) Accuracy: 63.21%
KNN (k=23) Accuracy: 61.87%
KNN (k=24) Accuracy: 62.21%
KNN (k=25) Accuracy: 61.20%
KNN (k=26) Accuracy: 60.20%
KNN (k=27) Accuracy: 59.20%
KNN (k=28) Accuracy: 59.20%
KNN (k=29) Accuracy: 59.87%
Best K value: 1 with accuracy 74.58%


In [4]:
test_data = pd.read_csv("iith_foml_2023_test.csv")
imputer = KNNImputer(n_neighbors=7)
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)

knn_classifier = KNeighborsClassifier(n_neighbors=9, metric='manhattan')
knn_classifier.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred_knn = knn_classifier.predict(test_data)

result_df = pd.DataFrame({
    'Id': test_data.index + 1,
    'Category': Y_pred_knn
})
result_df.to_csv('withoutstd.csv', index=False)


ValueError: Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:

# Y_pred_rf = rf_classifier.predict(test_data)

# result_df = pd.DataFrame({
#     'Id': test_data.index + 1,
#     'Category': Y_pred_rf
# })
# result_df.to_csv('predictions.csv', index=False)

In [None]:
file_path1 = "without.csv"
file_path2 = "predictions_man_9.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_man_9.csv but not in predictions_man_11.csv
      Id  Category
39    40         0
98    99         2
102  103         0
111  112         2
235  236         2
286  287         0
333  334         0
345  346         2
39    40         2
98    99         5
102  103         1
111  112         5
235  236         0
286  287         2
333  334         1
345  346         0

Rows in predictions_man_11.csv but not in predictions_man_9.csv
      Id  Category
39    40         2
98    99         5
102  103         1
111  112         5
235  236         0
286  287         2
333  334         1
345  346         0
39    40         0
98    99         2
102  103         0
111  112         2
235  236         2
286  287         0
333  334         0
345  346         2


In [None]:
file_path1 = "predictions_man.csv"
file_path2 = "1.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_man.csv but not in 1.csv
      Id  Category
14    15         1
15    16         5
16    17        13
21    22         2
25    26         2
..   ...       ...
388  389        14
391  392         0
417  418         1
420  421         0
425  426        17

[114 rows x 2 columns]

Rows in 1.csv but not in predictions_man.csv
      Id  Category
14    15         2
15    16         0
16    17         8
21    22         8
25    26         0
..   ...       ...
388  389         1
391  392         2
417  418         0
420  421         2
425  426         0

[114 rows x 2 columns]
