In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer

train_data = pd.read_csv("iith_foml_2023_train.csv")
#train_data = train_data.fillna(train_data.mean())
columns_with_missing = train_data.columns[train_data.isnull().any()].tolist()

imputer = KNNImputer(n_neighbors=5)
train_data_imputed = train_data.copy()
train_data_imputed[columns_with_missing] = imputer.fit_transform(train_data[columns_with_missing])
# changing the data
# train_data.to_csv("train.csv", index=False)
# X_train = train_data.iloc[:, :-1]
# Y_train = train_data.iloc[:, -1]
X_train = train_data_imputed.iloc[:, :-1]
Y_train = train_data_imputed.iloc[:, -1]
X_train_train, X_train_test, Y_train_train, Y_train_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)


In [2]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_train_scaled = scaler.transform(X_train_train)
X_train_test_scaled = scaler.transform(X_train_test)

In [3]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=1,metric='manhattan')
rf_classifier = RandomForestClassifier(random_state=42)
xgb_classifier = GradientBoostingClassifier(random_state=42)

ensemble_classifier = VotingClassifier(estimators=[
    ('knn', knn_classifier),
    ('rf', rf_classifier),
    ('xgb', xgb_classifier)
], voting='hard')

# Fit the ensemble classifier on the training data
ensemble_classifier.fit(X_train_train_scaled, Y_train_train)

# Make predictions on the test set
Y_pred_ensemble = ensemble_classifier.predict(X_train_test_scaled)

# Evaluate the ensemble model
accuracy_ensemble = accuracy_score(Y_train_test, Y_pred_ensemble)
print(f"Ensemble Accuracy: {accuracy_ensemble:.2f}")

Ensemble Accuracy: 0.90


In [6]:
test_data = pd.read_csv("iith_foml_2023_test.csv")
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)
X_test_scaled = scaler.transform(test_data_imputed)

# Fit the ensemble classifier on the training data
ensemble_classifier.fit(X_train_scaled, Y_train)

# Make predictions on the test set
Y_pred_ensemble = ensemble_classifier.predict(X_test_scaled)

result_df = pd.DataFrame({
    'Id': test_data.index + 1,
    'Category': Y_pred_ensemble
})
result_df.to_csv('predictions_ensemble.csv', index=False)


In [None]:

# Y_pred_rf = rf_classifier.predict(test_data)

# result_df = pd.DataFrame({
#     'Id': test_data.index + 1,
#     'Category': Y_pred_rf
# })
# result_df.to_csv('predictions.csv', index=False)

In [None]:
file_path1 = "predictions_ensemble.csv"
file_path2 = "predictions_man.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_ensemble.csv but not in predictions_man.csv
      Id  Category
14    15         2
15    16         0
16    17         2
27    28         0
44    45         0
..   ...       ...
356  357         5
386  387         8
391  392         2
398  399         1
420  421         2

[78 rows x 2 columns]

Rows in predictions_man.csv but not in predictions_ensemble.csv
      Id  Category
14    15         1
15    16         5
16    17        13
27    28         1
44    45         2
..   ...       ...
356  357         0
386  387         2
391  392         0
398  399         4
420  421         0

[78 rows x 2 columns]


In [None]:
file_path1 = "predictions_man.csv"
file_path2 = "1.csv"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Find rows that are in df1 but not in df2
df_diff1 = pd.concat([df1, df2]).drop_duplicates(keep=False)

# Find rows that are in df2 but not in df1
df_diff2 = pd.concat([df2, df1]).drop_duplicates(keep=False)

# Display the differences
print("Rows in", file_path1, "but not in", file_path2)
print(df_diff1)

print("\nRows in", file_path2, "but not in", file_path1)
print(df_diff2)

Rows in predictions_man.csv but not in 1.csv
      Id  Category
14    15         1
15    16         5
16    17        13
21    22         2
25    26         2
..   ...       ...
388  389        14
391  392         0
417  418         1
420  421         0
425  426        17

[114 rows x 2 columns]

Rows in 1.csv but not in predictions_man.csv
      Id  Category
14    15         2
15    16         0
16    17         8
21    22         8
25    26         0
..   ...       ...
388  389         1
391  392         2
417  418         0
420  421         2
425  426         0

[114 rows x 2 columns]
