In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from tqdm import tqdm 

In [2]:
data_train = pd.read_csv('../ML/datasets/adult_data_train.csv')
data = pd.read_csv('../ML/datasets/adult_data_reserved.csv')

In [3]:
data_train.replace('?', pd.NA, inplace=True)
data.replace('?', pd.NA, inplace=True)

In [4]:
data.shape

(6513, 14)

In [5]:
data_train.dropna(inplace=True)
data_train.reset_index(drop=True, inplace=True)
# data_train.fillna(data.mode().iloc[0], inplace=True)
data.shape

(6513, 14)

In [6]:
data.fillna(data.mode().iloc[0], inplace=True)

In [12]:
for column in tqdm(data.columns):
    # print(f'Processing column: {column}')
    for i, value in enumerate(data[column]):
      if value not in data_train[column].values:
        most_frequent_value = data[column].mode()[0]
        data.at[i, column] = most_frequent_value
data.columns

100%|██████████| 14/14 [00:27<00:00,  1.99s/it]


Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [8]:
data.shape

(6513, 14)

In [13]:
le = LabelEncoder()
str_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for i in str_columns:
	le.fit(data_train[i])
	data_train[i] = le.transform(data_train[i])
for i in str_columns:
	le.fit(data[i])
	data[i] = le.transform(data[i])

In [14]:
X = data_train.drop(columns = 'label')
y = data_train['label']

In [15]:
scaler = MinMaxScaler()
scaler.fit(X)
X_train_scaler = scaler.transform(X)
X_test_scaler = scaler.transform(data)

In [16]:
knn = KNeighborsClassifier()
knn.fit(X_train_scaler, y)
y_pred = knn.predict(X_test_scaler)
y_pred.shape

(6513,)

In [31]:
params = {
	'n_neighbors': np.arange(1, 16), 
	'metric': ['manhattan', 'euclidean'],
	'weights': ['uniform', 'distance']
}
knn_grid = GridSearchCV(knn, params, cv=5, scoring='f1', n_jobs=-1)
knn_grid.fit(X_train_scaler, y)

In [32]:
knn_grid.best_params_

{'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}

In [35]:
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train_scaler, y)
y_pred = knn.predict(X_test_scaler)
y_pred.shape

(6513,)

In [34]:
with open('../ML/outputs/file_adultReserve.txt', 'w') as f:
    f.write(f'{list(y_pred)}')