In [1]:
import pandas as pd
import numpy as np

In [2]:
test_data = pd.read_csv('test_Naver_noclass.csv')
train_data = pd.read_csv('train_Naver.csv')

df_test = test_data.copy()
df_train = train_data.copy()

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(
    df_train.iloc[:, :-1], df_train.iloc[:, -1], test_size=0.2, random_state=17
)
X_test = df_test

In [5]:
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [6]:
clf = neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance', metric="cosine")
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_val)

print("KNN accuracy: ", accuracy_score(Y_val, Y_pred))
print("KNN f1 score: ", f1_score(Y_val, Y_pred, average = "weighted"))
print("KNN confusion matrix: ")
print(confusion_matrix(Y_val, Y_pred))

KNN accuracy:  0.875
KNN f1 score:  0.8779220779220778
KNN confusion matrix: 
[[14  2]
 [ 5 35]]


In [7]:
from sklearn.linear_model import LogisticRegression

regressor = LogisticRegression()
regressor.fit(X_train, Y_train)

pred = regressor.predict(X_val)
print("LR classification accuracy", accuracy_score(Y_val, pred))
print("LR f1 score", f1_score(Y_val, pred, average="macro"))
print("LR classification confusion matrix: ")
print(confusion_matrix(Y_val, pred))

LR classification accuracy 0.8571428571428571
LR f1 score 0.8310708898944192
LR classification confusion matrix: 
[[13  3]
 [ 5 35]]


In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': np.linspace(0.1, 100, num=7),
                'gamma': np.linspace(0.1, 10, num=7),
                'kernel': ['rbf', 'linear', 'sigmoid'],
             'degree': [2]}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# grid.fit(X_train, Y_train)

In [9]:
# print(grid.best_params_)
# print(grid.best_estimator_)

# print("SVM accuracy: ", accuracy_score(Y_val, grid.predict(X_val)))
# print("SVM f1 score: ", f1_score(Y_val, grid.predict(X_val), average='weighted'))
# print("SVM confusion matrix: ")
# print(confusion_matrix(Y_val, grid.predict(X_val)))

In [10]:
from sklearn.ensemble import RandomForestClassifier  

In [11]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomForestClassifier(n_estimators=100, random_state=42)

n_estimators = [100,200,300]
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]

random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'min_samples_split': min_samples_split}

rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits




RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'sqrt'],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300]},
                   random_state=42, verbose=2)

In [12]:
print(rf_random.best_params_)
print(rf_random.best_estimator_)
print("RF accuracy: ", accuracy_score(Y_val, rf_random.predict(X_val)))
print("RF f1 score: ", f1_score(Y_val, rf_random.predict(X_val), average='weighted'))
print("RF confusion matrix: ")
print(confusion_matrix(Y_val, rf_random.predict(X_val)))

{'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'auto'}
RandomForestClassifier(n_estimators=300, random_state=42)
RF accuracy:  0.8035714285714286
RF f1 score:  0.801615747852307
RF confusion matrix: 
[[10  6]
 [ 5 35]]


In [13]:
pred = regressor.predict(X_test)
final_res = pd.concat([test_data, pd.DataFrame(pred, columns=['class'])], axis=1)
final_res.to_csv('test_Naver.csv')