In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import math
import seaborn as sn
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.calibration import LabelEncoder
from catboost import CatBoostClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the three dataset splits from CSV files in the working directory.
train, valid, test = map(pd.read_csv, ('train.csv', 'valid.csv', 'test.csv'))

In [4]:
# Work on copies so the frames loaded from disk stay untouched.
label2_train, label2_valid, label2_test = (
    frame.copy() for frame in (train, valid, test)
)

In [5]:
# Discard rows whose label_2 target is missing (train and validation only;
# the test frame is not filtered here).
label2_train, label2_valid = (
    split.dropna(subset=['label_2'])
    for split in (label2_train, label2_valid)
)

In [6]:
# Separate the four label columns (targets) from everything else (features).
label_columns = ['label_1', 'label_2', 'label_3', 'label_4']

X_train, y_train = label2_train.drop(columns=label_columns), label2_train[label_columns]
X_val, y_val = label2_valid.drop(columns=label_columns), label2_valid[label_columns]

# For the test frame only the 'ID' column is removed to obtain the features.
X_test = label2_test.drop(columns=['ID'])

In [7]:
# Fit the scaler on the training features only, then apply the same
# transformation to every split.
scaler = RobustScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_val, X_test)
)

In [None]:
# Mean 2-fold cross-validated accuracy of SVC(C=100) on the scaled features.
cross_val_score(
    estimator=SVC(C=100),
    X=X_train_scaled,
    y=y_train['label_2'],
    scoring='accuracy',
    cv=2,
).mean()

In [None]:
# Mean 2-fold cross-validated accuracy of SVC(C=1000, gamma=0.01) on the
# scaled features.
cross_val_score(
    estimator=SVC(C=1000, gamma=0.01),
    X=X_train_scaled,
    y=y_train['label_2'],
    scoring='accuracy',
    cv=2,
).mean()

In [32]:
# PCA configured with a fractional n_components (0.95) and the full SVD solver.
# The projection is learned from the scaled training features and then reused
# for the validation and test splits.
pca = PCA(svd_solver='full', n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca, X_test_pca = (
    pca.transform(scaled) for scaled in (X_val_scaled, X_test_scaled)
)

In [8]:
# The experimental flag has to be imported before HalvingGridSearchCV itself.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV

# Successive-halving search over C and gamma for an RBF-kernel SVC on the
# scaled training features, targeting label_2.
halving_cv = HalvingGridSearchCV(
    SVC(),
    {
        'C': [1, 10, 100, 1000, 1500, 2000],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto', 0.1, 0.01, 0.001],
    },
    # Fixed seed: the halving iterations evaluate candidates on random
    # subsamples of the data, so without a seed the selected parameters and
    # score can change from run to run.
    random_state=42,
)
halving_cv.fit(X_train_scaled, y_train['label_2'])

print("Best Params", halving_cv.best_params_)
print("Best CV Score", halving_cv.best_score_)

Best Params {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Best CV Score 0.5852988403211419


In [None]:
from sklearn.metrics import classification_report

def get_score(model, X_train, y_train, X_val, y_val, verbose = False):
    """Fit ``model`` on the training data and return validation accuracy in percent.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        The estimator to evaluate. It is fitted in place.
    X_train, y_train
        Features and targets passed to ``model.fit``.
    X_val, y_val
        Features passed to ``model.predict`` and the targets the predictions
        are compared against.
    verbose : bool, default False
        When true, also print a classification report for the validation
        predictions.

    Returns
    -------
    The value of ``accuracy_score(y_val, predictions)`` multiplied by 100.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    if verbose:
        report = classification_report(y_val, val_predictions)
        print(report)

    accuracy_fraction = accuracy_score(y_val, val_predictions)
    return accuracy_fraction * 100

In [None]:
# Validation accuracy (%) of SVC(C=1000) trained on the PCA-reduced features.
get_score(
    model=SVC(C=1000),
    X_train=X_train_pca,
    y_train=y_train['label_2'],
    X_val=X_val_pca,
    y_val=y_val['label_2'],
)

In [None]:
# Mean 5-fold cross-validation score (as a percentage) of SVC(C=1000) on the
# PCA-reduced training features.
cross_val_score(
    estimator=SVC(C=1000),
    X=X_train_pca,
    y=y_train['label_2'],
    cv=5,
).mean() * 100

In [None]:
# Mean 5-fold cross-validation score (as a fraction, not a percentage) of a
# default-configured CatBoostClassifier on the PCA-reduced training features.
cross_val_score(
    estimator=CatBoostClassifier(),
    X=X_train_pca,
    y=y_train['label_2'],
    cv=5,
).mean()

In [None]:
# Same evaluation as the earlier SVC(C=1000) cell: train on the PCA-reduced
# training split, report validation accuracy (%).
get_score(
    model=SVC(C=1000),
    X_train=X_train_pca,
    y_train=y_train['label_2'],
    X_val=X_val_pca,
    y_val=y_val['label_2'],
)

In [None]:
# Validation accuracy (%) of a GPU-trained CatBoost model on the PCA-reduced
# features.
get_score(
    model=CatBoostClassifier(
        task_type="GPU",
        devices='0:1',
        learning_rate=0.1,
        max_depth=6,
    ),
    X_train=X_train_pca,
    y_train=y_train['label_2'],
    X_val=X_val_pca,
    y_val=y_val['label_2'],
)

In [None]:
# Mean 5-fold cross-validation score (%) of a GPU-trained MultiClass CatBoost
# model on the original (unscaled, non-PCA) training features.
cross_val_score(
    estimator=CatBoostClassifier(
        task_type="GPU",
        devices='0:1',
        loss_function='MultiClass',
        max_depth=6,
    ),
    X=X_train,
    y=y_train['label_2'],
    cv=5,
).mean() * 100

In [None]:
# Mean 5-fold cross-validation score (%) of SVC(C=1000, gamma=0.001).
# Evaluated on the robust-scaled features rather than the raw X_train: an SVC
# is sensitive to feature scale, and the gamma value used here comes from the
# hyperparameter search that was run on X_train_scaled.
cross_val_score(
    SVC(C=1000, gamma=0.001),
    X_train_scaled, y_train['label_2'], cv=5, verbose=3).mean() * 100

In [None]:
# from scipy.stats import randint
# from sklearn.model_selection import RandomizedSearchCV

# param_dist = { "learning_rate": np.linspace(0,0.2,5),
#                "max_depth": randint(3, 10)}
               
# #Instantiate RandomizedSearchCV object
# rscv = RandomizedSearchCV(CatBoostClassifier(task_type="GPU",
#                            devices='0:1'), param_dist, scoring='accuracy', cv =5, verbose=3)

# #Fit the model
# rscv.fit(X_train_pca, y_train['label_3'])

# print(rscv.best_params_)

In [None]:
# Mean 5-fold cross-validation score (%) of a GPU-trained CatBoost model
# (1000 iterations) on the original training features.
cross_val_score(
    CatBoostClassifier(
        task_type="GPU",
        # Device spec without the stray trailing comma, matching the '0:1'
        # used by the other GPU CatBoost cells in this notebook.
        devices='0:1',
        learning_rate=0.1,
        max_depth=6,
        iterations=1000,
    ),
    X_train, y_train['label_2'], cv=5, verbose=3
).mean() * 100

In [None]:
# Validation accuracy (%) of a GPU-trained CatBoost model (learning_rate=0.15)
# on the PCA-reduced features.
get_score(
    CatBoostClassifier(
        task_type="GPU",
        # Device spec without the stray trailing comma, matching the '0:1'
        # used by the other GPU CatBoost cells in this notebook.
        devices='0:1',
        learning_rate=0.15,
    ),
    X_train_pca, y_train['label_2'],
    X_val_pca, y_val['label_2']
)

In [None]:
# Validation accuracy (%) of SVC(C=1000, gamma=0.1) trained on the PCA-reduced
# features.
get_score(
    model=SVC(C=1000, gamma=0.1),
    X_train=X_train_pca,
    y_train=y_train['label_2'],
    X_val=X_val_pca,
    y_val=y_val['label_2'],
)