In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import math
import seaborn as sn
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.calibration import LabelEncoder
from catboost import CatBoostClassifier


In [2]:
import warnings
# Install a global "ignore" filter: every warning category is suppressed for
# the remainder of the kernel session.
# NOTE(review): this also hides any deprecation or convergence warnings the
# libraries used below may emit — consider narrowing the filter by category.
warnings.filterwarnings('ignore')


In [3]:
# Read the three dataset splits (train / valid / test) from CSV files located
# in the current working directory, in that order.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'valid.csv', 'test.csv')
)


In [4]:
# Per-column count of missing values in the training frame.
train.isna().sum()


feature_1        0
feature_2        0
feature_3        0
feature_4        0
feature_5        0
              ... 
feature_768      0
label_1          0
label_2        480
label_3          0
label_4          0
Length: 772, dtype: int64

In [4]:
# Split every labelled frame into its feature matrix and its four target columns.
label_columns = ['label_1', 'label_2', 'label_3', 'label_4']

X_train, y_train = train.drop(columns=label_columns), train[label_columns]
X_val, y_val = valid.drop(columns=label_columns), valid[label_columns]

# For the test frame only the 'ID' column is removed.
X_test = test.drop(columns=['ID'])


In [5]:
# Learn the standardisation statistics from the training features only, then
# apply that same fitted transform to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


### Label_1

In [None]:
# Bar chart of how many training rows fall into each label_1 value.
fig, ax = plt.subplots(figsize=(18, 6))
sn.countplot(data=y_train, x='label_1', color='teal', ax=ax)
ax.set_xlabel('Speaker', fontsize=12)


In [None]:
# Number of distinct label_1 values in the training targets
# (dropna=False keeps a missing value counted as its own entry, like unique()).
y_train['label_1'].nunique(dropna=False)


In [None]:
from sklearn.metrics import classification_report

def get_score(model, X_train, y_train, X_val, y_val, verbose = False):
    """Fit ``model`` on the training split and score it on the validation split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so the caller's object is trained afterwards.
        X_train: Training features handed to ``model.fit``.
        y_train: Training targets handed to ``model.fit``.
        X_val: Validation features handed to ``model.predict``.
        y_val: True validation targets the predictions are compared with.
        verbose: When truthy, print ``classification_report`` for the
            validation predictions before returning.

    Returns:
        The value of ``accuracy_score(y_val, predictions)``.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    if verbose:
        report = classification_report(y_val, val_predictions)
        print(report)

    val_accuracy = accuracy_score(y_val, val_predictions)
    return val_accuracy


In [7]:
# Fit PCA on the standardised training features (n_components=0.95 with the
# full SVD solver) and project the validation and test features with the same
# fitted components.
pca = PCA(svd_solver='full', n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca, X_test_pca = (
    pca.transform(scaled) for scaled in (X_val_scaled, X_test_scaled)
)


In [6]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Successive-halving grid search over RBF-SVC hyper-parameters for label_1,
# run on the standardised (not PCA-reduced) training features.
svc_param_grid = {
    'C': [1, 10, 100, 1000, 1500, 2000],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto', 0.1, 0.01, 0.001],
}

# The early halving rounds evaluate candidates on random subsamples of the
# data; pinning random_state makes that subsampling — and therefore the
# reported best parameters and score — repeatable across reruns.
halving_cv = HalvingGridSearchCV(SVC(), svc_param_grid, random_state=42)
halving_cv.fit(X_train_scaled, y_train['label_1'])

print("Best Params", halving_cv.best_params_)
print("Best CV Score", halving_cv.best_score_)


Best Params {'C': 1500, 'gamma': 0.001, 'kernel': 'rbf'}
Best CV Score 0.9354261662574535


In [None]:
# Validation accuracy of a 10-nearest-neighbour classifier on the PCA features.
# NOTE(review): this cell sits under the "Label_1" heading but trains and
# scores on label_3 — confirm the target column is intended.
knn_model = KNeighborsClassifier(n_neighbors=10)
get_score(knn_model, X_train_pca, y_train['label_3'], X_val_pca, y_val['label_3'])


In [None]:
# Mean 5-fold cross-validation score of a default SVC on the PCA features.
svc_fold_scores = cross_val_score(
    SVC(), X_train_pca, y_train['label_1'], cv=5, verbose=3
)
svc_fold_scores.mean()


In [None]:
# Random-forest baseline for label_1 on the PCA-reduced features.
# random_state is pinned so the forest's random sampling — and hence the
# reported validation accuracy — is reproducible from one run to the next.
get_score(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train_pca, y_train['label_1'],
    X_val_pca, y_val['label_1']
)


In [None]:
from xgboost import XGBClassifier


# Validation accuracy of a GPU-trained XGBoost classifier on the PCA features.
# NOTE(review): the `- 1` shift presumes label_1 values start at 1 and are
# contiguous, so that they become 0..num_class-1 — verify against the data.
# NOTE(review): recent XGBoost releases reportedly deprecate `gpu_hist` /
# `gpu_id` in favour of `device` — confirm against the installed version.
n_label_1_classes = len(y_train['label_1'].unique())
xgb_model = XGBClassifier(
    num_class=n_label_1_classes,
    tree_method='gpu_hist',
    gpu_id=0,
)
get_score(
    xgb_model,
    X_train_pca, y_train['label_1'] - 1,
    X_val_pca, y_val['label_1'] - 1,
)


In [None]:
from sklearn.ensemble import AdaBoostClassifier


# AdaBoost baseline for label_1 on the PCA-reduced features.
# random_state is pinned so the seeds handed to the boosted base estimators
# are fixed and the reported validation accuracy is reproducible.
get_score(
    AdaBoostClassifier(n_estimators=50,
                         learning_rate=1,
                         random_state=42),
    X_train_pca, y_train['label_1'],
    X_val_pca, y_val['label_1']
)


In [None]:
# Correlation of every non-label column with label_1, strongest positive first.
label = 'label_1'
corr_matrix = train.corr()

# Pull the label's column out of the full correlation matrix by position.
label_col_id = corr_matrix.columns.get_loc(label)
corr_with_label = corr_matrix.iloc[:, label_col_id]

# Dropping every row whose name contains 'label' also drops `label` itself,
# leaving only the feature rows.
is_feature_row = ~corr_with_label.index.str.contains('label')
corr_fearures = corr_with_label[is_feature_row]
corr_fearures.sort_values(ascending=False)


In [None]:
# Validation accuracy of a GPU CatBoost multi-class model on the PCA features.
catboost_gpu = CatBoostClassifier(
    loss_function='MultiClass',
    task_type="GPU",
    devices='0:1',
)
get_score(catboost_gpu, X_train_pca, y_train['label_1'], X_val_pca, y_val['label_1'])


In [None]:
# Mean 5-fold cross-validation score (as a percentage) of GPU CatBoost with
# learning_rate=0.15, trained on the unscaled, un-projected features.
catboost_cv_scores = cross_val_score(
    CatBoostClassifier(
        loss_function='MultiClass',
        task_type="GPU",
        devices='0:1',
        learning_rate=0.15,
    ),
    X_train, y_train['label_1'], cv=5,
)
catboost_cv_scores.mean() * 100


In [None]:
# Mean 5-fold cross-validation score (as a percentage) of a random forest on
# the raw features. random_state is pinned so the score is reproducible.
# NOTE(review): this cell sits under the "Label_1" heading but targets
# label_3 — confirm the target column is intended.
cross_val_score(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train, y_train['label_3'], cv=5
).mean() * 100


In [None]:
# Mean 5-fold cross-validation score of an SVC with C=50 on the PCA features.
svc_c50_fold_scores = cross_val_score(
    SVC(C=50), X_train_pca, y_train['label_1'], cv=5, verbose=3
)
svc_c50_fold_scores.mean()


In [None]:
# Unfitted SVC (RBF kernel, C=100) kept as the chosen model.
# NOTE(review): no later cell visible here references this variable.
best_model_layer1 = SVC(C=100, kernel='rbf')


In [None]:
# Random-forest baseline for label_1 on the raw (unscaled, un-projected)
# features. random_state is pinned so the reported validation accuracy is
# reproducible across reruns.
get_score(
    RandomForestClassifier(random_state=42),
    X_train, y_train['label_1'],
    X_val, y_val['label_1']
)


In [None]:
# Two-fold cross-validation of an SVC with C=1000 on the standardised
# features; the per-fold scores are displayed as returned.
cross_val_score(
    estimator=SVC(C=1000),
    X=X_train_scaled,
    y=y_train['label_1'],
    cv=2,
    verbose=3,
)


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import pandas as pd
import numpy as np


# Fit the label_1 model on the data read from '../Dataset/layer_12_train.csv'.
train = pd.read_csv('../Dataset/layer_12_train.csv')

target_columns = ['label_1', 'label_2', 'label_3', 'label_4']
X_train = train.drop(columns=target_columns)
y_train = train[target_columns]

# Standardise -> PCA (n_components=0.95, full SVD) -> RBF SVC.
scaler = StandardScaler()
pca = PCA(svd_solver='full', n_components=0.95)
svc = SVC(kernel='rbf', gamma=0.001, C=1500)
pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('svc', svc)])

# NOTE(review): predictions are produced for the very rows the pipeline was
# fitted on — confirm that in-sample predictions are what is wanted.
y_pred = pipeline.fit(X_train, y_train['label_1']).predict(X_train)


In [3]:
# Write the current predictions as a single-column CSV without the row index.
label_1_predictions = pd.DataFrame(y_pred)
label_1_predictions.to_csv('layer_12_train_label_1.csv', index=False)


In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import pandas as pd
import numpy as np


# Fit the label_2 model on the data read from '../Dataset/layer_12_train.csv'.
train = pd.read_csv('../Dataset/layer_12_train.csv')

# Only rows whose label_2 is present can be used for this target.
train_label_2 = train.dropna(subset=['label_2'])

target_columns = ['label_1', 'label_2', 'label_3', 'label_4']
X_train = train_label_2.drop(columns=target_columns)
y_train = train_label_2[target_columns]

# Standardise -> PCA (n_components=0.95, full SVD) -> RBF SVC.
scaler = StandardScaler()
pca = PCA(svd_solver='full', n_components=0.95)
svc = SVC(kernel='rbf', gamma=0.001, C=100)
pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('svc', svc)])

# NOTE(review): predictions cover only the retained rows and are made on the
# same rows the pipeline was fitted on; the original row positions are not
# carried along with y_pred.
y_pred = pipeline.fit(X_train, y_train['label_2']).predict(X_train)

pd.DataFrame(y_pred).shape


(28040, 1)

In [2]:
# Write the current predictions as a single-column CSV without the row index.
label_2_predictions = pd.DataFrame(y_pred)
label_2_predictions.to_csv('layer_12_train_label_2.csv', index=False)


In [3]:

# Rebuild the feature/target frames from the full `train` frame (the previous
# cell had narrowed them to the rows where label_2 is present).
X_train = train.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1)
y_train = train[['label_1', 'label_2', 'label_3', 'label_4']]

# Construct the pipeline explicitly instead of silently re-fitting whatever
# `pipeline` object the previously executed cell left in the kernel. The
# hyper-parameters equal those of the label_2 cell's pipeline, which is what
# this cell re-used when the notebook is run top to bottom.
scaler = StandardScaler()
pca = PCA(n_components=0.95, svd_solver = 'full')
svc = SVC(C=100, gamma=0.001, kernel='rbf')

pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('svc', svc)
])

# NOTE(review): predictions are made on the same rows used for fitting.
pipeline.fit(X_train, y_train['label_3'])
y_pred = pipeline.predict(X_train)

pd.DataFrame(y_pred).shape


(28520, 1)

In [4]:
# Write the current predictions as a single-column CSV without the row index.
label_3_predictions = pd.DataFrame(y_pred)
label_3_predictions.to_csv('layer_12_train_label_3.csv', index=False)


In [5]:
# label_4 model: same scaler + PCA front end, followed by an SVC with
# C=1000, gamma='auto' and balanced class weights. It is fitted on the
# X_train / y_train frames currently in the kernel.
scaler = StandardScaler()
pca = PCA(svd_solver='full', n_components=0.95)
svc = SVC(class_weight='balanced', gamma='auto', C=1000)
pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('svc', svc)])

# NOTE(review): predictions are made on the same rows used for fitting.
y_pred = pipeline.fit(X_train, y_train['label_4']).predict(X_train)

pd.DataFrame(y_pred).shape


(28520, 1)

In [6]:
# Write the current predictions as a single-column CSV without the row index.
label_4_predictions = pd.DataFrame(y_pred)
label_4_predictions.to_csv('layer_12_train_label_4.csv', index=False)
