In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler # eliminate outliers
from sklearn import svm
import warnings                   # To ignore the warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the three dataset splits from the working directory.
train, test, valid = (pd.read_csv(path) for path in ('train.csv', 'test.csv', 'valid.csv'))

# Per-column number of missing values, restricted to columns that have any.
null_totals = train.isnull().sum()
missing_counts = null_totals[null_totals > 0]
missing_columns = missing_counts.index

# Report the training-set shape, then every incomplete column with its count.
print("shape of train: ", train.shape)
for column in missing_columns:
    print(f"Column '{column}' has {missing_counts[column]} missing values.")

shape of train:  (28520, 772)
Column 'label_2' has 480 missing values.


Label 2 contains missing values in the training dataset, but given that the dataset has almost 30,000 rows, removing 480 rows with missing values doesn't significantly impact the dataset's overall size or quality.

In [4]:
# Target column names.
L1, L2, L3, L4 = (
    "label_1",  # Speaker ID
    "label_2",  # Speaker age
    "label_3",  # Speaker gender
    "label_4",  # Speaker accent
)
LABELS = [L1, L2, L3, L4]
AGE_LABEL = L2

# Feature column names: feature_1 ... feature_768.
FEATURES = [f'feature_{i}' for i in range(1,769)]

In [5]:
# Work on copies so the frames loaded from disk stay untouched.
train_df, test_df, valid_df = (frame.copy() for frame in (train, test, valid))

train_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768,label_1,label_2,label_3,label_4
0,0.186257,-0.058807,0.024632,-0.163933,-0.146699,0.035889,0.111708,-0.162861,0.028249,-0.098063,...,0.055629,-0.010358,0.125754,0.011648,0.079197,0.093215,45,,1,6
1,0.063431,-0.023597,0.068057,-0.252915,-0.061094,-0.027316,0.135747,-0.168147,0.091236,-0.078473,...,-0.014893,0.071721,0.018918,0.100032,-0.083042,0.088615,45,,1,6
2,0.034962,0.035816,-0.029753,-0.094607,-0.017576,-0.053074,0.040121,-0.007932,0.097872,-0.024042,...,0.012415,0.015215,0.083808,0.031312,-0.056277,0.064702,45,,1,6
3,0.033772,0.085612,0.067488,-0.073953,-0.180646,-0.024512,0.242879,-0.023374,-0.059999,0.002006,...,-0.078246,-0.032903,0.082949,-0.020659,0.082274,-0.050164,45,,1,6
4,0.134305,0.062096,0.10692,-0.089327,0.117093,-0.077107,0.152579,0.047529,-0.015998,-0.110657,...,-0.094629,0.069718,0.014379,0.048124,0.007586,-0.01698,45,,1,6


In [6]:
# Summary statistics for the labels plus every 32nd feature among the first 267.
train_df[LABELS + FEATURES[0:267:32]].describe()

Unnamed: 0,label_1,label_2,label_3,label_4,feature_1,feature_33,feature_65,feature_97,feature_129,feature_161,feature_193,feature_225,feature_257
count,28520.0,28040.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0
mean,30.498843,27.975107,0.799299,5.997125,0.065874,-0.043585,0.038885,0.054217,-0.079537,0.025433,-0.001322,0.01804,-0.056276
std,17.328389,5.735913,0.400532,2.375567,0.077031,0.058303,0.055592,0.342679,0.059705,0.056393,0.058277,0.074377,0.071872
min,1.0,22.0,0.0,0.0,-0.208012,-0.306159,-0.200614,-1.342383,-0.35909,-0.250736,-0.252896,-0.484277,-0.347124
25%,15.0,25.0,1.0,6.0,0.017124,-0.081056,0.003292,-0.181106,-0.117379,-0.006828,-0.038038,-0.03044,-0.103898
50%,30.0,27.0,1.0,6.0,0.067048,-0.041906,0.03958,0.048112,-0.074849,0.027357,-0.001771,0.014303,-0.060172
75%,46.0,30.0,1.0,6.0,0.117308,-0.004928,0.076112,0.302699,-0.038362,0.060247,0.035775,0.067079,-0.011663
max,60.0,61.0,1.0,13.0,0.33554,0.212618,0.258643,1.239059,0.140202,0.268009,0.253798,0.309669,0.340745


In [7]:
# Scaled feature frames and target series, keyed by target label name.
x_train, x_valid, x_test = {}, {}, {}
y_train, y_valid, y_test = {}, {}, {}

for target_label in LABELS:
  # Only label_2 has missing targets; drop those rows when it is the target.
  if target_label == "label_2":
    tr_df = train_df[train_df['label_2'].notna()]
    vl_df = valid_df[valid_df['label_2'].notna()]
  else:
    tr_df, vl_df = train_df, valid_df
  te_df = test_df

  # Fit the scaler on the training features only, then reuse it for valid/test.
  scaler = RobustScaler()
  train_scaled = scaler.fit_transform(tr_df.drop(LABELS, axis=1))
  valid_scaled = scaler.transform(vl_df.drop(LABELS, axis=1))
  test_scaled = scaler.transform(te_df.drop(["ID"],axis=1))

  x_train[target_label] = pd.DataFrame(train_scaled, columns=FEATURES)
  x_valid[target_label] = pd.DataFrame(valid_scaled, columns=FEATURES)
  x_test[target_label] = pd.DataFrame(test_scaled, columns=FEATURES)

  # The test split has no targets (they are what gets predicted), so y_test stays empty.
  y_train[target_label] = tr_df[target_label]
  y_valid[target_label] = vl_df[target_label]




The code above removes rows with NaN values in label 2 (only when label 2 is the target). The scaled features and the targets for all four labels are then organized into dictionaries keyed by label name for easier handling and convenience.

# Support Functions

### Cross validation

In [8]:
def crossValidation(model, x_train, y_train):
    """Run shuffled 5-fold cross-validation for ``model`` and print a summary.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    x_train : training features.
    y_train : training targets.

    Prints the estimator class name, the per-fold scores, their standard
    deviation and their mean (as a percentage). Returns nothing.
    """
    model_name = type(model).__name__

    # Fixed seed so the fold assignment is the same for every model compared.
    scores = cross_val_score(
        model,
        x_train,
        y_train,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
    )
    mean_accuracy, std_accuracy = scores.mean(), scores.std()

    print('--',model_name,'--')
    print("Cross-validation scores:", scores)
    print(f"Standard Deviation: {std_accuracy:.2f}")
    print("***Mean Accuracy***: {:.2f}%".format(mean_accuracy * 100))
    print('\n\n')


### Random Search for hyper parameter tuning 

In [9]:
def randomSearch(modelName, x_train, y_train):
    """Randomised hyper-parameter search (10 candidates, 5-fold CV, accuracy).

    Parameters
    ----------
    modelName : str
        One of ``"svm"``, ``"catBoost"`` or ``"randomForest"``.
    x_train, y_train
        Training features and targets handed to ``RandomizedSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (also printed).

    Raises
    ------
    ValueError
        If ``modelName`` is not one of the supported names.
    """
    # Each entry: (estimator factory, search space, n_jobs). Factories are lazy
    # so only the requested estimator is ever constructed.
    search_spaces = {
        "svm": (
            lambda: SVC(),
            {
                'C': [100,10,1],
                'kernel': ['rbf','linear','poly','sigmoid'],
                'gamma': ['scale','auto',1,10],
                'degree': [1,2,3],  # Only used by the polynomial kernel
                # The object None (no weighting), not the string 'None',
                # which SVC rejects as an invalid class_weight.
                'class_weight' : [None,'balanced'],
            },
            -1,
        ),
        "catBoost": (
            lambda: CatBoostClassifier(iterations=100,task_type="GPU",devices='0:1'),
            {
                'depth': [2,6,10],
                'learning_rate': [0.1,1,10],
                'l2_leaf_reg': [1,2],
                'random_strength': [0,1],
            },
            1,  # single worker for the GPU-backed model
        ),
        "randomForest": (
            lambda: RandomForestClassifier(),
            {
                'n_estimators': [1,10,100],
                'max_depth': [1,10],
                # An integer min_samples_split must be at least 2.
                'min_samples_split': [2,10],
                'min_samples_leaf': [1,10],
            },
            -1,
        ),
    }

    if modelName not in search_spaces:
        raise ValueError(
            f"Unknown modelName {modelName!r}; expected one of {sorted(search_spaces)}"
        )

    build_model, param_dist, nJobs = search_spaces[modelName]

    random_search = RandomizedSearchCV(
        build_model(), param_distributions=param_dist, n_iter=10, cv=5, n_jobs=nJobs, random_state=42, scoring='accuracy'
    )
    random_search.fit(x_train, y_train)

    best_params = random_search.best_params_
    print("best parameters:", best_params)
    return best_params

### Grid Search for hyper parameter tuning 

In [10]:
def gridSearch(modelName, param_grid, model, x_train, y_train):
    """Exhaustive grid search (5-fold CV, accuracy) over ``param_grid``.

    Parameters
    ----------
    modelName : str
        Model family name; ``"catBoost"`` runs with a single worker, every
        other value uses all available cores.
    param_grid : dict
        Grid handed to ``GridSearchCV``.
    model : estimator to tune.
    x_train, y_train
        Training features and targets.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (also printed), so callers can
        build the final model from the tuned values instead of retyping them.
    """
    # NOTE(review): presumably one worker because the CatBoost models in this
    # notebook are configured for GPU training - confirm.
    nJobs = 1 if modelName == "catBoost" else -1

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=nJobs, verbose=3)
    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    print("best parameters:", best_params)
    return best_params

In [11]:
def generate_param_grid(best_params, modelName):
    """Build a narrow grid-search space around random-search results.

    Parameters
    ----------
    best_params : dict
        Best parameters found by the random search for ``modelName``.
    modelName : str
        ``'svc'``/``'svm'``, ``'catBoost'`` or ``'randomForest'``.

    Returns
    -------
    dict
        Parameter grid for ``GridSearchCV``; empty for an unrecognised
        ``modelName``.
    """
    # Membership test: `modelName == 'svc' or 'svm'` is always truthy and
    # would send every model through the SVM branch.
    if modelName in ('svc', 'svm'):
        return {
            'kernel': [best_params['kernel']],
            'gamma': ['scale', 'auto'] if best_params['gamma'] == 'scale' else [best_params['gamma']],
            'degree': [best_params['degree']],
            'class_weight': [best_params['class_weight']],
            # Doubling without int() keeps a fractional C from collapsing to 0.
            'C': [best_params['C'], 2 * best_params['C']],
        }

    if modelName == 'catBoost':
        depth = best_params['depth']
        return {
            'depth': [depth - 1, depth, depth + 1],
            # No int() here: int(0.1) * 2 would yield a learning rate of 0.
            'learning_rate': [best_params['learning_rate'], best_params['learning_rate'] * 2],
            'l2_leaf_reg': [best_params['l2_leaf_reg']],
            'random_strength': [best_params['random_strength']],
        }

    if modelName == 'randomForest':
        return {
            'n_estimators': [best_params['n_estimators'], int(best_params['n_estimators'])*2],
            'max_depth': [best_params['max_depth'], int(best_params['max_depth'])*2],
            'min_samples_split': [best_params['min_samples_split']],
            'min_samples_leaf': [best_params['min_samples_leaf']],
        }

    return {}

### Create output csv

In [12]:
def create_csv_from_labels(y_valid_pred, output_file):
    """Write four prediction sequences to a CSV with a 1-based ``ID`` column.

    Parameters
    ----------
    y_valid_pred : sequence
        Positions 0-3 hold the predictions for label_1 ... label_4.
    output_file : path of the CSV file to write (no index column).
    """
    n_rows = len(y_valid_pred[0])
    columns = {'ID': range(1, n_rows + 1)}

    # Position k of the input becomes column label_{k+1}.
    for position, column_name in enumerate(('label_1', 'label_2', 'label_3', 'label_4')):
        columns[column_name] = y_valid_pred[position]

    pd.DataFrame(columns).to_csv(output_file, index=False)

### Model Train

In [13]:
def modelTrain(model, x_train, y_train, x_valid=None, y_valid=None, x_test=None):
    """Fit ``model``, optionally score it on validation data and predict test data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    x_train, y_train : training features and targets.
    x_valid, y_valid : optional validation features and targets; when both are
        given and ``x_valid`` is non-empty, the validation accuracy is printed.
    x_test : optional test features to predict.

    Returns
    -------
    The predictions for ``x_test``, or ``False`` when no non-empty ``x_test``
    was supplied.
    """
    def _has_rows(data):
        # The optional splits default to None, so check that before `.empty`.
        if data is None:
            return False
        is_empty = getattr(data, "empty", None)
        # Objects without `.empty` (e.g. arrays, lists) fall back to len().
        return len(data) > 0 if is_empty is None else not is_empty

    # Train
    model.fit(x_train, y_train)
    model_name = type(model).__name__
    y_test_pred = False

    if _has_rows(x_valid) and y_valid is not None:
        y_valid_pred = model.predict(x_valid)
        print(model_name,"accuracy_score for validation data set: ",metrics.accuracy_score(y_valid, y_valid_pred))

    if _has_rows(x_test):
        y_test_pred = model.predict(x_test)

    return y_test_pred

# Label training

In [68]:
# Collects the test-set predictions of each label, in label order, for the final output CSV.
all_labels = list()

# Label 1

In [69]:
# Target for this section and the n_components value passed to PCA.
Label, pca_NComponents = L1, 0.97

In [70]:
# Inspect the class distribution of the current target to judge whether it is imbalanced.
target_column = train_df[Label]
print(target_column.unique())
print(target_column.value_counts())
print(train_df.info())

[45  5 60 19 11 52 25 46 51 35 56 53  3 40 43 58 44 37 55 17  2 47 54 21
 34 23 10 28 20  7  6  4 48 32 12 22 38 36 59 50 14 15 24 13 29 18  1  9
 49 27 42 26 41 57  8 33 31 16 30 39]
12    485
35    484
26    483
60    482
24    482
25    481
59    481
10    481
54    481
45    480
41    480
9     480
2     479
42    479
47    479
6     479
56    479
34    478
52    478
3     478
14    478
33    478
43    477
1     477
13    477
20    477
23    477
30    476
51    476
32    476
53    476
22    476
38    476
49    476
55    475
28    474
8     474
40    474
48    474
21    474
4     474
39    473
17    473
7     473
15    472
58    472
5     471
27    471
31    470
19    469
11    469
46    469
29    469
36    468
16    468
50    467
37    467
44    467
57    466
18    465
Name: label_1, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28520 entries, 0 to 28519
Columns: 772 entries, feature_1 to label_4
dtypes: float64(769), int64(3)
memory usage: 168.0 MB
None


According to the above information, label 1 is not biased, so it is not necessary to apply over-sampling or under-sampling.

In [71]:
# Working copies of the scaled splits for the current label (the test split has no targets).
x_train_df, y_train_df = x_train[Label].copy(), y_train[Label].copy()
x_valid_df, y_valid_df = x_valid[Label].copy(), y_valid[Label].copy()
x_test_df = x_test[Label].copy()

In [72]:
# Dimension reduction: fit PCA on the training features only, then project every split with it.
pca = PCA(n_components=pca_NComponents, svd_solver='full').fit(x_train_df)
x_train_df_pca, x_valid_df_pca, x_test_df_pca = (
    pd.DataFrame(pca.transform(split)) for split in (x_train_df, x_valid_df, x_test_df)
)
print('Shape after PCA: ',x_train_df_pca.shape)

Shape after PCA:  (28520, 411)


First of all, it is necessary to select the best classification algorithm for model training. Therefore, cross-validation techniques have been used to choose the best classifier. Four different classifiers have been applied for cross-validation.

In [73]:
# Baseline comparison of four classifiers on the scaled features, before PCA.
svm_classifier = svm.SVC()
knn_classifier = KNeighborsClassifier(n_neighbors=5)
catBoost_classifier = CatBoostClassifier(iterations=100,task_type="GPU")
randomForest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

for candidate in (svm_classifier, knn_classifier, catBoost_classifier, randomForest_classifier):
    crossValidation(candidate, x_train_df, y_train_df)


-- SVC --
Cross-validation scores: [0.95038569 0.9470547  0.94723001 0.94793128 0.94652875]
Standard Deviation: 0.00
***Mean Accuracy***: 94.78%



-- KNeighborsClassifier --
Cross-validation scores: [nan nan nan nan nan]
Standard Deviation: nan
***Mean Accuracy***: nan%



Learning rate set to 0.5
0:	learn: 3.6057920	total: 769ms	remaining: 1m 16s
1:	learn: 3.3065122	total: 1.36s	remaining: 1m 6s
2:	learn: 2.9810563	total: 2.04s	remaining: 1m 6s
3:	learn: 2.7235499	total: 2.71s	remaining: 1m 5s
4:	learn: 2.5119317	total: 3.38s	remaining: 1m 4s
5:	learn: 2.3095864	total: 4.06s	remaining: 1m 3s
6:	learn: 2.1109924	total: 4.73s	remaining: 1m 2s
7:	learn: 1.9574053	total: 5.39s	remaining: 1m 1s
8:	learn: 1.8336591	total: 6.07s	remaining: 1m 1s
9:	learn: 1.7524635	total: 6.62s	remaining: 59.6s
10:	learn: 1.6464150	total: 7.27s	remaining: 58.8s
11:	learn: 1.5505873	total: 7.9s	remaining: 57.9s
12:	learn: 1.4617098	total: 8.56s	remaining: 57.3s
13:	learn: 1.3737719	total: 9.23s	remaining: 56

svm - 94.78% <br/>
knn - nan% (cross-validation failed; see the output above) <br/>
catBoost - 84.45% <br/>
random forest - 84.96% <br/>
Therefore, SVM is the best classifier and is used for the remaining steps.

In [74]:
# Re-score the SVM on the PCA-reduced features to see what the reduction costs in accuracy.
crossValidation(model=svm_classifier, x_train=x_train_df_pca, y_train=y_train_df)

-- SVC --
Cross-validation scores: [0.94740533 0.94407433 0.9447756  0.94617812 0.9447756 ]
Standard Deviation: 0.00
***Mean Accuracy***: 94.54%





Based on the previous observation, the accuracy has not significantly decreased even though PCA has been applied. Therefore, the dataset that underwent PCA is being used for further analysis.

In [75]:
# Randomised hyper-parameter search for the SVM on the PCA-reduced training data.
best_params_from_random_search = randomSearch(
    modelName='svm', x_train=x_train_df_pca, y_train=y_train_df
)

best parameters: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'class_weight': 'balanced', 'C': 1}


Random search evaluates randomly sampled hyper-parameter combinations for the chosen classifier and keeps the best-scoring one. Using these parameters, create a suitable (narrower) parameter grid for grid search.

In [76]:
# Build a narrow grid around the random-search result, then search it exhaustively.
param_dist_svm = generate_param_grid(best_params=best_params_from_random_search, modelName='svm')

gridSearch(
    modelName='svm',
    param_grid=param_dist_svm,
    model=svm_classifier,
    x_train=x_train_df_pca,
    y_train=y_train_df,
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
best parameters: {}


The combination of random search followed by grid search strikes a balance between exploration and exploitation of the hyperparameter space, making hyperparameter tuning more efficient and effective in finding good model configurations.

In [77]:
# Final SVM configuration for this label.
svm_classifier = svm.SVC(C=1, kernel='rbf', class_weight='balanced')

# Fit on the PCA-reduced training data, report validation accuracy, predict the test split.
y_test_pred = modelTrain(
    svm_classifier,
    x_train_df_pca,
    y_train_df,
    x_valid=x_valid_df_pca,
    y_valid=y_valid_df,
    x_test=x_test_df_pca,
)

all_labels.append(y_test_pred)


SVC accuracy_score for validation data set:  0.9573333333333334


# Label 2

In [78]:
# Target for this section and the n_components value passed to PCA.
Label, pca_NComponents = L2, 0.97

In [79]:
# Inspect the class distribution of the current target to judge whether it is imbalanced.
target_column = train_df[Label]
print(target_column.unique())
print(target_column.value_counts())
print(train_df.info())

[nan 25. 27. 23. 33. 34. 22. 30. 26. 24. 31. 29. 61. 28. 36. 32. 35. 41.]
26.0    4762
25.0    2849
27.0    2846
23.0    2842
31.0    2385
24.0    1906
28.0    1899
30.0    1894
22.0    1432
29.0    1424
33.0     945
36.0     481
35.0     480
34.0     478
32.0     476
41.0     474
61.0     467
Name: label_2, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28520 entries, 0 to 28519
Columns: 772 entries, feature_1 to label_4
dtypes: float64(769), int64(3)
memory usage: 168.0 MB
None


According to the above information, label 2 is not biased, so it is not necessary to apply over-sampling or under-sampling.

In [80]:
# Working copies of the scaled splits for the current label (the test split has no targets).
x_train_df, y_train_df = x_train[Label].copy(), y_train[Label].copy()
x_valid_df, y_valid_df = x_valid[Label].copy(), y_valid[Label].copy()
x_test_df = x_test[Label].copy()

In [81]:
# Dimension reduction: fit PCA on the training features only, then project every split with it.
pca = PCA(n_components=pca_NComponents, svd_solver='full').fit(x_train_df)
x_train_df_pca, x_valid_df_pca, x_test_df_pca = (
    pd.DataFrame(pca.transform(split)) for split in (x_train_df, x_valid_df, x_test_df)
)
print('Shape after PCA: ',x_train_df_pca.shape)

Shape after PCA:  (28040, 411)


First of all, it is necessary to select the best classification algorithm for model training. Therefore, cross-validation techniques have been used to choose the best classifier. Four different classifiers have been applied for cross-validation.

In [82]:
# Baseline comparison of four classifiers on the scaled features, before PCA.
svm_classifier = svm.SVC()
knn_classifier = KNeighborsClassifier(n_neighbors=5)
catBoost_classifier = CatBoostClassifier(iterations=100,task_type="GPU")
randomForest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

for candidate in (svm_classifier, knn_classifier, catBoost_classifier, randomForest_classifier):
    crossValidation(candidate, x_train_df, y_train_df)


-- SVC --
Cross-validation scores: [0.90531384 0.90139087 0.9017475  0.89907275 0.9038873 ]
Standard Deviation: 0.00
***Mean Accuracy***: 90.23%



-- KNeighborsClassifier --
Cross-validation scores: [nan nan nan nan nan]
Standard Deviation: nan
***Mean Accuracy***: nan%



Learning rate set to 0.5
0:	learn: 2.4634050	total: 268ms	remaining: 26.6s
1:	learn: 2.2891894	total: 464ms	remaining: 22.7s
2:	learn: 2.1538824	total: 658ms	remaining: 21.3s
3:	learn: 2.0468581	total: 848ms	remaining: 20.4s
4:	learn: 1.9823369	total: 1.02s	remaining: 19.5s
5:	learn: 1.8825374	total: 1.22s	remaining: 19s
6:	learn: 1.8112253	total: 1.4s	remaining: 18.6s
7:	learn: 1.7466733	total: 1.59s	remaining: 18.3s
8:	learn: 1.6893486	total: 1.78s	remaining: 18s
9:	learn: 1.6274391	total: 1.97s	remaining: 17.7s
10:	learn: 1.5769485	total: 2.16s	remaining: 17.5s
11:	learn: 1.5374268	total: 2.33s	remaining: 17.1s
12:	learn: 1.4933365	total: 2.52s	remaining: 16.9s
13:	learn: 1.4589383	total: 2.69s	remaining: 16.5s
1

svm - 90.23% <br/>
knn - nan% (cross-validation failed; see the output above) <br/>
catBoost - 84.45% <br/>
random forest - 84.96% <br/>
Therefore, SVM is the best classifier and is used for the remaining steps.

In [83]:
# Re-score the SVM on the PCA-reduced features to see what the reduction costs in accuracy.
crossValidation(model=svm_classifier, x_train=x_train_df_pca, y_train=y_train_df)

-- SVC --
Cross-validation scores: [0.9006776  0.89514979 0.89711127 0.89604137 0.90139087]
Standard Deviation: 0.00
***Mean Accuracy***: 89.81%





Based on the previous observation, the accuracy has not significantly decreased even though PCA has been applied. Therefore, the dataset that underwent PCA is being used for further analysis.

In [84]:
# Randomised hyper-parameter search for the SVM on the PCA-reduced training data.
best_params_from_random_search = randomSearch(
    modelName='svm', x_train=x_train_df_pca, y_train=y_train_df
)

best parameters: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'class_weight': 'balanced', 'C': 1}


Random search evaluates randomly sampled hyper-parameter combinations for the chosen classifier and keeps the best-scoring one. Using these parameters, create a suitable (narrower) parameter grid for grid search.

In [85]:
# Build a narrow grid around the random-search result, then search it exhaustively.
param_dist_svm = generate_param_grid(best_params=best_params_from_random_search, modelName='svm')

gridSearch(
    modelName='svm',
    param_grid=param_dist_svm,
    model=svm_classifier,
    x_train=x_train_df_pca,
    y_train=y_train_df,
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
best parameters: {}


The combination of random search followed by grid search strikes a balance between exploration and exploitation of the hyperparameter space, making hyperparameter tuning more efficient and effective in finding good model configurations.

In [86]:
# Final SVM configuration for this label.
svm_classifier = svm.SVC(C=1, kernel='rbf', class_weight='balanced')

# Fit on the PCA-reduced training data, report validation accuracy, predict the test split.
y_test_pred = modelTrain(
    svm_classifier,
    x_train_df_pca,
    y_train_df,
    x_valid=x_valid_df_pca,
    y_valid=y_valid_df,
    x_test=x_test_df_pca,
)

all_labels.append(y_test_pred)


SVC accuracy_score for validation data set:  0.9116847826086957


# Label 3

In [87]:
# Target for this section and the n_components value passed to PCA.
Label, pca_NComponents = L3, 0.97

In [88]:
# Inspect the class distribution of the current target to judge whether it is imbalanced.
target_column = train_df[Label]
print(target_column.unique())
print(target_column.value_counts())
print(train_df.info())

[1 0]
1    22796
0     5724
Name: label_3, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28520 entries, 0 to 28519
Columns: 772 entries, feature_1 to label_4
dtypes: float64(769), int64(3)
memory usage: 168.0 MB
None


According to the above information, label 3 is biased towards the value 1. Therefore, it is necessary to apply over-sampling and under-sampling.

In [89]:
# Working copies of the scaled splits for the current label (the test split has no targets).
x_train_df, y_train_df = x_train[Label].copy(), y_train[Label].copy()
x_valid_df, y_valid_df = x_valid[Label].copy(), y_valid[Label].copy()
x_test_df = x_test[Label].copy()

In [90]:
# Dimension reduction: fit PCA on the training features only, then project every split with it.
pca = PCA(n_components=pca_NComponents, svd_solver='full').fit(x_train_df)
x_train_df_pca, x_valid_df_pca, x_test_df_pca = (
    pd.DataFrame(pca.transform(split)) for split in (x_train_df, x_valid_df, x_test_df)
)
print('Shape after PCA: ',x_train_df_pca.shape)

Shape after PCA:  (28520, 411)


First of all, it is necessary to select the best classification algorithm for model training. Therefore, cross-validation techniques have been used to choose the best classifier. Four different classifiers have been applied for cross-validation.

In [91]:
# Baseline comparison of four classifiers on the scaled features, before PCA.
svm_classifier = svm.SVC()
knn_classifier = KNeighborsClassifier(n_neighbors=5)
catBoost_classifier = CatBoostClassifier(iterations=100,task_type="GPU")
randomForest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

for candidate in (svm_classifier, knn_classifier, catBoost_classifier, randomForest_classifier):
    crossValidation(candidate, x_train_df, y_train_df)


-- SVC --
Cross-validation scores: [0.99701964 0.99614306 0.99684432 0.99701964 0.99754558]
Standard Deviation: 0.00
***Mean Accuracy***: 99.69%



-- KNeighborsClassifier --
Cross-validation scores: [nan nan nan nan nan]
Standard Deviation: nan
***Mean Accuracy***: nan%



Learning rate set to 0.223393
0:	learn: 0.4672159	total: 71.6ms	remaining: 7.09s
1:	learn: 0.3616856	total: 94.8ms	remaining: 4.64s
2:	learn: 0.2982112	total: 120ms	remaining: 3.89s
3:	learn: 0.2580124	total: 145ms	remaining: 3.48s
4:	learn: 0.2236338	total: 170ms	remaining: 3.23s
5:	learn: 0.2032320	total: 195ms	remaining: 3.05s
6:	learn: 0.1860437	total: 219ms	remaining: 2.91s
7:	learn: 0.1717560	total: 244ms	remaining: 2.8s
8:	learn: 0.1600713	total: 267ms	remaining: 2.7s
9:	learn: 0.1502907	total: 289ms	remaining: 2.6s
10:	learn: 0.1404610	total: 312ms	remaining: 2.52s
11:	learn: 0.1319064	total: 335ms	remaining: 2.46s
12:	learn: 0.1261825	total: 358ms	remaining: 2.39s
13:	learn: 0.1203026	total: 382ms	remaining

svm - 99.69% <br/>
knn - nan% (cross-validation failed; see the output above) <br/>
catBoost - 84.45% <br/>
random forest - 84.96% <br/>
Therefore, SVM is the best classifier and is used for the remaining steps.

In [92]:
# Re-score the SVM on the PCA-reduced features to see what the reduction costs in accuracy.
crossValidation(model=svm_classifier, x_train=x_train_df_pca, y_train=y_train_df)

-- SVC --
Cross-validation scores: [0.996669   0.99614306 0.996669   0.996669   0.99754558]
Standard Deviation: 0.00
***Mean Accuracy***: 99.67%





Based on the previous observation, the accuracy has not significantly decreased even though PCA has been applied. Therefore, the dataset that underwent PCA is being used for further analysis.

In [93]:
# Randomised hyper-parameter search for the SVM on the PCA-reduced training data.
best_params_from_random_search = randomSearch(
    modelName='svm', x_train=x_train_df_pca, y_train=y_train_df
)

best parameters: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'class_weight': 'balanced', 'C': 1}


Random search evaluates randomly sampled hyper-parameter combinations for the chosen classifier and keeps the best-scoring one. Using these parameters, create a suitable (narrower) parameter grid for grid search.

In [94]:
# Build a narrow grid around the random-search result, then search it exhaustively.
param_dist_svm = generate_param_grid(best_params=best_params_from_random_search, modelName='svm')

gridSearch(
    modelName='svm',
    param_grid=param_dist_svm,
    model=svm_classifier,
    x_train=x_train_df_pca,
    y_train=y_train_df,
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
best parameters: {}


The combination of random search followed by grid search strikes a balance between exploration and exploitation of the hyperparameter space, making hyperparameter tuning more efficient and effective in finding good model configurations.

In [95]:
# Final SVM configuration for this label.
svm_classifier = svm.SVC(C=100, kernel='rbf', class_weight='balanced')

# Fit on the PCA-reduced training data, report validation accuracy, predict the test split.
y_test_pred = modelTrain(
    svm_classifier,
    x_train_df_pca,
    y_train_df,
    x_valid=x_valid_df_pca,
    y_valid=y_valid_df,
    x_test=x_test_df_pca,
)

all_labels.append(y_test_pred)


SVC accuracy_score for validation data set:  0.9973333333333333


# Label 4

In [14]:
# Target for this section and the n_components value passed to PCA.
Label, pca_NComponents = L4, 0.97

In [15]:
# Inspect the class distribution of the current target to judge whether it is imbalanced.
target_column = train_df[Label]
print(target_column.unique())
print(target_column.value_counts())
print(train_df.info())

[ 6 13  4  5  1  2  7  3  0 12  9  8 11 10]
6     19938
2      1449
0       955
12      954
7       938
13      482
1       481
11      480
10      480
3       479
5       478
9       472
4       469
8       465
Name: label_4, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28520 entries, 0 to 28519
Columns: 772 entries, feature_1 to label_4
dtypes: float64(769), int64(3)
memory usage: 168.0 MB
None


According to the above information, label 4 is biased towards the values 2 and 6. Therefore, it is necessary to apply over-sampling and under-sampling.

In [16]:
# Working copies of the scaled splits for the current label (the test split has no targets).
x_train_df, y_train_df = x_train[Label].copy(), y_train[Label].copy()
x_valid_df, y_valid_df = x_valid[Label].copy(), y_valid[Label].copy()
x_test_df = x_test[Label].copy()

In [17]:
# Dimension reduction: fit PCA on the training features only, then project every split with it.
pca = PCA(n_components=pca_NComponents, svd_solver='full').fit(x_train_df)
x_train_df_pca, x_valid_df_pca, x_test_df_pca = (
    pd.DataFrame(pca.transform(split)) for split in (x_train_df, x_valid_df, x_test_df)
)
print('Shape after PCA: ',x_train_df_pca.shape)

Shape after PCA:  (28520, 411)


First of all, it is necessary to select the best classification algorithm for model training. Therefore, cross-validation techniques have been used to choose the best classifier. Four different classifiers have been applied for cross-validation.

In [100]:
# Baseline comparison of four classifiers on the scaled features, before PCA.
svm_classifier = svm.SVC()
knn_classifier = KNeighborsClassifier(n_neighbors=5)
catBoost_classifier = CatBoostClassifier(iterations=100,task_type="GPU")
randomForest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

for candidate in (svm_classifier, knn_classifier, catBoost_classifier, randomForest_classifier):
    crossValidation(candidate, x_train_df, y_train_df)


-- SVC --
Cross-validation scores: [0.94863254 0.9516129  0.95459327 0.94985975 0.94898317]
Standard Deviation: 0.00
***Mean Accuracy***: 95.07%



-- KNeighborsClassifier --
Cross-validation scores: [nan nan nan nan nan]
Standard Deviation: nan
***Mean Accuracy***: nan%



Learning rate set to 0.5
0:	learn: 1.3862129	total: 215ms	remaining: 21.3s
1:	learn: 1.2009523	total: 387ms	remaining: 18.9s
2:	learn: 1.1272747	total: 539ms	remaining: 17.4s
3:	learn: 1.0697368	total: 683ms	remaining: 16.4s
4:	learn: 1.0178063	total: 829ms	remaining: 15.8s
5:	learn: 0.9764876	total: 965ms	remaining: 15.1s
6:	learn: 0.9167721	total: 1.11s	remaining: 14.8s
7:	learn: 0.8696411	total: 1.27s	remaining: 14.7s
8:	learn: 0.8373295	total: 1.42s	remaining: 14.4s
9:	learn: 0.8004163	total: 1.58s	remaining: 14.2s
10:	learn: 0.7664919	total: 1.73s	remaining: 14s
11:	learn: 0.7341878	total: 1.87s	remaining: 13.7s
12:	learn: 0.7087548	total: 2.02s	remaining: 13.5s
13:	learn: 0.6814698	total: 2.17s	remaining: 13.3

svm - 95.07% <br/>
knn - nan% (cross-validation failed; see the output above) <br/>
catBoost - 84.45% <br/>
random forest - 84.96% <br/>
Therefore, SVM is the best classifier and is used for the remaining steps.

In [101]:
# Re-score the SVM on the PCA-reduced features to see what the reduction costs in accuracy.
crossValidation(model=svm_classifier, x_train=x_train_df_pca, y_train=y_train_df)

-- SVC --
Cross-validation scores: [0.94775596 0.95073633 0.95284011 0.94845722 0.94793128]
Standard Deviation: 0.00
***Mean Accuracy***: 94.95%





Based on the previous observation, the accuracy has not significantly decreased even though PCA has been applied. Therefore, the dataset that underwent PCA is being used for further analysis.

In [19]:
# Randomised hyper-parameter search for the SVM on the PCA-reduced training data.
best_params_from_random_search = randomSearch(
    modelName='svm', x_train=x_train_df_pca, y_train=y_train_df
)

Random search evaluates randomly sampled hyper-parameter combinations for the chosen classifier and keeps the best-scoring one. Using these parameters, create a suitable (narrower) parameter grid for grid search.

In [18]:
# Build a narrow grid around the random-search result and show it.
param_dist_svm = generate_param_grid(best_params=best_params_from_random_search, modelName='svm')
print(param_dist_svm)
# The grid-search call itself is currently disabled for this label:
# gridSearch('svm', param_dist_svm, svm_classifier, x_train_df_pca, y_train_df)

NameError: name 'best_params_from_random_search' is not defined

The combination of random search followed by grid search strikes a balance between exploration and exploitation of the hyperparameter space, making hyperparameter tuning more efficient and effective in finding good model configurations.

In [104]:
# Final SVM configuration for this label.
svm_classifier = svm.SVC(
    C=100,
    kernel='poly',
    degree=3,
    gamma='scale',
    class_weight='balanced',
)

# Fit on the PCA-reduced training data, report validation accuracy, predict the test split.
y_test_pred = modelTrain(
    svm_classifier,
    x_train_df_pca,
    y_train_df,
    x_valid=x_valid_df_pca,
    y_valid=y_valid_df,
    x_test=x_test_df_pca,
)

all_labels.append(y_test_pred)


SVC accuracy_score for validation data set:  0.9733333333333334


# make output csv

In [105]:
# Write the collected per-label test predictions to the submission file.
create_csv_from_labels(y_valid_pred=all_labels, output_file='layer7_output.csv')