In [1]:
import numpy as np
import pandas as pd
import joblib
import pickle
from sklearn.preprocessing import MinMaxScaler

In [2]:
def load_features_from_file(filepath):
    """Read a comma-delimited numeric file and split it into features and labels.

    Parameters
    ----------
    filepath
        Location of the file; passed straight to ``np.genfromtxt`` with
        ``delimiter=","``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except the last one and
        ``y`` holds the last column.
    """
    table = np.genfromtxt(filepath, delimiter=",")
    # The last column is the target; everything before it is a feature.
    features, labels = table[:, :-1], table[:, -1]
    return features, labels

# Drop features that are almost duplicates of an earlier feature
def remove_highly_correlated_features(X, threshold=0.95):
    """Remove columns whose absolute correlation with an earlier column exceeds ``threshold``.

    The absolute correlation matrix of ``X`` is computed with
    ``pandas.DataFrame.corr`` and only its strict upper triangle is inspected,
    so a column is compared solely against the columns that precede it.

    Parameters
    ----------
    X
        2-D data passed to ``pd.DataFrame`` and ``np.delete`` (columns are features).
    threshold : float, default 0.95
        A column is dropped when any of its absolute correlations with a
        preceding column is strictly greater than this value.

    Returns
    -------
    tuple
        ``(X_reduced, to_drop)``: the data with the selected columns deleted,
        and the list of dropped column labels.
    """
    abs_corr = pd.DataFrame(X).corr().abs()

    # True strictly above the diagonal, so each pair is looked at once and a
    # column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    to_drop = []
    for column in upper.columns:
        # Entries outside the upper triangle are NaN and never exceed the threshold.
        if (upper[column] > threshold).any():
            to_drop.append(column)

    return np.delete(X, to_drop, axis=1), to_drop

In [4]:
# Locations of the saved training / validation feature tables
TRAIN_FEATURES_CSV = "../2. Feature Selection & Extraction/Saved Features/Final_features_training.csv"
VAL_FEATURES_CSV = "../2. Feature Selection & Extraction/Saved Features/Final_features_validation.csv"

# Each file yields (features, labels): the last column is the label.
X_train, y_train = load_features_from_file(TRAIN_FEATURES_CSV)
X_val, y_val = load_features_from_file(VAL_FEATURES_CSV)

In [19]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np

# Wrap the feature matrices and labels in XGBoost's DMatrix containers
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Number of distinct labels seen in the training set; passed to XGBoost as `num_class`.
# NOTE(review): this presumes the labels are the integers 0..num_classes-1 — confirm.
num_classes = len(np.unique(y_train))

# Booster hyper-parameters
params = {
    'max_depth': 20,
    'eta': 0.1,
    'objective': 'multi:softprob',  # per-class probabilities for multi-class classification
    'eval_metric': 'mlogloss',
    'num_class': num_classes
}

# Data sets whose metric is logged every round; the LAST entry ('eval') is the
# one early stopping monitors.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Train for at most `num_round` rounds, stopping once the validation mlogloss
# has not improved for 10 consecutive rounds.
num_round = 200
bst = xgb.train(params, dtrain, num_round, watchlist, early_stopping_rounds=10)

# xgb.train returns the booster from the last round, not the best one, and the
# native Booster.predict uses every tree by default. Restrict prediction to the
# trees up to the best validation round so the rounds trained after the optimum
# do not influence the reported score.
y_pred_prob = bst.predict(dval, iteration_range=(0, bst.best_iteration + 1))
y_pred = np.argmax(y_pred_prob, axis=1)

# Fraction of validation rows whose arg-max class matches the label
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.2f}')


[0]	train-mlogloss:1.20384	eval-mlogloss:1.22189
[1]	train-mlogloss:1.05532	eval-mlogloss:1.08607




[2]	train-mlogloss:0.93115	eval-mlogloss:0.96972
[3]	train-mlogloss:0.82603	eval-mlogloss:0.87144
[4]	train-mlogloss:0.73551	eval-mlogloss:0.78684
[5]	train-mlogloss:0.65722	eval-mlogloss:0.71370
[6]	train-mlogloss:0.58844	eval-mlogloss:0.64939
[7]	train-mlogloss:0.52814	eval-mlogloss:0.59295
[8]	train-mlogloss:0.47492	eval-mlogloss:0.54174
[9]	train-mlogloss:0.42773	eval-mlogloss:0.49605
[10]	train-mlogloss:0.38574	eval-mlogloss:0.45629
[11]	train-mlogloss:0.34817	eval-mlogloss:0.42038
[12]	train-mlogloss:0.31473	eval-mlogloss:0.38829
[13]	train-mlogloss:0.28472	eval-mlogloss:0.35965
[14]	train-mlogloss:0.25792	eval-mlogloss:0.33373
[15]	train-mlogloss:0.23382	eval-mlogloss:0.31081
[16]	train-mlogloss:0.21219	eval-mlogloss:0.28998
[17]	train-mlogloss:0.19266	eval-mlogloss:0.27037
[18]	train-mlogloss:0.17504	eval-mlogloss:0.25215
[19]	train-mlogloss:0.15926	eval-mlogloss:0.23655
[20]	train-mlogloss:0.14498	eval-mlogloss:0.22173
[21]	train-mlogloss:0.13197	eval-mlogloss:0.20767
[22]	tra

In [25]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Linear-kernel SVM baseline, fitted on the training features.
# `gamma` is only consulted by the rbf/poly/sigmoid kernels, so it has no effect here;
# `probability=True` makes `predict_proba` available on the fitted model.
svm_model = SVC(
    kernel='linear',
    C=1,
    gamma='scale',
    probability=True,
).fit(X_train, y_train)

# Hard class predictions for the validation features
y_pred = svm_model.predict(X_val)

# Share of validation rows predicted correctly
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.2f}')


Validation Accuracy: 0.99


In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Random Forest baseline: 200 trees, depth capped at 10, sqrt(n_features)
# candidate features per split, fixed seed; fitted on the training features.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions for the validation features
y_pred = rf_model.predict(X_val)

# Share of validation rows predicted correctly
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.2f}')


Validation Accuracy: 0.99


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Internal sanity check: hold out 20% of the TRAINING data and score a Random
# Forest on it.
#
# The split is bound to new names on purpose. Rebinding X_train / X_val /
# y_train / y_val here would throw away the validation set loaded from file and
# shrink the training set again on every re-run of this cell, silently changing
# what every later cell trains and evaluates on.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Initialize the Random Forest model with additional parameters
rf_model = RandomForestClassifier(
    n_estimators=200,  # Increase the number of trees
    max_depth=10,  # Limit the maximum depth of each tree
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',  # Use the square root of the number of features
    random_state=42
)

# Train on the 80% portion only
rf_model.fit(X_fit, y_fit)

# Predict the held-out 20%
y_pred = rf_model.predict(X_holdout)

# Accuracy on the held-out portion of the training data
accuracy = accuracy_score(y_holdout, y_pred)
print(f'Validation Accuracy: {accuracy:.2f}')


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np

# --- Base model 1: XGBoost -------------------------------------------------
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
# NOTE(review): presumes labels are the integers 0..num_classes-1 — confirm.
num_classes = len(np.unique(y_train))
params = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': num_classes
}
num_round = 100
# Early stopping monitors the last entry of the evaluation list ('eval').
bst = xgb.train(params, dtrain, num_round, [(dtrain, 'train'), (dval, 'eval')], early_stopping_rounds=10)

# --- Base model 2: linear SVM on standardized features ---------------------
svm_model = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1, probability=True))
svm_model.fit(X_train, y_train)

# --- Base model 3: Random Forest -------------------------------------------
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Class-probability predictions of every base model on the validation set.
# xgb.train returns the last-round booster and Booster.predict uses all trees
# by default, so limit prediction to the best early-stopping iteration.
y_pred_xgb = bst.predict(dval, iteration_range=(0, bst.best_iteration + 1))
y_pred_svm = svm_model.predict_proba(X_val)
y_pred_rf = rf_model.predict_proba(X_val)

# Meta-features: the three probability blocks side by side (stacking)
X_meta = np.hstack((y_pred_xgb, y_pred_svm, y_pred_rf))

# The meta-model must be scored on rows it was NOT fitted on; fitting and
# scoring on the same validation rows reports an optimistically biased accuracy.
# Half of the validation meta-features train the meta-model, the other half
# is kept aside for the final score.
X_meta_fit, X_test_meta, y_meta_fit, y_meta_test = train_test_split(
    X_meta, y_val, test_size=0.5, stratify=y_val, random_state=42
)

# Train the meta-model (Logistic Regression) on the fitting half
meta_model = LogisticRegression(random_state=42)
meta_model.fit(X_meta_fit, y_meta_fit)

# Final predictions of the stacked model on the untouched half
y_pred_meta = meta_model.predict(X_test_meta)

# Accuracy of the stacked model on validation rows unseen by the meta-model
accuracy = accuracy_score(y_meta_test, y_pred_meta)
print(f'Validation Accuracy: {accuracy:.2f}')


[0]	train-mlogloss:0.89018	eval-mlogloss:0.93369
[1]	train-mlogloss:0.61980	eval-mlogloss:0.67869




[2]	train-mlogloss:0.44727	eval-mlogloss:0.51425
[3]	train-mlogloss:0.32841	eval-mlogloss:0.39872
[4]	train-mlogloss:0.24367	eval-mlogloss:0.31525
[5]	train-mlogloss:0.18323	eval-mlogloss:0.25486
[6]	train-mlogloss:0.13894	eval-mlogloss:0.20920
[7]	train-mlogloss:0.10536	eval-mlogloss:0.17332
[8]	train-mlogloss:0.08077	eval-mlogloss:0.14631
[9]	train-mlogloss:0.06237	eval-mlogloss:0.12538
[10]	train-mlogloss:0.04871	eval-mlogloss:0.11070
[11]	train-mlogloss:0.03819	eval-mlogloss:0.09676
[12]	train-mlogloss:0.03005	eval-mlogloss:0.08754
[13]	train-mlogloss:0.02400	eval-mlogloss:0.07966
[14]	train-mlogloss:0.01930	eval-mlogloss:0.07171
[15]	train-mlogloss:0.01572	eval-mlogloss:0.06586
[16]	train-mlogloss:0.01298	eval-mlogloss:0.06167
[17]	train-mlogloss:0.01077	eval-mlogloss:0.05876
[18]	train-mlogloss:0.00909	eval-mlogloss:0.05650
[19]	train-mlogloss:0.00773	eval-mlogloss:0.05378
[20]	train-mlogloss:0.00666	eval-mlogloss:0.05205
[21]	train-mlogloss:0.00581	eval-mlogloss:0.05063
[22]	tra

In [36]:
# Persist the trained SVM model (the object currently bound to `svm_model`)
# to "svm.pkl" in the working directory using joblib.
joblib_file = "svm.pkl"
joblib.dump(svm_model, joblib_file)

['svm.pkl']