
# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import zipfile
import os
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.copy_on_write = True

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
pio.templates.default = 'plotly_dark'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder , MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score ,auc ,roc_auc_score , confusion_matrix

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
import optuna
import logging
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances ,plot_contour
optuna.logging.set_verbosity(optuna.logging.WARNING)

## Read Data

In [None]:
def unzip_file_to_same_location(zip_path):
    """Unpack a zip archive into the directory that contains it.

    Args:
        zip_path: Path to the ``.zip`` file. When the path has no directory
            component, members are extracted relative to the current
            working directory.

    Prints ``Extraction Complete`` once every member has been written.
    """
    # The head of the split path is exactly what os.path.dirname returns.
    destination, _archive_name = os.path.split(zip_path)

    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=destination)
        print('Extraction Complete')

In [None]:
# Path of the archive to unpack; the helper extracts it into the archive's
# own directory (presumably it contains exercise_angles.csv -- confirm).
zip_file_path = 'archive (2).zip'
unzip_file_to_same_location(zip_file_path)

In [None]:
# Load the CSV into the working DataFrame used by the following cells.
df = pd.read_csv('exercise_angles.csv')

In [None]:
# Report the frame's dimensions. Note that "features" here is the total
# column count from df.shape, so it includes every column in the file.
num_observations, num_features = df.shape
print(f"Number of observations: {num_observations}")
print(f"Number of features: {num_features}")

In [None]:
# Count how often each distinct value occurs in the 'Side' column.
side_counts = df['Side'].value_counts()
print("Value counts for 'Side' column:")
print(side_counts)

In [None]:
# Remove the 'Side' column in place; no later cell reads it.
df.drop('Side',axis=1,inplace=True)

In [None]:
# Display summary statistics for the remaining columns.
df.describe()

In [None]:
# Plot one histogram per float64 column (the body joint angles) on a grid
# that is five subplots wide.
numeric_columns = df.select_dtypes(include=['float64']).columns


colors = ['#2d288f', '#5342a5', '#735ebb', '#927bd1', '#b199e8', 
          '#f1cbf9', '#dca7f7', '#bf85f8', '#9666fa', '#554cff']

# Size the grid from the number of columns instead of assuming exactly ten,
# so an extra float column no longer falls outside the subplot grid.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(numeric_columns) // n_grid_cols))  # ceiling division

fig = make_subplots(rows=n_grid_rows, cols=n_grid_cols,
                    subplot_titles=[col.replace('_', ' ') for col in numeric_columns])

for i, col in enumerate(numeric_columns):
    grid_row, grid_col = divmod(i, n_grid_cols)
    fig.add_trace(
        go.Histogram(
            x=df[col],
            # Cycle through the palette so more columns than colours is safe.
            marker_color=colors[i % len(colors)],
            name=col.replace('_', ' '),
            nbinsx=20,
            showlegend=False,
        ),
        row=grid_row + 1, col=grid_col + 1
    )

# 250 px per grid row keeps the original 500 px height for the two-row case.
fig.update_layout(height=250 * n_grid_rows, width=1100, title_text="Histograms for all body joint angles")

fig.show()

In [None]:
# Box plots of every joint-angle column, grouped and coloured by label.
colors = ['#288f11', '#00b79d', '#7fd3e8', '#00a3ff', '#8c3aff']

# Long format: one row per (Label, column name, value), with underscores in
# the column names replaced by spaces so the facet titles read naturally.
df_melted = (
    df.melt(id_vars='Label', var_name='body joint angle', value_name='Values')
      .assign(**{
          'body joint angle': lambda long_df: long_df['body joint angle'].str.replace('_', ' ')
      })
)

fig = px.box(
    df_melted,
    x='Label',
    y='Values',
    color='Label',
    facet_col='body joint angle',
    facet_col_wrap=2,
    color_discrete_sequence=colors,
)
fig.update_layout(height=1000, width=1200, title="Box Plot for body joint angle by labels")
fig.show()

In [None]:
# Replace the 'Label' column with the encoder's integer codes. The fitted
# encoder is kept at module level so later cells can map codes back to the
# original label values with inverse_transform.
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

In [None]:
def preprocess_data(df, label_column):
    """Split a labelled frame into scaled train/test feature sets.

    The label column is separated from the features, the rows are split
    80/20 with a fixed seed, and the features are scaled using statistics
    learned from the training rows only: the first five feature columns are
    standardised and the next five are min-max scaled.  Only those ten
    columns are handed to the ``ColumnTransformer``.

    Args:
        df: DataFrame holding the feature columns plus ``label_column``.
        label_column: Name of the target column in ``df``.

    Returns:
        Tuple ``(X_train_scaled, y_train, X_test_scaled, y_test)``.  The two
        feature frames carry the original column names and the same row
        index as their matching label series.
    """
    X = df.drop(columns=[label_column])
    y = df[label_column]

    standardised_columns = X.columns[:5]
    minmax_columns = X.columns[5:10]
    scaled_column_names = np.concatenate([standardised_columns, minmax_columns])

    column_transformer = ColumnTransformer([
        ('standard_scaler', StandardScaler(), standardised_columns),
        ('minmax_scaler', MinMaxScaler(), minmax_columns),
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scalers on the training rows only, then reuse them for the
    # test rows so no test statistics leak into the scaling.
    # Keep each split's original index: the label series keep theirs, and a
    # fresh RangeIndex on the features would misalign any index-based join
    # between features and labels.
    X_train_scaled = pd.DataFrame(
        column_transformer.fit_transform(X_train),
        columns=scaled_column_names,
        index=X_train.index,
    )
    X_test_scaled = pd.DataFrame(
        column_transformer.transform(X_test),
        columns=scaled_column_names,
        index=X_test.index,
    )

    return X_train_scaled, y_train, X_test_scaled, y_test

In [None]:
# Build the scaled train/test splits, using the encoded 'Label' column as target.
X_train, y_train, X_test, y_test = preprocess_data(df, 'Label')

In [None]:
def train_classifiers(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a suite of baseline classifiers and rank them by test accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.
        random_state: Seed passed to every estimator that accepts one.

    Returns:
        DataFrame with columns ``Model``, ``Train Accuracy`` and
        ``Test Accuracy``, sorted by test accuracy in descending order.
    """
    seed = random_state

    # All estimators are built up front, then fitted in this order.
    candidates = [
        ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=seed)),
        ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
        ('Random Forest', RandomForestClassifier(random_state=seed)),
        ('Support Vector Machine', SVC(random_state=seed)),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('Gradient Boosting', GradientBoostingClassifier(random_state=seed)),
        ('Perceptron', Perceptron(random_state=seed)),
        ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
        ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=seed)),
        ('LightGBM', LGBMClassifier(random_state=seed, verbosity=-1)),
        ('CatBoost', CatBoostClassifier(silent=True, random_state=seed)),
    ]

    def _evaluate(label, estimator):
        """Fit one estimator and return its accuracy on both splits."""
        print(f"Training {label}...")
        estimator.fit(X_train, y_train)
        return {
            'Model': label,
            'Train Accuracy': accuracy_score(y_train, estimator.predict(X_train)),
            'Test Accuracy': accuracy_score(y_test, estimator.predict(X_test)),
        }

    scoreboard = pd.DataFrame([_evaluate(label, estimator) for label, estimator in candidates])
    ranked = scoreboard.sort_values('Test Accuracy', ascending=False)
    return ranked.reset_index(drop=True)

In [None]:
# Train every baseline model and collect the accuracy leaderboard.
results_df = train_classifiers(X_train, y_train, X_test, y_test)

In [None]:
# Display the leaderboard (sorted by test accuracy, best first).
results_df

In [None]:
def optimize_lightgbm(X_train_scaled, y_train, X_test_scaled, y_test):
    """Search LightGBM hyperparameters with Optuna, maximising accuracy.

    Runs 100 trials with a seeded TPE sampler.  Each trial fits an
    ``LGBMClassifier`` on the training data and is scored by its accuracy on
    ``X_test_scaled`` / ``y_test``.

    NOTE(review): because trials are scored on the data passed as
    ``X_test_scaled`` / ``y_test``, accuracy later reported on that same data
    is optimistically biased.  Consider passing a separate validation split.

    Args:
        X_train_scaled: Training features.
        y_train: Training labels; the number of distinct values sets
            ``num_class``.
        X_test_scaled: Features used to score each trial.
        y_test: Labels used to score each trial.

    Returns:
        The completed ``optuna`` study.
    """

    def objective(trial):
        """Fit one LightGBM configuration and return its accuracy."""
        # The suggestion order is kept stable so a seeded sampler draws the
        # same sequence of values from run to run.
        param = {
            'objective': 'multiclass',
            'num_class': len(set(y_train)),
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),
            'verbosity': -1,
            'num_leaves': trial.suggest_int('num_leaves', 70, 200),
            'max_depth': trial.suggest_int('max_depth', 6, 20),
            # suggest_float(log=True) replaces the deprecated suggest_loguniform.
            'learning_rate': trial.suggest_float('learning_rate', 0.1, 1.0, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 20, 400),
            # NOTE(review): LightGBM appears to apply `subsample` only when
            # `subsample_freq` > 0, so this dimension may have no effect --
            # confirm against the LightGBM parameter docs.
            'subsample': trial.suggest_float('subsample', 0.2, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 0.5, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 100, log=True),
        }

        model = LGBMClassifier(**param, random_state=42)
        model.fit(X_train_scaled, y_train)

        y_pred = model.predict(X_test_scaled)
        return accuracy_score(y_test, y_pred)

    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=100)

    return study

In [None]:
# Run the hyperparameter search and report the best trial's parameters.
study = optimize_lightgbm(X_train,y_train, X_test, y_test)
print("Best hyperparameters: ", study.best_params)

In [None]:
# Combine Optuna's optimisation-history and parameter-importance plots into
# one two-row figure, recolouring the traces as they are copied over.
fig = make_subplots(rows=2, cols=1, subplot_titles=("Optimization History", "Parameter Importances"))

scatter_color = 'blue'
line_color = 'green'
fig1 = plot_optimization_history(study)

for trace in fig1.data:
    if isinstance(trace, go.Scatter):
        # `mode` is None for a trace that leaves it unset; fall back to an
        # empty string so the membership tests cannot raise TypeError.
        trace_mode = trace.mode or ''
        if 'markers' in trace_mode:
            trace.update(marker=dict(color=scatter_color))
        elif 'lines' in trace_mode:
            trace.update(line=dict(color=line_color))

    fig.add_trace(trace, row=1, col=1)

fig2 = plot_param_importances(study)

bar_color = 'darkblue'

for trace in fig2.data:
    if isinstance(trace, go.Bar):
        trace.update(marker=dict(color=bar_color))
    # Hide the legend entries of the importance traces.
    trace.showlegend = False
    fig.add_trace(trace, row=2, col=1)

fig.update_layout(height=800, title_text="Optuna Study Results")
fig.show()

In [None]:
def plot_hyperparameter_contour(study, param1, param2, row, col):
    """Build a contour trace of trial scores over two hyperparameters.

    Args:
        study: Optuna study whose trials supply the points.
        param1: Name of the hyperparameter plotted on the x axis.
        param2: Name of the hyperparameter plotted on the y axis.
        row: Unused here; accepted so callers can pass the subplot position
            together with the parameter pair.
        col: Unused here; see ``row``.

    Returns:
        A ``go.Contour`` trace whose z values are the trial objective values.

    Raises:
        ValueError: If no trial has a value together with both parameters.
    """
    # Trials without a value or without one of the parameters (for example
    # failed ones) cannot be plotted and would break the min/max below.
    usable_trials = [
        trial for trial in study.trials
        if trial.value is not None
        and param1 in trial.params
        and param2 in trial.params
    ]
    if not usable_trials:
        raise ValueError(
            f"No trials with a value and both '{param1}' and '{param2}' to plot."
        )

    x = np.array([trial.params[param1] for trial in usable_trials])
    y = np.array([trial.params[param2] for trial in usable_trials])
    z = np.array([trial.value for trial in usable_trials])

    z_min, z_max = np.min(z), np.max(z)
    contour_levels = dict(showlabels=False)
    if z_max > z_min:
        # Twenty evenly spaced levels across the observed score range.  When
        # every score is equal the step would be zero, so Plotly's automatic
        # levels are used instead.
        contour_levels.update(start=z_min, end=z_max, size=(z_max - z_min) / 20)

    contour = go.Contour(
        x=x,
        y=y,
        z=z,
        colorscale='Blues',
        colorbar=dict(title="Accuracy"),
        contours=contour_levels,
        name=f"{param1} vs {param2}"
    )

    return contour

def plot_all_hyperparameter_contours(study):
    """Show a 2x2 grid of contour plots for four hyperparameter pairs.

    Args:
        study: Optuna study passed through to ``plot_hyperparameter_contour``
            for each pair.
    """
    param_pairs = [
        ('num_leaves', 'max_depth'),
        ('n_estimators', 'learning_rate'),
        ('subsample', 'colsample_bytree'),
        ('reg_alpha', 'reg_lambda'),
    ]
    # Subplot position for each pair, filled row by row.
    grid_cells = [(1, 1), (1, 2), (2, 1), (2, 2)]

    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=tuple(f"{first} vs {second}" for first, second in param_pairs),
    )

    for (first, second), (grid_row, grid_col) in zip(param_pairs, grid_cells):
        fig.add_trace(
            plot_hyperparameter_contour(study, first, second, grid_row, grid_col),
            row=grid_row,
            col=grid_col,
        )

    fig.update_layout(title="Hyperparameter Contour Plots", height=800, width=1000)
    fig.show()

In [None]:
# Render the 2x2 grid of hyperparameter contour plots for the finished study.
plot_all_hyperparameter_contours(study)

In [None]:
# Refit LightGBM with the best hyperparameters from the study, then score it
# on both splits and keep the test confusion matrix for plotting.
best_params = study.best_params

print("Re-training LightGBM...")
model = LGBMClassifier(random_state=42, **best_params)
model.fit(X_train, y_train)

# Predict the training split first, then the test split.
y_train_pred, y_test_pred = (model.predict(split) for split in (X_train, X_test))

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.5f}")
print(f"Test Accuracy: {test_accuracy:.5f}")

In [None]:
# Label the confusion matrix with the original class names and draw it as an
# annotated heatmap (rows: true labels, columns: predicted labels).
class_names = label_encoder.inverse_transform(range(confusion_matrix_test.shape[0]))
cm_df = pd.DataFrame(confusion_matrix_test, index=class_names, columns=class_names)

fig = ff.create_annotated_heatmap(
    z=cm_df.to_numpy(),
    x=cm_df.columns.tolist(),
    y=cm_df.index.tolist(),
    colorscale='Blues',
)
fig.update_layout(
    title='Confusion Matrix Heatmap',
    xaxis_title='Predicted Labels',
    yaxis_title='True Labels',
)
fig.show()

In [None]:
# Horizontal bar chart of the model's feature importances, scaled so that
# they sum to one and ordered so the largest bar is drawn at the top.
feature_importances = model.feature_importances_
features = X_train.columns

importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .assign(**{
        'Normalized Importance': lambda frame: frame['Importance'] / frame['Importance'].sum()
    })
    .sort_values(by='Normalized Importance', ascending=True)
)

importance_bars = go.Bar(
    x=importance_df['Normalized Importance'],
    y=importance_df['Feature'],
    orientation='h',
    marker=dict(color='darkblue'),
)

fig = go.Figure(data=[importance_bars])
fig.update_layout(
    title='Normalized Feature Importance',
    xaxis_title='Normalized Importance Score',
    yaxis_title='Features',
    showlegend=False,
)
fig.show()