# PROJECT PIPELINE: MACHINE LEARNING–BASED ANALYSIS AND COMPARISON OF SPECIFIC CAPACITANCE FROM CV AND GCD

## 1. ENVIRONMENT SETUP AND DEPENDENCIES

### 1.1 VIRTUAL ENVIRONMENT PACKAGES (REQUIREMENTS.TXT)

THE FOLLOWING PACKAGES MUST BE INSTALLED IN YOUR VENV TO RUN THIS PIPELINE:

- PANDAS>=1.5.0
- NUMPY>=1.23.0
- MATPLOTLIB>=3.6.0
- SEABORN>=0.12.0
- SCIKIT-LEARN>=1.2.0
- XGBOOST>=1.7.0
- LIGHTGBM>=3.3.0
- SHAP>=0.41.0
- SCIPY>=1.9.0

### 1.2 GLOBAL IMPORTS AND VISUALIZATION STANDARDS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats

# SILENCE EVERY WARNING CATEGORY FOR THE REST OF THE SESSION SO CELL OUTPUT STAYS CLEAN
warnings.simplefilter('ignore')

In [2]:
# -----------------------------------------------------------------------------
# GLOBAL VISUALIZATION STANDARDS
# -----------------------------------------------------------------------------
# SINGLE SOURCE OF TRUTH FOR THE FIGURE DEFAULTS APPLIED TO EVERY PLOT BELOW
_BOLD = 'bold'
_VISUAL_DEFAULTS = {
    # CANVAS
    'figure.figsize': (8, 6),
    'figure.dpi': 500,
    # AXES
    'axes.grid': True,
    'axes.labelweight': _BOLD,
    'axes.titleweight': _BOLD,
    # TEXT
    'font.weight': _BOLD,
    'font.size': 12,
}
plt.rcParams.update(_VISUAL_DEFAULTS)

## 2. DATA LOADING AND VALIDATION

### 2.1 LOAD DATASETS, CLEAN UNNAMED COLUMNS, AND DISPLAY SHAPE

In [4]:
def _drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """RETURNS df WITHOUT ANY COLUMN WHOSE NAME STARTS WITH 'Unnamed'."""
    # pandas LABELS HEADERLESS CSV COLUMNS 'Unnamed: N' (E.G. AN INDEX WRITTEN
    # BY to_csv); THEY CARRY NO FEATURE INFORMATION, SO THEY ARE FILTERED OUT.
    return df.loc[:, ~df.columns.str.contains(r'^Unnamed')]


def load_and_clean_data(cv_path: str, gcd_path: str) -> tuple:
    """
    LOADS CV AND GCD DATASETS, REMOVES UNNAMED COLUMNS FROM BOTH,
    AND DISPLAYS BASIC SHAPE AND HEAD INFO.

    PARAMETERS:
    - cv_path: STRING PATH TO CV CSV
    - gcd_path: STRING PATH TO GCD CSV

    RETURNS:
    - TUPLE OF (CV_DATAFRAME, GCD_DATAFRAME)

    NOTES:
    - RELIES ON THE NOTEBOOK-PROVIDED display() TO RENDER THE HEAD TABLES.
    - ERRORS RAISED BY pd.read_csv (E.G. A MISSING FILE) PROPAGATE UNCHANGED.
    """
    # BOTH FILES ARE READ BEFORE ANYTHING IS PRINTED, SO A BAD PATH FAILS
    # WITHOUT LEAVING A HALF-WRITTEN REPORT BEHIND.
    cv_df = _drop_unnamed_columns(pd.read_csv(cv_path))
    # THE GCD FRAME GETS THE SAME CLEANING AS THE CV FRAME; PREVIOUSLY ONLY
    # THE CV FRAME WAS FILTERED, CONTRADICTING THE DOCUMENTED CONTRACT.
    gcd_df = _drop_unnamed_columns(pd.read_csv(gcd_path))

    print("=== CV DATASET SHAPE ===")
    print(cv_df.shape)
    print("\n=== CV DATASET HEAD ===")
    display(cv_df.head())

    print("\n=== GCD DATASET SHAPE ===")
    print(gcd_df.shape)
    print("\n=== GCD DATASET HEAD ===")
    display(gcd_df.head())

    return cv_df, gcd_df

# EXECUTE DATA LOADING
# PATHS ARE RELATIVE TO THE NOTEBOOK'S WORKING DIRECTORY
CV_CSV_PATH = '../DATASET/DATA/AL203-1M-KOH-CV.csv'
GCD_CSV_PATH = '../DATASET/DATA/AL203-1M-KOH-GCD.csv'
cv_df, gcd_df = load_and_clean_data(cv_path=CV_CSV_PATH, gcd_path=GCD_CSV_PATH)

=== CV DATASET SHAPE ===
(8752, 5)

=== CV DATASET HEAD ===


Unnamed: 0,Scan_Rate,Potential,Current,Area,CS
0,100,-0.29984,-0.5105,2.41787,60.44685
1,100,-0.29885,-0.41119,2.41787,60.44685
2,100,-0.29786,-0.34692,2.41787,60.44685
3,100,-0.2938,-0.19546,2.41787,60.44685
4,100,-0.29284,-0.17072,2.41787,60.44685



=== GCD DATASET SHAPE ===
(3944, 5)

=== GCD DATASET HEAD ===


Unnamed: 0,Current_Density,Time,Potential,Discharge_Time,GCD_CS
0,5.0,-3.17e-07,-0.15928,2.88868,36.1085
1,5.0,0.1,-0.08898,2.88868,36.1085
2,5.0,0.2,-0.07416,2.88868,36.1085
3,5.0,0.3,-0.06734,2.88868,36.1085
4,5.0,0.4,-0.06165,2.88868,36.1085


### 2.2 VALIDATE MISSING VALUES, DUPLICATES, AND CS RANGE

In [5]:
def validate_data(df: pd.DataFrame, dataset_name: str, target_col: str):
    """
    PRINTS A DATA-QUALITY REPORT FOR ONE DATASET: TOTAL MISSING CELLS,
    FULLY DUPLICATED ROWS, AND, WHEN THE TARGET COLUMN EXISTS, ITS SUMMARY
    STATISTICS PLUS THE NUMBER OF VALUES WITH |Z-SCORE| ABOVE 3.

    PARAMETERS:
    - df: DATAFRAME TO INSPECT
    - dataset_name: LABEL SHOWN (UPPERCASED) IN THE REPORT HEADER
    - target_col: NAME OF THE TARGET (Cs) COLUMN

    RETURNS:
    - NONE; THE REPORT IS WRITTEN VIA print() AND display()
    """
    banner = '=' * 40
    print(f"\n{banner}")
    print(f"DATA VALIDATION: {dataset_name.upper()}")
    print(f"{banner}")

    # MISSING VALUES: NULL CELLS SUMMED OVER EVERY COLUMN
    n_missing = df.isnull().sum().sum()
    print(f"TOTAL MISSING VALUES: {n_missing}")

    # DUPLICATES: ROWS IDENTICAL TO AN EARLIER ROW
    n_duplicates = df.duplicated().sum()
    print(f"TOTAL DUPLICATE ROWS: {n_duplicates}")

    # NOTHING MORE TO REPORT WITHOUT THE TARGET COLUMN
    if target_col not in df.columns:
        print(f"TARGET COLUMN '{target_col}' NOT FOUND.")
        return

    target = df[target_col]
    target_label = target_col.upper()

    # VALIDATE CS (SPECIFIC CAPACITANCE)
    print(f"\n{target_label} (TARGET) SUMMARY STATISTICS:")
    display(target.describe())

    # OUTLIER DETECTION VIA Z-SCORE (NULLS EXCLUDED BEFORE STANDARDIZING)
    abs_z = np.abs(stats.zscore(target.dropna()))
    n_outliers = int(np.count_nonzero(abs_z > 3))
    print(f"DETECTED OUTLIERS IN {target_label} (Z > 3): {n_outliers}")

# RUN THE SAME VALIDATION REPORT ON EACH DATASET, CV FIRST
for _frame, _label, _target in (
    (cv_df, "CV Dataset", "CS"),
    (gcd_df, "GCD Dataset", "GCD_CS"),
):
    validate_data(_frame, _label, _target)


DATA VALIDATION: CV DATASET
TOTAL MISSING VALUES: 0
TOTAL DUPLICATE ROWS: 0

CS (TARGET) SUMMARY STATISTICS:


count    8752.000000
mean       71.759403
std         7.996925
min        60.446850
25%        64.487200
50%        70.618030
75%        78.711160
max        86.419420
Name: CS, dtype: float64

DETECTED OUTLIERS IN CS (Z > 3): 0

DATA VALIDATION: GCD DATASET
TOTAL MISSING VALUES: 0
TOTAL DUPLICATE ROWS: 0

GCD_CS (TARGET) SUMMARY STATISTICS:


count    3944.000000
mean       45.440003
std         3.206760
min        36.108500
25%        44.553500
50%        46.711120
75%        47.745620
max        47.745620
Name: GCD_CS, dtype: float64

DETECTED OUTLIERS IN GCD_CS (Z > 3): 0
