In [4]:
!pip install pandas scikit-learn zxcvbn-python matplotlib seaborn joblib
!USE_GPU=1 pip install lightgbm
!pip install lightgbm --config-settings=--gpu


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

Arguments to --config-settings must be of the form KEY=VAL


In [5]:
import os
import time
import logging
import pandas as pd
import numpy as np
import joblib
import gc # Garbage Collector
import json # For saving metrics
import decimal # Import decimal to check type if needed
import datetime # Import datetime to handle potential timedelta objects

print("DEBUG: Basic imports done (os, time, logging, pd, np, joblib, gc, json, decimal, datetime).")

DEBUG: Basic imports done (os, time, logging, pd, np, joblib, gc, json, decimal, datetime).


In [6]:
# --- Import Core Libraries with Error Handling ---
try:
    from zxcvbn import zxcvbn
    print("DEBUG: Imported zxcvbn successfully.")
except ImportError as e:
    print(f"DEBUG: FATAL - FAILED to import zxcvbn: {e}")
    exit()

try:
    import lightgbm as lgb
    print("DEBUG: Imported lightgbm successfully.")
except ImportError as e:
    print(f"DEBUG: FATAL - FAILED to import lightgbm: {e}")
    exit()
except Exception as e:
    print(f"DEBUG: FATAL - Unknown error importing lightgbm: {e}")
    exit()

try:
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
    print("DEBUG: Imported sklearn components successfully.")
except ImportError as e:
    print(f"DEBUG: FATAL - FAILED to import sklearn components: {e}")
    exit()

# --- Import Plotting Libraries ---
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    print("DEBUG: Imported matplotlib and seaborn successfully.")
    plotting_available = True
except ImportError as e:
    print(f"DEBUG: WARNING - FAILED to import plotting libraries (matplotlib/seaborn): {e}. Plotting will be disabled.")
    plt = None
    sns = None
    plotting_available = False

DEBUG: Imported zxcvbn successfully.
DEBUG: Imported lightgbm successfully.
DEBUG: Imported sklearn components successfully.
DEBUG: Imported matplotlib and seaborn successfully.


In [7]:
# --- Configuration ---
print("DEBUG: Setting up configuration...")
# Paths
DATA_PATH = "/kaggle/input/passwordrock/rockyou.txt"
OUTPUT_DIR = "/kaggle/working/" # Standard Kaggle output directory

# Sample Size
SAMPLE_SIZE = 150000
print(f"DEBUG: Using SAMPLE_SIZE = {SAMPLE_SIZE}")

LOG_FILE = os.path.join(OUTPUT_DIR, "password_strength_training.log")
MODEL_FILE = os.path.join(OUTPUT_DIR, "lightgbm_password_model.joblib")
FEATURE_NAMES_FILE = os.path.join(OUTPUT_DIR, "feature_names.joblib") # Will save remaining features
METRICS_FILE = os.path.join(OUTPUT_DIR, "training_metrics.json")
CONFUSION_MATRIX_FILE = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
FEATURE_IMPORTANCE_FILE = os.path.join(OUTPUT_DIR, "feature_importance.png") # Will show importance of remaining features
LEARNING_CURVES_FILE = os.path.join(OUTPUT_DIR, "learning_curves.png")

DEBUG: Setting up configuration...
DEBUG: Using SAMPLE_SIZE = 150000


In [8]:
# --- Model Parameters ---
LGBM_PARAMS = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 5,
    'boosting_type': 'gbdt',
    'learning_rate': 0.02,      # <<< REDUCED learning rate
    'num_leaves': 63,           # Keep for now, could reduce later if needed
    'max_depth': 10,            # Keep for now, could reduce later if needed
    'feature_fraction': 0.8,    # Keep subsampling
    'bagging_fraction': 0.8,    # Keep subsampling
    'bagging_freq': 5,
    'lambda_l1': 0.3,           # <<< INCREASED L1 regularization
    'lambda_l2': 0.3,           # <<< INCREASED L2 regularization
    'class_weight': 'balanced', # Keep balanced weights
    # 'min_child_samples': 20,  # Optional: Add minimum samples per leaf
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42,
    'device': 'gpu',
    'gpu_use_dp': False
}
print(f"DEBUG: LGBM Params configured. Device set to: {LGBM_PARAMS.get('device', 'cpu')}")
print(f"DEBUG: Reduced learning_rate to {LGBM_PARAMS.get('learning_rate')} and increased L1/L2 regularization to {LGBM_PARAMS.get('lambda_l1')} to reduce overfitting.")
# -------------------------------------------------------------

DEBUG: LGBM Params configured. Device set to: gpu
DEBUG: Reduced learning_rate to 0.02 and increased L1/L2 regularization to 0.3 to reduce overfitting.


In [9]:
# Training Configuration
VALIDATION_SIZE = 0.2 # 20% Val, 20% Test -> 40% total held out from initial X
EARLY_STOPPING_ROUNDS = 50 # Keep early stopping
RANDOM_STATE = 42

# Define Features to Drop (Keep the same selection)
FEATURES_TO_DROP = [
    'has_spatial_match',
    'sequence_length',
    'sequence_space',
    'has_l33t_match',
    'has_date_match',
    'has_sequence',
    'count_upper'
]
print(f"DEBUG: Defined features to drop: {FEATURES_TO_DROP}")

DEBUG: Defined features to drop: ['has_spatial_match', 'sequence_length', 'sequence_space', 'has_l33t_match', 'has_date_match', 'has_sequence', 'count_upper']


In [10]:
# Create output directory
try:
    print(f"DEBUG: Checking/creating output directory: {OUTPUT_DIR}")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"DEBUG: Output directory exists: {os.path.exists(OUTPUT_DIR)}")
except Exception as e:
    print(f"DEBUG: FATAL - FAILED to create output directory {OUTPUT_DIR}: {e}")
    exit()

DEBUG: Checking/creating output directory: /kaggle/working/
DEBUG: Output directory exists: True


In [11]:
# --- Logging Setup ---
try:
    print("DEBUG: Configuring logging...")
    for handler in logging.root.handlers[:]: logging.root.removeHandler(handler)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[ logging.FileHandler(LOG_FILE, mode='w'), logging.StreamHandler() ]
    )
    logging.info("--- Starting Password Strength Model Training ---")
    logging.info(f"Output directory: {OUTPUT_DIR}")
    logging.info(f"Using data path: {DATA_PATH}")
    logging.info(f"Sample size: {SAMPLE_SIZE}")
    logging.info(f"LGBM Params: {LGBM_PARAMS}") # Log updated params
    logging.info(f"Features to drop: {FEATURES_TO_DROP}")
    print("DEBUG: Logging configured successfully.")
except Exception as e:
    print(f"DEBUG: WARNING - FAILED to configure logging: {e}. File logging disabled.")

2025-04-01 18:18:56,149 - INFO - --- Starting Password Strength Model Training ---
2025-04-01 18:18:56,150 - INFO - Output directory: /kaggle/working/
2025-04-01 18:18:56,151 - INFO - Using data path: /kaggle/input/passwordrock/rockyou.txt
2025-04-01 18:18:56,151 - INFO - Sample size: 150000
2025-04-01 18:18:56,152 - INFO - LGBM Params: {'objective': 'multiclass', 'metric': 'multi_logloss', 'num_class': 5, 'boosting_type': 'gbdt', 'learning_rate': 0.02, 'num_leaves': 63, 'max_depth': 10, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'lambda_l1': 0.3, 'lambda_l2': 0.3, 'class_weight': 'balanced', 'verbose': -1, 'n_jobs': -1, 'seed': 42, 'device': 'gpu', 'gpu_use_dp': False}
2025-04-01 18:18:56,153 - INFO - Features to drop: ['has_spatial_match', 'sequence_length', 'sequence_space', 'has_l33t_match', 'has_date_match', 'has_sequence', 'count_upper']


DEBUG: Configuring logging...
DEBUG: Logging configured successfully.


In [12]:
# --- Helper Functions ---

def load_data(filepath, sample_size=None, encoding='latin-1'):
    """Loads passwords from a text file, optionally sampling."""
    logging.info(f"Attempting to load data from {filepath}...")
    print(f"DEBUG: load_data called with filepath={filepath}, sample_size={sample_size}")
    start_time = time.time()
    passwords = []
    try:
        if not os.path.exists(filepath):
             logging.error(f"Error: Data file not found at {filepath}")
             print(f"DEBUG: File not found error for {filepath}")
             raise FileNotFoundError(f"Data file not found: {filepath}")

        with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
            if sample_size is not None:
                logging.info("Sampling enabled. Reading lines...")
                print("DEBUG: load_data - Sampling enabled, reading lines...")
                all_lines = f.readlines()
                stripped_lines = [line.strip() for line in all_lines if line.strip()]
                num_lines = len(stripped_lines)
                logging.info(f"Total non-empty lines read: {num_lines}")
                print(f"DEBUG: load_data - Total non-empty lines: {num_lines}")

                if num_lines == 0:
                     logging.warning("File contains no non-empty lines.")
                     print("DEBUG: load_data - WARNING: No non-empty lines found.")
                     return pd.DataFrame(columns=['password'])

                actual_sample_size = min(sample_size, num_lines)
                if actual_sample_size < sample_size:
                    logging.warning(f"Requested sample size {sample_size} > available lines {num_lines}. Using {actual_sample_size}.")
                    print(f"DEBUG: load_data - Adjusting sample size to {actual_sample_size}")

                if actual_sample_size == 0:
                    logging.warning("Sample size is 0. Returning empty DataFrame.")
                    print("DEBUG: load_data - Sample size is 0.")
                    return pd.DataFrame(columns=['password'])

                if actual_sample_size == num_lines:
                     logging.info(f"Using all {num_lines} available lines.")
                     print("DEBUG: load_data - Using all available lines.")
                     passwords = stripped_lines
                else:
                    print(f"DEBUG: load_data - Sampling {actual_sample_size} lines using np.random.choice...")
                    indices = np.random.choice(num_lines, actual_sample_size, replace=False)
                    passwords = [stripped_lines[i] for i in indices]
                    logging.info(f"Sampled {len(passwords)} passwords.")
                    print(f"DEBUG: load_data - Actually sampled {len(passwords)} passwords.")
                del all_lines, stripped_lines
                gc.collect()

            else:
                logging.info("Loading full dataset (sample_size is None)...")
                print("DEBUG: load_data - Loading full dataset...")
                passwords = [line.strip() for line in f if line.strip()]

        duration = time.time() - start_time
        logging.info(f"Loaded {len(passwords)} passwords in {duration:.2f} seconds.")
        print(f"DEBUG: load_data - Loaded {len(passwords)} passwords in {duration:.2f}s.")
        if not passwords:
             logging.warning("No passwords were loaded. Check file content and encoding.")
             print("DEBUG: load_data - WARNING: No passwords loaded.")
             return pd.DataFrame(columns=['password'])
        return pd.DataFrame(passwords, columns=['password'])

    except FileNotFoundError: raise
    except MemoryError:
        logging.error(f"MemoryError loading data from {filepath}. Try reducing SAMPLE_SIZE.", exc_info=True)
        print(f"DEBUG: load_data - MemoryError. Reduce SAMPLE_SIZE.")
        raise
    except Exception as e:
        logging.error(f"Error during data loading from {filepath}: {e}", exc_info=True)
        print(f"DEBUG: load_data - Exception: {e}")
        raise

In [13]:
# --- Feature Engineering ---

def feature_engineer(df):
    """Generates features for each password using zxcvbn."""
    logging.info("Starting feature engineering with zxcvbn...")
    print("DEBUG: feature_engineer called.")
    if df.empty or 'password' not in df.columns:
        logging.error("Input DataFrame for feature engineering is empty or missing 'password' column.")
        print("DEBUG: feature_engineer - Input DataFrame empty or invalid.")
        return pd.DataFrame(), pd.Series(dtype='int'), []

    start_time = time.time()
    features = []
    processed_count = 0
    error_count = 0
    total_passwords = len(df)
    logging.info(f"Processing {total_passwords} passwords for feature engineering.")
    print(f"DEBUG: feature_engineer - Starting loop for {total_passwords} passwords.")

    print_interval = max(1, total_passwords // 20)

    for idx, password in enumerate(df['password']):
        if (idx + 1) % print_interval == 0:
             print(f"DEBUG: feature_engineer - Processing password {idx+1}/{total_passwords}")

        if pd.isna(password) or not isinstance(password, str): password = ""
        is_empty_flag = 1 if len(password) == 0 else 0

        try:
            zxcvbn_input = password if password else " "
            analysis = zxcvbn(zxcvbn_input)

            crack_time_seconds = analysis.get('crack_times_seconds', {}).get('offline_fast_hashing_1e10_per_second', 0.0)
            crack_time_float = 0.0
            try: crack_time_float = float(crack_time_seconds)
            except (ValueError, TypeError, OverflowError):
                if crack_time_seconds != 0.0: logging.warning(f"Could not convert crack_time_seconds '{crack_time_seconds}' to float at index {idx}. Using 0.0.")
            crack_time_log10 = np.log10(max(crack_time_float, 1e-12) + 1e-9)

            guesses_log10 = analysis.get('guesses_log10', 0.0)
            guesses_log10_float = 0.0
            try: guesses_log10_float = float(guesses_log10)
            except (ValueError, TypeError, OverflowError):
                 if guesses_log10 != 0.0: logging.warning(f"Could not convert guesses_log10 '{guesses_log10}' to float at index {idx}. Using 0.0.")

            calc_time_value = analysis.get('calc_time', 0.0)
            calc_time_ms_float = 0.0
            if isinstance(calc_time_value, (int, float, decimal.Decimal)):
                try: calc_time_ms_float = float(calc_time_value) * 1000.0
                except (ValueError, TypeError, OverflowError):
                     if calc_time_value != 0.0: logging.warning(f"Could not convert numeric calc_time '{calc_time_value}' to ms float at index {idx}.")
            elif hasattr(calc_time_value, 'total_seconds'):
                 try: calc_time_ms_float = float(calc_time_value.total_seconds()) * 1000.0
                 except Exception as e: logging.warning(f"Could not convert timedelta-like calc_time '{calc_time_value}' to ms float at index {idx}: {e}")
            else:
                 if calc_time_value != 0.0: logging.warning(f"Unexpected type for calc_time '{type(calc_time_value)}' value '{calc_time_value}' at index {idx}. Using 0.0 ms.")

            current_sequence = analysis.get('sequence', [])
            sequence_matches = [m for m in current_sequence if m.get('pattern') == 'sequence']
            has_sequence_val = int(bool(sequence_matches))
            sequence_length_val, sequence_space_val = (len(sequence_matches[0].get('token', '')), sequence_matches[0].get('sequence_space', 0)) if sequence_matches else (0, 0)

            count_lower_val = sum(1 for char in password if char.islower())
            count_upper_val = sum(1 for char in password if char.isupper())
            count_digit_val = sum(1 for char in password if char.isdigit())
            count_symbol_val = len(password) - (count_lower_val + count_upper_val + count_digit_val)

            zxcvbn_score_val = 0 if is_empty_flag == 1 else analysis.get('score', -1)

            feat = {
                'password_length': len(password),
                'zxcvbn_score': zxcvbn_score_val,
                'guesses_log10': guesses_log10_float,
                'crack_time_log10': crack_time_log10,
                'calc_time_ms': calc_time_ms_float,
                'has_sequence': has_sequence_val,
                'sequence_length': sequence_length_val,
                'sequence_space': sequence_space_val,
                'has_dictionary_match': int(any(m.get('pattern') == 'dictionary' for m in current_sequence)),
                'has_spatial_match': int(any(m.get('pattern') == 'spatial' for m in current_sequence)),
                'has_repeat_match': int(any(m.get('pattern') == 'repeat' for m in current_sequence)),
                'has_date_match': int(any(m.get('pattern') == 'date' for m in current_sequence)),
                'has_l33t_match': int(any(m.get('l33t', False) for m in current_sequence)),
                'count_lower': count_lower_val,
                'count_upper': count_upper_val,
                'count_digit': count_digit_val,
                'count_symbol': count_symbol_val,
                'is_empty': is_empty_flag
            }
            features.append(feat)
            processed_count += 1

        except OverflowError as ofe:
             logging.warning(f"OverflowError during zxcvbn analysis for password index {idx} ('{str(password)[:20]}...'): {ofe}. Skipping.")
             print(f"DEBUG: feature_engineer - WARNING: OverflowError for index {idx}: {ofe}")
             error_count += 1
             continue
        except Exception as e:
            logging.warning(f"Unexpected error during feature engineering for password index {idx} ('{str(password)[:20]}...'): {type(e).__name__}: {e}", exc_info=False)
            print(f"DEBUG: feature_engineer - WARNING: Unexpected error for index {idx}: {type(e).__name__}: {e}")
            error_count += 1
            continue

    duration = time.time() - start_time
    logging.info(f"Feature engineering completed in {duration:.2f} seconds.")
    logging.info(f"Successfully processed: {processed_count}, Errors/Skipped: {error_count}")
    print(f"DEBUG: feature_engineer - Loop finished in {duration:.2f}s. Processed: {processed_count}, Errors/Skipped: {error_count}")

    if not features:
        logging.error("No features were generated.")
        print("DEBUG: feature_engineer - ERROR: No features generated.")
        return pd.DataFrame(), pd.Series(dtype='int'), []

    feature_df = pd.DataFrame(features)
    print("DEBUG: feature_engineer - Dtypes of generated feature_df:")
    print(feature_df.dtypes.value_counts())

    for col in feature_df.select_dtypes(include=['object']).columns:
        try:
            feature_df[col] = pd.to_numeric(feature_df[col])
            print(f"DEBUG: feature_engineer - Converted object column '{col}' to numeric.")
        except (ValueError, TypeError):
             logging.error(f"Column '{col}' has object type and could not be converted to numeric. Dropping.")
             print(f"DEBUG: feature_engineer - ERROR: Could not convert object column '{col}' to numeric. Dropping.")
             feature_df = feature_df.drop(columns=[col])

    if 'zxcvbn_score' not in feature_df.columns or feature_df['zxcvbn_score'].isnull().all():
        logging.error("Feature engineering resulted in DataFrame missing target 'zxcvbn_score' or target is all NaN.")
        print("DEBUG: feature_engineer - ERROR: Resulting DataFrame missing target or target is all NaN.")
        return pd.DataFrame(), pd.Series(dtype='int'), []

    initial_rows = len(feature_df)
    feature_df = feature_df[feature_df['zxcvbn_score'] >= 0].copy()
    rows_after_filter = len(feature_df)
    if rows_after_filter < initial_rows:
        logging.warning(f"Filtered out {initial_rows - rows_after_filter} rows with invalid zxcvbn scores (< 0).")
        print(f"DEBUG: feature_engineer - Filtered {initial_rows - rows_after_filter} rows with invalid scores.")

    if feature_df.empty:
        logging.error("All processed passwords resulted in invalid zxcvbn scores or were filtered out.")
        print("DEBUG: feature_engineer - ERROR: DataFrame empty after filtering invalid scores.")
        return pd.DataFrame(), pd.Series(dtype='int'), []

    y = feature_df['zxcvbn_score'].astype(int)
    X = feature_df.drop(columns=['zxcvbn_score'])

    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    non_numeric_cols = list(set(X.columns) - set(numeric_cols))
    if non_numeric_cols:
        logging.error(f"Non-numeric columns found in features before returning: {non_numeric_cols}. Dropping them.")
        print(f"DEBUG: feature_engineer - ERROR: Dropping non-numeric columns: {non_numeric_cols}")
        X = X[numeric_cols]

    all_feature_names = list(X.columns)
    if not all_feature_names:
         logging.error("No valid numeric features remaining after processing.")
         print("DEBUG: feature_engineer - ERROR: No numeric features left.")
         return pd.DataFrame(), pd.Series(dtype='int'), []

    logging.info(f"Generated {len(all_feature_names)} features initially: {all_feature_names}")
    print(f"DEBUG: feature_engineer - Returning X shape: {X.shape}, y shape: {y.shape}")

    return X, y, all_feature_names

In [14]:
# --- LightGBM Definition ---

def train_lightgbm(X_train, y_train, X_val, y_val, feature_names, params):
    """Trains the LightGBM model with GPU support and early stopping."""
    logging.info("Starting LightGBM training...")
    logging.info(f"Training with {len(feature_names)} features: {feature_names}")
    print(f"DEBUG: train_lightgbm called. Train shape: {X_train.shape}, Val shape: {X_val.shape}")
    logging.info(f"Using parameters: {params}")
    print(f"DEBUG: train_lightgbm - Params: {params}")

    start_time = time.time()

    gpu_available = False
    if params.get('device') == 'gpu':
        try:
            print("DEBUG: train_lightgbm - Verifying GPU availability with dummy data...")
            num_features = X_train.shape[1]
            if num_features == 0: raise ValueError("X_train has 0 features.")
            dummy_data = np.random.rand(10, num_features).astype(np.float32)
            dummy_labels = np.random.randint(0, params['num_class'], 10).astype(np.float32)
            lgb.Dataset(dummy_data, label=dummy_labels).construct()
            print("DEBUG: train_lightgbm - GPU seems available.")
            gpu_available = True
        except Exception as gpu_e:
            logging.warning(f"GPU check failed: {gpu_e}. Will attempt CPU fallback if GPU training fails.")
            print(f"DEBUG: train_lightgbm - WARNING: GPU pre-check failed: {gpu_e}")

    print("DEBUG: train_lightgbm - Checking data types before conversion...")
    print("X_train dtypes:\n", X_train.dtypes.value_counts())
    print("X_val dtypes:\n", X_val.dtypes.value_counts())
    try:
        X_train = X_train.astype(np.float32)
        X_val = X_val.astype(np.float32)
        y_train = y_train.astype(np.int32)
        y_val = y_val.astype(np.int32)
        print("DEBUG: train_lightgbm - Data types converted successfully.")
    except Exception as e:
        logging.error(f"ERROR during data type conversion before LightGBM Dataset creation: {e}", exc_info=True)
        print(f"DEBUG: train_lightgbm - ERROR during astype conversion: {e}")
        raise

    try:
        lgb_train = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train, feature_name=feature_names)
        print("DEBUG: train_lightgbm - LGBM Datasets created.")
    except Exception as e:
        logging.error(f"ERROR creating LightGBM Datasets: {e}", exc_info=True)
        print(f"DEBUG: train_lightgbm - ERROR creating LightGBM Datasets: {e}")
        raise

    evals_result = {}
    callbacks = [
        lgb.log_evaluation(period=50), # Log every 50 rounds
        lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True),
        lgb.record_evaluation(evals_result)
    ]
    print("DEBUG: train_lightgbm - Callbacks defined.")

    model = None
    current_params = params.copy()
    try:
        print(f"DEBUG: train_lightgbm - Attempting lgb.train with device='{current_params.get('device')}'...")
        # Increase num_boost_round significantly due to lower learning rate
        model = lgb.train(
            current_params, lgb_train, num_boost_round=20000, # Increased max rounds
            valid_sets=[lgb_train, lgb_val], valid_names=['train', 'val'],
            callbacks=callbacks
        )
        print(f"DEBUG: train_lightgbm - lgb.train completed successfully on '{current_params.get('device')}'.")

    except Exception as e:
        logging.error(f"LightGBM training failed on '{current_params.get('device')}': {e}", exc_info=True)
        print(f"DEBUG: train_lightgbm - EXCEPTION during lgb.train on '{current_params.get('device')}': {e}")
        if current_params.get('device') == 'gpu':
             logging.warning("GPU training failed. Attempting fallback to CPU.")
             print("DEBUG: train_lightgbm - GPU failure detected, attempting CPU fallback.")
             current_params['device'] = 'cpu'
             current_params.pop('gpu_device_id', None); current_params.pop('gpu_platform_id', None); current_params.pop('gpu_use_dp', None)
             logging.info(f"Retrying with CPU using parameters: {current_params}")
             print(f"DEBUG: train_lightgbm - Retrying with CPU params: {current_params}")
             try:
                 model = lgb.train(
                    current_params, lgb_train, num_boost_round=20000, # Increased max rounds
                    valid_sets=[lgb_train, lgb_val], valid_names=['train', 'val'],
                    callbacks=callbacks
                )
                 print("DEBUG: train_lightgbm - lgb.train completed successfully on CPU fallback.")
             except Exception as cpu_e:
                 logging.error(f"LightGBM training failed on CPU fallback as well: {cpu_e}", exc_info=True)
                 print(f"DEBUG: train_lightgbm - EXCEPTION during CPU fallback lgb.train: {cpu_e}")
                 raise cpu_e
        else:
            raise e

    if model is None:
        logging.error("Model training did not complete successfully.")
        print("DEBUG: train_lightgbm - ERROR: Model object is None after training block.")
        raise RuntimeError("LightGBM model training failed to produce a model.")

    duration = time.time() - start_time
    logging.info(f"LightGBM training completed in {duration:.2f} seconds.")
    logging.info(f"Best iteration: {model.best_iteration}")

    best_score_dict = model.best_score
    metric_key = params['metric']
    if isinstance(metric_key, list): metric_key = metric_key[0]
    if best_score_dict and 'val' in best_score_dict and metric_key in best_score_dict['val']:
        best_val_score = best_score_dict['val'][metric_key]
        logging.info(f"Best validation score ({metric_key}): {best_val_score:.4f}")
    else:
        logging.warning(f"Could not retrieve best validation score for metric '{metric_key}' from model.")
        print(f"DEBUG: train_lightgbm - model.best_score content: {best_score_dict}")

    print(f"DEBUG: train_lightgbm - Training finished in {duration:.2f}s.")
    return model, evals_result

In [15]:
# --- Model Evaluation ---

def evaluate_model(model, X_test, y_test, feature_names, params, output_dir):
    """Evaluates the model and saves metrics and plots."""
    logging.info("Evaluating model on the test set...")
    logging.info(f"Evaluating with {len(feature_names)} features: {feature_names}")
    print(f"DEBUG: evaluate_model called. Test shape: {X_test.shape}")
    start_time = time.time()

    try:
        if set(X_test.columns) != set(feature_names):
             logging.warning(f"Columns mismatch between X_test ({X_test.columns.tolist()}) and feature_names ({feature_names}). Realigning.")
             print(f"DEBUG: evaluate_model - Realigning X_test columns.")
             for col in feature_names:
                 if col not in X_test.columns: X_test[col] = 0.0
             X_test = X_test[feature_names]

        X_test = X_test.astype(np.float32)
        y_test = y_test.astype(np.int32)
    except Exception as e:
        logging.error(f"ERROR converting test data types or aligning columns: {e}", exc_info=True)
        print(f"DEBUG: evaluate_model - ERROR converting/aligning test data: {e}")
        raise

    print("DEBUG: evaluate_model - Predicting probabilities...")
    y_pred_proba = model.predict(X_test, num_iteration=model.best_iteration)
    print("DEBUG: evaluate_model - Predicting classes...")
    y_pred = np.argmax(y_pred_proba, axis=1)

    print("DEBUG: evaluate_model - Calculating metrics...")
    num_classes = params['num_class']
    class_labels = range(num_classes)
    target_names = [f'Score {i}' for i in class_labels]

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0, labels=class_labels, target_names=target_names)
    try:
        logloss = log_loss(y_test, y_pred_proba, labels=class_labels)
    except ValueError as le:
        logging.warning(f"Could not calculate log_loss: {le}")
        print(f"DEBUG: evaluate_model - y_test unique values: {np.unique(y_test)}")
        print(f"DEBUG: evaluate_model - y_pred_proba shape: {y_pred_proba.shape}")
        logloss = -1.0

    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    metrics = {
        'accuracy': accuracy,
        'log_loss': logloss,
        'classification_report': report,
        'confusion_matrix': cm.tolist()
    }

    logging.info(f"Test Set Evaluation:")
    logging.info(f"  Accuracy: {accuracy:.4f}")
    logging.info(f"  Log Loss: {logloss:.4f}")
    print(f"DEBUG: evaluate_model - Test Accuracy: {accuracy:.4f}, Log Loss: {logloss:.4f}")
    print(f"DEBUG: evaluate_model - Classification Report:\n{classification_report(y_test, y_pred, zero_division=0, labels=class_labels, target_names=target_names)}")

    if plotting_available:
        try:
            print("DEBUG: evaluate_model - Plotting confusion matrix...")
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=[f'Pred {i}' for i in class_labels],
                        yticklabels=[f'True {i}' for i in class_labels])
            plt.xlabel('Predicted Label'); plt.ylabel('True Label'); plt.title('Confusion Matrix')
            plt.tight_layout(); plt.savefig(CONFUSION_MATRIX_FILE); plt.close()
            logging.info(f"Saved confusion matrix to {CONFUSION_MATRIX_FILE}")
            print(f"DEBUG: evaluate_model - Saved confusion matrix to {CONFUSION_MATRIX_FILE}")
        except Exception as plot_e:
            logging.error(f"Failed to plot/save confusion matrix: {plot_e}", exc_info=True)
            print(f"DEBUG: evaluate_model - ERROR plotting/saving confusion matrix: {plot_e}")

        try:
            print("DEBUG: evaluate_model - Plotting feature importance...")
            if not feature_names:
                 print("DEBUG: evaluate_model - No feature names available, skipping importance plot.")
                 logging.warning("Skipping feature importance plot as feature_names list is empty.")
            else:
                plt.figure(figsize=(10, max(5, len(feature_names) // 2)))
                lgb.plot_importance(model, max_num_features=len(feature_names), importance_type='gain')
                plt.title('LightGBM Feature Importance (Gain)')
                plt.tight_layout()
                plt.savefig(FEATURE_IMPORTANCE_FILE); plt.close()
                logging.info(f"Saved feature importance plot to {FEATURE_IMPORTANCE_FILE}")
                print(f"DEBUG: evaluate_model - Saved feature importance to {FEATURE_IMPORTANCE_FILE}")
        except Exception as plot_e:
            logging.error(f"Failed to plot/save feature importance: {plot_e}", exc_info=True)
            print(f"DEBUG: evaluate_model - ERROR plotting/saving feature importance: {plot_e}")
    else:
        logging.warning("Plotting libraries not available. Skipping plot generation.")
        print("DEBUG: evaluate_model - Skipping plots as libraries are missing.")

    duration = time.time() - start_time
    logging.info(f"Evaluation completed in {duration:.2f} seconds.")
    print(f"DEBUG: evaluate_model - Evaluation finished in {duration:.2f}s.")
    return metrics

In [16]:
# --- Saving Training Data & Metrics ---

def save_artifacts(model, feature_names, metrics, output_dir):
    """Saves the trained model, feature names used, and metrics."""
    logging.info("Saving training artifacts...")
    print("DEBUG: save_artifacts called.")
    try:
        print(f"DEBUG: save_artifacts - Saving model to {MODEL_FILE}")
        joblib.dump(model, MODEL_FILE)
        logging.info(f"Model saved to {MODEL_FILE}")

        print(f"DEBUG: save_artifacts - Saving {len(feature_names)} used feature names to {FEATURE_NAMES_FILE}")
        joblib.dump(feature_names, FEATURE_NAMES_FILE)
        logging.info(f"Used feature names saved to {FEATURE_NAMES_FILE}")

        print(f"DEBUG: save_artifacts - Saving metrics to {METRICS_FILE}")
        def convert_numpy(obj):
            if isinstance(obj, np.integer): return int(obj)
            elif isinstance(obj, np.floating): return float(obj)
            elif isinstance(obj, np.ndarray): return obj.tolist()
            elif isinstance(obj, (datetime.date, datetime.datetime)): return obj.isoformat()
            return obj

        with open(METRICS_FILE, 'w') as f:
            json.dump(metrics, f, indent=4, default=convert_numpy)
        logging.info(f"Metrics saved to {METRICS_FILE}")
        print("DEBUG: save_artifacts - Artifacts saved successfully.")

    except Exception as e:
        logging.error(f"Error saving artifacts: {e}", exc_info=True)
        print(f"DEBUG: save_artifacts - EXCEPTION: {e}")

In [17]:
# --- Plotting Learning Curves ---

def plot_learning_curves(evals_result, metric_key, output_file):
    """Plots the training and validation learning curves for the specified metric."""
    logging.info(f"Plotting learning curves for metric: {metric_key}...")
    print(f"DEBUG: plot_learning_curves called for metric {metric_key}.")
    if not evals_result:
        logging.warning("No evaluation results found to plot learning curves.")
        print("DEBUG: plot_learning_curves - No evals_result data.")
        return
    if not plotting_available:
         logging.warning("Plotting libraries not available. Skipping learning curve plot.")
         print("DEBUG: plot_learning_curves - Skipping plot as libraries are missing.")
         return
    try:
        plt.figure(figsize=(10, 6))
        print(f"DEBUG: plot_learning_curves - Plotting metric: {metric_key}")
        lgb.plot_metric(evals_result, metric=metric_key)
        plt.title(f'LightGBM Learning Curves ({metric_key})'); plt.ylabel('Metric Value'); plt.xlabel('Boosting Round')
        plt.legend(); plt.grid(True); plt.tight_layout()
        plt.savefig(output_file); plt.close()
        logging.info(f"Saved learning curves plot to {output_file}")
        print(f"DEBUG: plot_learning_curves - Saved plot to {output_file}")
    except Exception as e:
        logging.warning(f"Could not plot learning curves for metric {metric_key}: {e}", exc_info=True)
        print(f"DEBUG: plot_learning_curves - EXCEPTION plotting {metric_key}: {e}")

In [18]:
# --- Main Execution ---
if __name__ == "__main__":
    print("DEBUG: Entering main execution block (__name__ == '__main__').")
    overall_start_time = time.time()
    final_status = "FAILED"

    try:
        # 1. Load Data
        print("DEBUG: Main - Calling load_data...")
        df_passwords = load_data(DATA_PATH, sample_size=SAMPLE_SIZE)
        if df_passwords.empty: raise ValueError("Loaded password DataFrame is empty.")
        print(f"DEBUG: Main - load_data returned DataFrame with shape: {df_passwords.shape}")

        # 2. Feature Engineering (Generates all features initially, including 'is_empty')
        print("DEBUG: Main - Calling feature_engineer...")
        X_initial, y, initial_feature_names = feature_engineer(df_passwords)
        if X_initial.empty or y.empty or not initial_feature_names: raise ValueError("Feature engineering produced empty/invalid results.")
        print(f"DEBUG: Main - feature_engineer returned X_initial shape: {X_initial.shape}, y shape: {y.shape}")
        del df_passwords; gc.collect()
        print("DEBUG: Main - df_passwords deleted, garbage collected.")

        # Apply Feature Selection *before* splitting
        logging.info(f"Applying feature selection. Dropping: {FEATURES_TO_DROP}")
        print(f"DEBUG: Main - Applying feature selection before splitting. Dropping: {FEATURES_TO_DROP}")
        feature_names_used = [f for f in initial_feature_names if f not in FEATURES_TO_DROP]
        # Ensure 'is_empty' is kept if it wasn't explicitly dropped (it shouldn't be)
        if 'is_empty' not in feature_names_used and 'is_empty' in initial_feature_names:
             print("DEBUG: Main - Ensuring 'is_empty' feature is kept.")
             feature_names_used.append('is_empty')

        X = X_initial[feature_names_used].copy()
        del X_initial; gc.collect()
        print(f"DEBUG: Main - X shape after feature selection: {X.shape}")
        print(f"DEBUG: Main - Features used for splitting/training: {feature_names_used}")

        # 3. Data Splitting (using the *selected* features)
        print("DEBUG: Main - Splitting data with selected features...")
        if X.shape[0] <= 1 or X.shape[0] != y.shape[0]: raise ValueError(f"Invalid data shapes for splitting: X={X.shape}, y={y.shape}")
        min_samples_per_class = y.value_counts().min()
        if min_samples_per_class < 2: raise ValueError(f"Smallest class has only {min_samples_per_class} sample(s). Need >= 2 for stratified splitting.")
        print(f"DEBUG: Main - Smallest class count for stratification: {min_samples_per_class}")

        # Split Train (60%), Val (20%), Test (20%)
        X_train, X_test_val, y_train, y_test_val = train_test_split(
            X, y, test_size=VALIDATION_SIZE * 2, random_state=RANDOM_STATE, stratify=y
        )
        X_val, X_test, y_val, y_test = train_test_split(
            X_test_val, y_test_val, test_size=0.5, random_state=RANDOM_STATE, stratify=y_test_val
        )
        del X_test_val, y_test_val, X, y; gc.collect()
        print("DEBUG: Main - Data splitting complete.")
        logging.info(f"Data split complete: Train={X_train.shape[0]}, Val={X_val.shape[0]}, Test={X_test.shape[0]}")
        print(f"DEBUG: Main - Final Shapes: Train={X_train.shape}, Val={X_val.shape}, Test={X_test.shape}")

        # 4. Train Model (using selected features and adjusted params)
        print("DEBUG: Main - Calling train_lightgbm...")
        logging.info("Starting model training...")
        model, evals_result = train_lightgbm(X_train, y_train, X_val, y_val, feature_names_used, LGBM_PARAMS)
        print("DEBUG: Main - train_lightgbm returned.")

        # Plot learning curves
        metric_key_to_plot = LGBM_PARAMS.get('metric')
        if isinstance(metric_key_to_plot, list): metric_key_to_plot = metric_key_to_plot[0]
        elif metric_key_to_plot is None: metric_key_to_plot = 'multi_logloss'
        print(f"DEBUG: Main - Calling plot_learning_curves for metric '{metric_key_to_plot}'...")
        plot_learning_curves(evals_result, metric_key_to_plot, LEARNING_CURVES_FILE)
        print("DEBUG: Main - plot_learning_curves returned.")

        # 5. Evaluate Model (using selected features)
        print("DEBUG: Main - Calling evaluate_model...")
        test_metrics = evaluate_model(model, X_test, y_test, feature_names_used, LGBM_PARAMS, OUTPUT_DIR)
        print("DEBUG: Main - evaluate_model returned.")

        # 6. Save Artifacts (save the list of features *used*)
        print("DEBUG: Main - Calling save_artifacts...")
        save_artifacts(model, feature_names_used, test_metrics, OUTPUT_DIR)
        print("DEBUG: Main - save_artifacts returned.")
        logging.info(f"Feature importance plot for *used* features saved to {FEATURE_IMPORTANCE_FILE}.")
        print(f"DEBUG: Main - Review {FEATURE_IMPORTANCE_FILE} for importance of remaining features.")

        # --- Example Prediction & Confidence ---
        logging.info("\n--- Example Prediction ---")
        print("\nDEBUG: Main - Running example predictions...")
        example_passwords = ["password123", "Summer2024", "Tr0ub4dor&3", "P@$$w0rd!", "5uMM3r#2024*Q", "12345", "", "ÐÂÐÐ‹Ð†Ð‚Ð¡â"]
        example_df = pd.DataFrame(example_passwords, columns=['password'])

        # Feature engineer examples (generates all features, including 'is_empty')
        X_example_full, _, example_initial_features = feature_engineer(example_df.copy())

        if not X_example_full.empty:
             # Apply the SAME feature selection as used for training
             print(f"DEBUG: Main - Example features generated (full), shape: {X_example_full.shape}")
             print(f"DEBUG: Main - Applying feature selection to example data using: {feature_names_used}")
             cols_to_select = [f for f in feature_names_used if f in X_example_full.columns]
             X_example = X_example_full[cols_to_select].copy()
             print(f"DEBUG: Main - Example features after selection, shape: {X_example.shape}")

             # Align columns precisely
             print(f"DEBUG: Main - Aligning example features with training features used: {feature_names_used}")
             for col in feature_names_used:
                 if col not in X_example.columns:
                     print(f"DEBUG: Main - WARNING: Adding missing column '{col}' to example features with value 0.0.")
                     X_example[col] = 0.0
             cols_to_drop_extra = [col for col in X_example.columns if col not in feature_names_used]
             if cols_to_drop_extra:
                 print(f"DEBUG: Main - WARNING: Dropping extra columns found in example features: {cols_to_drop_extra}")
                 X_example = X_example.drop(columns=cols_to_drop_extra)

             # Ensure order and type
             X_example = X_example[feature_names_used]
             X_example = X_example.astype(np.float32)
             print(f"DEBUG: Main - Example features aligned. Predicting...")

             pred_probs = model.predict(X_example, num_iteration=model.best_iteration)
             pred_classes = np.argmax(pred_probs, axis=1)
             print("DEBUG: Main - Example predictions done.")

             # Map results back safely using original index
             results_map = {}
             valid_indices = X_example.index
             if len(valid_indices) == len(pred_classes):
                  for i, idx in enumerate(valid_indices):
                      pw = example_df.loc[idx, 'password']
                      results_map[pw] = {'pred_class': pred_classes[i], 'pred_probs': pred_probs[i]}
             else:
                  logging.warning("Mismatch between number of successfully processed example passwords and predictions.")
                  print("DEBUG: Main - WARNING: Mismatch in processed example password count and prediction count.")

             for pw in example_passwords: # Iterate original list
                 if pw in results_map:
                     predicted_class = results_map[pw]['pred_class']
                     probabilities = results_map[pw]['pred_probs']
                     confidence = probabilities[predicted_class]
                     logging.info(f"Password: '{pw}'")
                     logging.info(f"  Predicted Strength Score (0-4): {predicted_class}")
                     logging.info(f"  Confidence: {confidence:.4f}")
                     logging.info(f"  Class Probabilities (0-4): {[f'{p:.3f}' for p in probabilities]}")
                     print(f"DEBUG: Example - PW: '{pw}', Predicted: {predicted_class}, Confidence: {confidence:.4f}")
                 else:
                     logging.info(f"Password: '{pw}'")
                     logging.info("  Prediction: Failed (feature engineering error)")
                     print(f"DEBUG: Example - PW: '{pw}', Prediction: Failed")

                 # Add zxcvbn's direct feedback
                 try:
                     zxcvbn_pw = pw if pw else " "
                     zxcvbn_analysis = zxcvbn(zxcvbn_pw)
                     logging.info(f"  ZXCVBN Score: {zxcvbn_analysis['score']}")
                     crack_display = zxcvbn_analysis.get('crack_times_display', {}).get('offline_fast_hashing_1e10_per_second', 'N/A')
                     logging.info(f"  ZXCVBN Est. Crack Time (offline_fast): {crack_display}")
                     feedback = zxcvbn_analysis.get('feedback', {})
                     if feedback.get('warning'): logging.info(f"  ZXCVBN Warning: {feedback['warning']}")
                     if feedback.get('suggestions'): logging.info(f"  ZXCVBN Suggestions: {'; '.join(feedback['suggestions'])}")
                 except Exception as e:
                     logging.warning(f"Could not get zxcvbn details for example '{pw}': {e}")
                     print(f"DEBUG: Example - zxcvbn call failed for '{pw}': {e}")
                 logging.info("-" * 20)
        else:
            logging.warning("Could not generate features for any example passwords. Skipping example prediction.")
            print("DEBUG: Main - WARNING: Example feature generation failed for all examples.")

        final_status = "SUCCESS"

    # --- Exception Handling ---
    except FileNotFoundError as e:
        logging.error(f"CRITICAL ERROR: Input data file not found. {e}", exc_info=True)
        print(f"DEBUG: Main - CRITICAL: FileNotFoundError: {e}")
    except ValueError as e:
        logging.error(f"CRITICAL ERROR: Data processing or validation issue. {e}", exc_info=True)
        print(f"DEBUG: Main - CRITICAL: ValueError: {e}")
    except ImportError as e:
         logging.error(f"CRITICAL ERROR: Missing dependency. {e}.", exc_info=True)
         print(f"DEBUG: Main - CRITICAL: ImportError: {e}")
    except MemoryError as e:
        logging.error(f"CRITICAL ERROR: Out of Memory. {e}. Try reducing SAMPLE_SIZE.", exc_info=True)
        print(f"DEBUG: Main - CRITICAL: MemoryError: {e}")
    except Exception as e:
        logging.error(f"CRITICAL ERROR: An unexpected error occurred during main execution.", exc_info=True)
        print(f"DEBUG: Main - CRITICAL: An unexpected exception occurred: {type(e).__name__}: {e}")
        import traceback
        print("--- TRACEBACK ---"); traceback.print_exc(); print("--- END TRACEBACK ---")

    # --- Final Log ---
    finally:
        overall_duration = time.time() - overall_start_time
        logging.info(f"--- Training script finished with status: {final_status} in {overall_duration:.2f} seconds ---")
        print(f"\nDEBUG: --- Script finished in {overall_duration:.2f} seconds. Status: {final_status} ---")

2025-04-01 18:18:56,334 - INFO - Attempting to load data from /kaggle/input/passwordrock/rockyou.txt...
2025-04-01 18:18:56,343 - INFO - Sampling enabled. Reading lines...


DEBUG: Entering main execution block (__name__ == '__main__').
DEBUG: Main - Calling load_data...
DEBUG: load_data called with filepath=/kaggle/input/passwordrock/rockyou.txt, sample_size=150000
DEBUG: load_data - Sampling enabled, reading lines...


2025-04-01 18:19:01,623 - INFO - Total non-empty lines read: 17974447


DEBUG: load_data - Total non-empty lines: 17974447
DEBUG: load_data - Sampling 150000 lines using np.random.choice...


2025-04-01 18:19:02,519 - INFO - Sampled 150000 passwords.
2025-04-01 18:19:03,317 - INFO - Loaded 150000 passwords in 6.98 seconds.
2025-04-01 18:19:03,359 - INFO - Starting feature engineering with zxcvbn...
2025-04-01 18:19:03,360 - INFO - Processing 150000 passwords for feature engineering.


DEBUG: load_data - Actually sampled 150000 passwords.
DEBUG: load_data - Loaded 150000 passwords in 6.98s.
DEBUG: Main - load_data returned DataFrame with shape: (150000, 1)
DEBUG: Main - Calling feature_engineer...
DEBUG: feature_engineer called.
DEBUG: feature_engineer - Starting loop for 150000 passwords.
DEBUG: feature_engineer - Processing password 7500/150000
DEBUG: feature_engineer - Processing password 15000/150000
DEBUG: feature_engineer - Processing password 22500/150000
DEBUG: feature_engineer - Processing password 30000/150000
DEBUG: feature_engineer - Processing password 37500/150000
DEBUG: feature_engineer - Processing password 45000/150000
DEBUG: feature_engineer - Processing password 52500/150000
DEBUG: feature_engineer - Processing password 60000/150000
DEBUG: feature_engineer - Processing password 67500/150000
DEBUG: feature_engineer - Processing password 75000/150000
DEBUG: feature_engineer - Processing password 82500/150000
DEBUG: feature_engineer - Processing passw

2025-04-01 18:21:20,331 - INFO - Feature engineering completed in 136.97 seconds.
2025-04-01 18:21:20,332 - INFO - Successfully processed: 150000, Errors/Skipped: 0


DEBUG: feature_engineer - Processing password 150000/150000
DEBUG: feature_engineer - Loop finished in 136.97s. Processed: 150000, Errors/Skipped: 0


2025-04-01 18:21:21,029 - INFO - Generated 17 features initially: ['password_length', 'guesses_log10', 'crack_time_log10', 'calc_time_ms', 'has_sequence', 'sequence_length', 'sequence_space', 'has_dictionary_match', 'has_spatial_match', 'has_repeat_match', 'has_date_match', 'has_l33t_match', 'count_lower', 'count_upper', 'count_digit', 'count_symbol', 'is_empty']


DEBUG: feature_engineer - Dtypes of generated feature_df:
int64      15
float64     3
Name: count, dtype: int64
DEBUG: feature_engineer - Returning X shape: (150000, 17), y shape: (150000,)
DEBUG: Main - feature_engineer returned X_initial shape: (150000, 17), y shape: (150000,)


2025-04-01 18:21:21,344 - INFO - Applying feature selection. Dropping: ['has_spatial_match', 'sequence_length', 'sequence_space', 'has_l33t_match', 'has_date_match', 'has_sequence', 'count_upper']


DEBUG: Main - df_passwords deleted, garbage collected.
DEBUG: Main - Applying feature selection before splitting. Dropping: ['has_spatial_match', 'sequence_length', 'sequence_space', 'has_l33t_match', 'has_date_match', 'has_sequence', 'count_upper']
DEBUG: Main - X shape after feature selection: (150000, 10)
DEBUG: Main - Features used for splitting/training: ['password_length', 'guesses_log10', 'crack_time_log10', 'calc_time_ms', 'has_dictionary_match', 'has_repeat_match', 'count_lower', 'count_digit', 'count_symbol', 'is_empty']
DEBUG: Main - Splitting data with selected features...
DEBUG: Main - Smallest class count for stratification: 261


2025-04-01 18:21:21,607 - INFO - Data split complete: Train=90000, Val=30000, Test=30000
2025-04-01 18:21:21,608 - INFO - Starting model training...
2025-04-01 18:21:21,610 - INFO - Starting LightGBM training...
2025-04-01 18:21:21,610 - INFO - Training with 10 features: ['password_length', 'guesses_log10', 'crack_time_log10', 'calc_time_ms', 'has_dictionary_match', 'has_repeat_match', 'count_lower', 'count_digit', 'count_symbol', 'is_empty']
2025-04-01 18:21:21,611 - INFO - Using parameters: {'objective': 'multiclass', 'metric': 'multi_logloss', 'num_class': 5, 'boosting_type': 'gbdt', 'learning_rate': 0.02, 'num_leaves': 63, 'max_depth': 10, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'lambda_l1': 0.3, 'lambda_l2': 0.3, 'class_weight': 'balanced', 'verbose': -1, 'n_jobs': -1, 'seed': 42, 'device': 'gpu', 'gpu_use_dp': False}


DEBUG: Main - Data splitting complete.
DEBUG: Main - Final Shapes: Train=(90000, 10), Val=(30000, 10), Test=(30000, 10)
DEBUG: Main - Calling train_lightgbm...
DEBUG: train_lightgbm called. Train shape: (90000, 10), Val shape: (30000, 10)
DEBUG: train_lightgbm - Params: {'objective': 'multiclass', 'metric': 'multi_logloss', 'num_class': 5, 'boosting_type': 'gbdt', 'learning_rate': 0.02, 'num_leaves': 63, 'max_depth': 10, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'lambda_l1': 0.3, 'lambda_l2': 0.3, 'class_weight': 'balanced', 'verbose': -1, 'n_jobs': -1, 'seed': 42, 'device': 'gpu', 'gpu_use_dp': False}
DEBUG: train_lightgbm - Verifying GPU availability with dummy data...
DEBUG: train_lightgbm - GPU seems available.
DEBUG: train_lightgbm - Checking data types before conversion...
X_train dtypes:
 int64      7
float64    3
Name: count, dtype: int64
X_val dtypes:
 int64      7
float64    3
Name: count, dtype: int64
DEBUG: train_lightgbm - Data types converted su

2025-04-01 18:21:21,907 - ERROR - LightGBM training failed on 'gpu': Check failed: (best_split_info.left_count) > (0) at /usr/local/src/LightGBM/lightgbm-python/src/treelearner/serial_tree_learner.cpp, line 846 .
Traceback (most recent call last):
  File "<ipython-input-14-f7f306254a72>", line 64, in train_lightgbm
    model = lgb.train(
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/engine.py", line 307, in train
    booster.update(fobj=fobj)
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/basic.py", line 4135, in update
    _safe_call(
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/basic.py", line 296, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8"))
lightgbm.basic.LightGBMError: Check failed: (best_split_info.left_count) > (0) at /usr/local/src/LightGBM/lightgbm-python/src/treelearner/serial_tree_learner.cpp, line 846 .

2025-04-01 18:21:21,910 - INFO - Retrying with CPU using parameters: {'objective': 'multiclass', 'metric

Training until validation scores don't improve for 50 rounds
DEBUG: train_lightgbm - EXCEPTION during lgb.train on 'gpu': Check failed: (best_split_info.left_count) > (0) at /usr/local/src/LightGBM/lightgbm-python/src/treelearner/serial_tree_learner.cpp, line 846 .

DEBUG: train_lightgbm - GPU failure detected, attempting CPU fallback.
DEBUG: train_lightgbm - Retrying with CPU params: {'objective': 'multiclass', 'metric': 'multi_logloss', 'num_class': 5, 'boosting_type': 'gbdt', 'learning_rate': 0.02, 'num_leaves': 63, 'max_depth': 10, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'lambda_l1': 0.3, 'lambda_l2': 0.3, 'class_weight': 'balanced', 'verbose': -1, 'n_jobs': -1, 'seed': 42, 'device': 'cpu'}
Training until validation scores don't improve for 50 rounds
[50]	train's multi_logloss: 0.222834	val's multi_logloss: 0.222796
[100]	train's multi_logloss: 0.0454746	val's multi_logloss: 0.0455765
[150]	train's multi_logloss: 0.0108288	val's multi_logloss: 0.011098


2025-04-01 18:21:32,004 - INFO - LightGBM training completed in 10.39 seconds.
2025-04-01 18:21:32,005 - INFO - Best iteration: 343
2025-04-01 18:21:32,006 - INFO - Best validation score (multi_logloss): 0.0021
2025-04-01 18:21:32,010 - INFO - Plotting learning curves for metric: multi_logloss...


Early stopping, best iteration is:
[343]	train's multi_logloss: 0.000997068	val's multi_logloss: 0.00213135
DEBUG: train_lightgbm - lgb.train completed successfully on CPU fallback.
DEBUG: train_lightgbm - Training finished in 10.39s.
DEBUG: Main - train_lightgbm returned.
DEBUG: Main - Calling plot_learning_curves for metric 'multi_logloss'...
DEBUG: plot_learning_curves called for metric multi_logloss.
DEBUG: plot_learning_curves - Plotting metric: multi_logloss


2025-04-01 18:21:32,231 - INFO - Saved learning curves plot to /kaggle/working/learning_curves.png
2025-04-01 18:21:32,232 - INFO - Evaluating model on the test set...
2025-04-01 18:21:32,233 - INFO - Evaluating with 10 features: ['password_length', 'guesses_log10', 'crack_time_log10', 'calc_time_ms', 'has_dictionary_match', 'has_repeat_match', 'count_lower', 'count_digit', 'count_symbol', 'is_empty']


DEBUG: plot_learning_curves - Saved plot to /kaggle/working/learning_curves.png
DEBUG: Main - plot_learning_curves returned.
DEBUG: Main - Calling evaluate_model...
DEBUG: evaluate_model called. Test shape: (30000, 10)
DEBUG: evaluate_model - Predicting probabilities...


2025-04-01 18:21:32,898 - INFO - Test Set Evaluation:
2025-04-01 18:21:32,898 - INFO -   Accuracy: 0.9990
2025-04-01 18:21:32,899 - INFO -   Log Loss: 0.0021


DEBUG: evaluate_model - Predicting classes...
DEBUG: evaluate_model - Calculating metrics...
DEBUG: evaluate_model - Test Accuracy: 0.9990, Log Loss: 0.0021
DEBUG: evaluate_model - Classification Report:
              precision    recall  f1-score   support

     Score 0       0.72      0.74      0.73        53
     Score 1       1.00      1.00      1.00      8720
     Score 2       1.00      1.00      1.00      9239
     Score 3       1.00      1.00      1.00      5687
     Score 4       1.00      1.00      1.00      6301

    accuracy                           1.00     30000
   macro avg       0.94      0.95      0.95     30000
weighted avg       1.00      1.00      1.00     30000

DEBUG: evaluate_model - Plotting confusion matrix...


2025-04-01 18:21:33,157 - INFO - Saved confusion matrix to /kaggle/working/confusion_matrix.png
2025-04-01 18:21:33,355 - INFO - Saved feature importance plot to /kaggle/working/feature_importance.png
2025-04-01 18:21:33,355 - INFO - Evaluation completed in 1.12 seconds.
2025-04-01 18:21:33,356 - INFO - Saving training artifacts...


DEBUG: evaluate_model - Saved confusion matrix to /kaggle/working/confusion_matrix.png
DEBUG: evaluate_model - Plotting feature importance...
DEBUG: evaluate_model - Saved feature importance to /kaggle/working/feature_importance.png
DEBUG: evaluate_model - Evaluation finished in 1.12s.
DEBUG: Main - evaluate_model returned.
DEBUG: Main - Calling save_artifacts...
DEBUG: save_artifacts called.
DEBUG: save_artifacts - Saving model to /kaggle/working/lightgbm_password_model.joblib


2025-04-01 18:21:33,407 - INFO - Model saved to /kaggle/working/lightgbm_password_model.joblib
2025-04-01 18:21:33,411 - INFO - Used feature names saved to /kaggle/working/feature_names.joblib
2025-04-01 18:21:33,413 - INFO - Metrics saved to /kaggle/working/training_metrics.json
2025-04-01 18:21:33,414 - INFO - Feature importance plot for *used* features saved to /kaggle/working/feature_importance.png.
2025-04-01 18:21:33,414 - INFO - 
--- Example Prediction ---
2025-04-01 18:21:33,416 - INFO - Starting feature engineering with zxcvbn...
2025-04-01 18:21:33,417 - INFO - Processing 8 passwords for feature engineering.
2025-04-01 18:21:33,428 - INFO - Feature engineering completed in 0.01 seconds.
2025-04-01 18:21:33,429 - INFO - Successfully processed: 8, Errors/Skipped: 0
2025-04-01 18:21:33,435 - INFO - Generated 17 features initially: ['password_length', 'guesses_log10', 'crack_time_log10', 'calc_time_ms', 'has_sequence', 'sequence_length', 'sequence_space', 'has_dictionary_match', 

DEBUG: save_artifacts - Saving 10 used feature names to /kaggle/working/feature_names.joblib
DEBUG: save_artifacts - Saving metrics to /kaggle/working/training_metrics.json
DEBUG: save_artifacts - Artifacts saved successfully.
DEBUG: Main - save_artifacts returned.
DEBUG: Main - Review /kaggle/working/feature_importance.png for importance of remaining features.

DEBUG: Main - Running example predictions...
DEBUG: feature_engineer called.
DEBUG: feature_engineer - Starting loop for 8 passwords.
DEBUG: feature_engineer - Processing password 1/8
DEBUG: feature_engineer - Processing password 2/8
DEBUG: feature_engineer - Processing password 3/8
DEBUG: feature_engineer - Processing password 4/8
DEBUG: feature_engineer - Processing password 5/8
DEBUG: feature_engineer - Processing password 6/8
DEBUG: feature_engineer - Processing password 7/8
DEBUG: feature_engineer - Processing password 8/8
DEBUG: feature_engineer - Loop finished in 0.01s. Processed: 8, Errors/Skipped: 0
DEBUG: feature_engi

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

In [20]:
!zip -r /kaggle/working.zip /kaggle/working

from IPython.display import FileLink
FileLink("/kaggle/working.zip")

updating: kaggle/working/ (stored 0%)
updating: kaggle/working/password_strength_training.log (deflated 78%)
updating: kaggle/working/feature_importance.png (deflated 14%)
updating: kaggle/working/confusion_matrix.png (deflated 16%)
updating: kaggle/working/feature_names.joblib (deflated 23%)
updating: kaggle/working/lightgbm_password_model.joblib (deflated 68%)
updating: kaggle/working/training_metrics.json (deflated 78%)
updating: kaggle/working/learning_curves.png (deflated 12%)
updating: kaggle/working/.virtual_documents/ (stored 0%)
