In [15]:
import pandas as pd
import numpy as np
import pickle
import os
import warnings

# File locations used by the prediction script.
MODEL_FILE = "../random_forest_model.pkl"   # pickled model to load
NEW_DATA_FILE = "../Keystrokes.csv"         # CSV holding the rows to classify
OUTPUT_FILE = "keystroke_predictions.csv"   # where the annotated rows are written

# The 31 feature columns, in the order they are handed to the model.
# Each row below groups the columns for one key: an "H.<key>" column followed
# by the "DD.<key>.<next>" and "UD.<key>.<next>" columns for the transition to
# the next key. The final key has no following key, so it only has "H".
# NOTE(review): H/DD/UD presumably stand for hold, down-down and up-down
# timings -- confirm against the dataset documentation.
FEATURE_NAMES = [
    "H.period", "DD.period.t", "UD.period.t",
    "H.t", "DD.t.i", "UD.t.i",
    "H.i", "DD.i.e", "UD.i.e",
    "H.e", "DD.e.five", "UD.e.five",
    "H.five", "DD.five.Shift.r", "UD.five.Shift.r",
    "H.Shift.r", "DD.Shift.r.o", "UD.Shift.r.o",
    "H.o", "DD.o.a", "UD.o.a",
    "H.a", "DD.a.n", "UD.a.n",
    "H.n", "DD.n.l", "UD.n.l",
    "H.l", "DD.l.Return", "UD.l.Return",
    "H.Return",
]

def _split_probabilities(model, probabilities):
    """Return the (genuine, imposter) probability columns of *probabilities*.

    scikit-learn orders the columns of ``predict_proba`` like
    ``model.classes_``, so the columns are looked up by label rather than by
    position. If the model exposes the 'Genuine'/'Imposter' labels but one of
    them is absent (a model fitted on a single class), that class gets a
    probability of 0.0. If neither label is exposed, the historical positional
    layout is used (column 0 = Genuine, column 1 = Imposter).
    """
    probabilities = np.asarray(probabilities)
    classes = getattr(model, 'classes_', None)
    labels = [] if classes is None else [str(label) for label in classes]

    if 'Genuine' in labels or 'Imposter' in labels:
        def column_for(label):
            if label in labels:
                return probabilities[:, labels.index(label)]
            return np.zeros(len(probabilities))

        return column_for('Genuine'), column_for('Imposter')

    return probabilities[:, 0], probabilities[:, 1]


def _save_and_report(output_df, banner):
    """Write *output_df* to OUTPUT_FILE and print the closing summary."""
    output_df.to_csv(OUTPUT_FILE, index=False)

    print(f"\n--- {banner} ---")
    print(f"Results saved to '{OUTPUT_FILE}'")
    print("\nPrediction counts:")
    print(output_df['Prediction'].value_counts(dropna=False))


def predict_new_data() -> None:
    """Classify every row of NEW_DATA_FILE and write the result to OUTPUT_FILE.

    The pickled model in MODEL_FILE is loaded and applied to the columns
    listed in FEATURE_NAMES. The output CSV contains the original columns
    plus 'Prediction', 'Probability_Genuine' and 'Probability_Imposter'.

    Fallbacks:
      * If any feature column is missing from the file, every row is written
        as 'Imposter' (0.0 genuine / 1.0 imposter) without calling the model.
      * A row whose features are not all finite numbers (missing,
        non-numeric, or +/-inf) is written as 'Imposter' in the same way.

    Problems (missing files, unreadable inputs, a file without data rows, a
    failing model) are reported on stdout and the function returns without
    writing an output file. Nothing is returned or raised to the caller.
    """
    if not os.path.exists(MODEL_FILE):
        print(f"Error: Model file '{MODEL_FILE}' not found.")
        print("Please update the 'MODEL_FILE' variable in this script to match your file's location.")
        return

    if not os.path.exists(NEW_DATA_FILE):
        print(f"Error: New data file '{NEW_DATA_FILE}' not found.")
        print("Please update the 'NEW_DATA_FILE' variable in this script to match your file's name.")
        return

    print(f"Loading model from '{MODEL_FILE}'...")
    try:
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load model files that come from a trusted source.
        with open(MODEL_FILE, 'rb') as f:
            model = pickle.load(f)
    except Exception as e:
        print(f"Error loading pickle file: {e}")
        return

    print(f"Loading new data from '{NEW_DATA_FILE}'...")
    try:
        new_data = pd.read_csv(NEW_DATA_FILE, header=0)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    # The output always starts from the untouched input rows, with every row
    # defaulted to 'Imposter'; rows the model can score are overwritten below.
    output_df = new_data.copy()
    # object dtype so that whatever label type the model emits can be stored.
    output_df['Prediction'] = pd.Series('Imposter', index=output_df.index, dtype=object)
    output_df['Probability_Genuine'] = 0.0
    output_df['Probability_Imposter'] = 1.0

    # --- 1. File-level check: are all required feature columns present? ---
    missing_cols = [col for col in FEATURE_NAMES if col not in new_data.columns]
    if missing_cols:
        print("\nWarning: The new data file is missing required feature columns:")
        print(missing_cols)
        print("Defaulting all predictions to 'Imposter' with 0.0 Genuine / 1.0 Imposter probability.")
        _save_and_report(output_df, "Prediction Complete (Defaulted)")
        return

    # Bail out before any preprocessing when there is nothing to classify.
    if new_data.empty:
        print("Error: No data rows found in the file.")
        return

    # --- 2. Row-level preprocessing ---
    print("Preprocessing data (applying to_numeric)...")
    # errors='coerce' turns any non-numeric cell into NaN.
    features = new_data[FEATURE_NAMES].apply(pd.to_numeric, errors='coerce')

    # A row is usable only if all features are finite. np.isfinite rejects
    # NaN as well as +/-inf; the latter would otherwise reach the model and
    # make the whole batch fail.
    finite_cells = np.isfinite(features.to_numpy(dtype=float))
    good_rows_mask = pd.Series(finite_cells.all(axis=1), index=features.index)

    # --- 3. Invalid rows keep the 'Imposter' default ---
    num_bad_rows = int((~good_rows_mask).sum())
    if num_bad_rows > 0:
        print(f"Warning: Found {num_bad_rows} rows with missing/invalid data. Defaulting them to 'Imposter'.")

    # --- 4. Valid rows are scored by the model ---
    num_good_rows = int(good_rows_mask.sum())
    if num_good_rows > 0:
        print(f"Making {num_good_rows} predictions on valid data rows...")
        valid_features = features.loc[good_rows_mask]

        try:
            predictions = model.predict(valid_features)
            probabilities = model.predict_proba(valid_features)
            genuine, imposter = _split_probabilities(model, probabilities)
        except Exception as e:
            print(f"Error during prediction: {e}")
            return

        output_df.loc[good_rows_mask, 'Prediction'] = predictions
        output_df.loc[good_rows_mask, 'Probability_Genuine'] = genuine
        output_df.loc[good_rows_mask, 'Probability_Imposter'] = imposter
    else:
        print("No valid data rows found to make predictions on.")

    # --- 5. Save results ---
    print("Saving results...")
    _save_and_report(output_df, "Prediction Complete")

if __name__ == "__main__":
    # Script entry point: install a blanket "ignore" filter for all warnings,
    # then run the prediction pipeline.
    warnings.simplefilter('ignore')
    predict_new_data()

Loading model from '../random_forest_model.pkl'...
Loading new data from '../Keystrokes.csv'...
Preprocessing data (applying to_numeric)...
Making 2 predictions on valid data rows...
Saving results...

--- Prediction Complete ---
Results saved to 'keystroke_predictions.csv'

Prediction counts:
Prediction
Imposter    1
Genuine     1
Name: count, dtype: int64
