In [2]:
!pip install psutil pyyaml tensorflow imbalanced-learn==0.9.1 xgboost==1.6.1
!pip install flask-cors | cat
import os
import psutil
import sys
import time
import logging
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Configure root logging for the notebook: INFO level, with each record laid
# out as "time - logger name - level - message".
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Named logger for messages emitted by this notebook.
logger = logging.getLogger('kaggle-notebook')

def print_memory_usage():
    """Print a two-line summary of system memory as reported by psutil.

    The first line shows the ``percent`` field of ``psutil.virtual_memory()``;
    the second shows the ``available`` and ``total`` fields converted from
    bytes to MB (1 MB = 1024 ** 2 bytes), each with one decimal place.
    """
    bytes_per_mb = 1024 ** 2
    vm = psutil.virtual_memory()
    available_mb = vm.available / bytes_per_mb
    total_mb = vm.total / bytes_per_mb
    print(f"Memory usage: {vm.percent}%")
    print(f"Available: {available_mb:.1f} MB / {total_mb:.1f} MB")

# Report memory headroom before any work starts.
print("\n=== Initial System Information ===")
print_memory_usage()

# Working-directory layout, created relative to the current directory.
# Directories that already exist are left untouched.
_WORK_DIRS = (
    "data/raw",
    "data/interim",
    "data/processed",
    "models/trained_models",
    "reports",
    "predictions",
    "deployment",
    "visualizations",
)

print("\n=== Setting up directories ===")
for dir_path in _WORK_DIRS:
    os.makedirs(dir_path, exist_ok=True)
    print(f"Directory {dir_path} ready")

# Add the dataset's project folder to the module search path.
sys.path.append('/kaggle/input/insider-threat-detection/Insider-threat-detection')

print("\n=== Checking for example data ===")
example_data_dir = "/kaggle/input/insider-threat-detection/Insider-threat-detection/example_data"
if not os.path.exists(example_data_dir):
    # No bundled examples: fabricate three small CSVs under data/raw instead.
    print("Example data not found. Using synthetic data.")

    print("Generating synthetic data...")
    try:
        import numpy as np

        # Fixed seed so the generated files are reproducible. The order of the
        # np.random.choice calls below determines the generated values.
        np.random.seed(42)
        n_users = 100
        n_days = 30
        n_records = n_users * n_days
        n_emails = n_records // 2
        n_file_events = n_records // 3

        # Pools the random draws sample from
        user_ids = [f"U{i:04d}" for i in range(n_users)]
        pc_names = [f"PC{i:03d}" for i in range(50)]
        file_names = [f"file_{i}.txt" for i in range(20)]

        # --- Log data -------------------------------------------------------
        # pd.date_range uses its default frequency here, so there is one
        # timestamp per record.
        log_data = {
            "timestamp": pd.date_range(start="2024-01-01", periods=n_records),
            "user_id": np.random.choice(user_ids, n_records),
            "pc": np.random.choice(pc_names, n_records),
            "activity": np.random.choice(["logon", "logoff", "connect", "disconnect"], n_records),
            "resource": np.random.choice(["file_server", "email", "web", "database"], n_records),
        }
        log_df = pd.DataFrame(log_data)
        os.makedirs("data/raw", exist_ok=True)
        log_df.to_csv("data/raw/synthetic_logs.csv", index=False)
        print(f"Created synthetic log data: {len(log_df)} records")

        # --- Email data -----------------------------------------------------
        email_data = {
            "timestamp": pd.date_range(start="2024-01-01", periods=n_emails),
            "user_id": np.random.choice(user_ids, n_emails),
            "recipient": np.random.choice(user_ids, n_emails),
            "subject": np.random.choice(
                ["Meeting", "Report", "Update", "Question", "Important"], n_emails
            ),
            "content": np.random.choice(
                ["Please review", "Let's discuss", "Confidential information",
                 "Project update", "Need your input"],
                n_emails,
            ),
        }
        email_df = pd.DataFrame(email_data)
        email_df.to_csv("data/raw/synthetic_emails.csv", index=False)
        print(f"Created synthetic email data: {len(email_df)} records")

        # --- File access data -----------------------------------------------
        file_data = {
            "timestamp": pd.date_range(start="2024-01-01", periods=n_file_events),
            "user_id": np.random.choice(user_ids, n_file_events),
            "filename": np.random.choice(file_names, n_file_events),
            "access_type": np.random.choice(["read", "write", "modify", "delete"], n_file_events),
            "file_path": np.random.choice(["/documents", "/reports", "/financial", "/hr"], n_file_events),
        }
        file_df = pd.DataFrame(file_data)
        file_df.to_csv("data/raw/synthetic_files.csv", index=False)
        print(f"Created synthetic file access data: {len(file_df)} records")
    except Exception as e:
        # Best effort: report the failure and let the notebook continue.
        print(f"Error generating synthetic data: {str(e)}")
else:
    import shutil

    print(f"Copying example data from {example_data_dir}")
    for filename in os.listdir(example_data_dir):
        src_path = os.path.join(example_data_dir, filename)
        dest_path = os.path.join("data/raw", filename)
        # Skip sub-directories and anything already present in data/raw.
        if not os.path.isfile(src_path) or os.path.exists(dest_path):
            continue
        shutil.copy(src_path, dest_path)
        print(f"Copied {filename} to data/raw/")

# Replacement for a bare ``plt.style.use(...)`` line: try the current seaborn
# style name, then the older one, then fall back to matplotlib's default plus
# seaborn's darkgrid. seaborn is imported inside the last fallback so the
# snippet does not depend on the patched file having imported it as ``sns``.
STYLE_FALLBACK = """\
# Update the deprecated style
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except Exception:
    try:
        plt.style.use('seaborn-darkgrid')
    except Exception:
        import seaborn as sns
        plt.style.use('default')
        sns.set_style('darkgrid')  # Fallback to basic seaborn style
"""


def patch_matplotlib_style(contents):
    """Return *contents* with its first ``plt.style.use`` line made fault-tolerant.

    Only the single line holding the call is replaced (by ``STYLE_FALLBACK``,
    re-indented to match that line). Everything before and after it is kept,
    so imports and code preceding the call are not lost.

    Args:
        contents: Full source text of the module to patch.

    Returns:
        The patched source, or *contents* unchanged when there is no
        ``plt.style.use`` call or the call does not start its own line.

    Note:
        Assumes the call fits on one physical line; a call whose arguments
        continue on following lines is not handled.
    """
    import textwrap

    call_pos = contents.find('plt.style.use')
    if call_pos == -1:
        return contents

    line_start = contents.rfind('\n', 0, call_pos) + 1
    indent = contents[line_start:call_pos]
    if indent.strip():
        # Something other than whitespace precedes the call on its line
        # (e.g. a comment marker or another statement): not safe to rewrite.
        return contents

    line_end = contents.find('\n', call_pos)
    # The call may be the last line of a file that lacks a trailing newline.
    tail = '' if line_end == -1 else contents[line_end + 1:]

    return contents[:line_start] + textwrap.indent(STYLE_FALLBACK, indent) + tail


print("\n=== Fixing matplotlib style in evaluate.py ===")
try:
    eval_file = '/kaggle/input/insider-threat-detection/Insider-threat-detection/models/evaluate.py'
    with open(eval_file, 'r') as f:
        contents = f.read()

    # Patch only when the style name is present and the first 500 characters
    # contain no 'try:' (taken as a sign the file is already guarded).
    if 'seaborn-v0_8-darkgrid' in contents and 'try:' not in contents[:500]:
        patched = patch_matplotlib_style(contents)
        if patched != contents:
            # The input directory is not modified; the patched copy goes to
            # the working directory.
            with open('fixed_evaluate.py', 'w') as f:
                f.write(patched)
            print("Created fixed_evaluate.py")
        else:
            print("No standalone plt.style.use line found; fixed_evaluate.py not written")
except Exception as e:
    # Best effort: a missing or unreadable evaluate.py must not stop the notebook.
    print(f"Error fixing matplotlib style: {str(e)}")

print("\n=== Setting up Kaggle scripts ===")
try:
    import shutil

    # Both helper scripts sit in the dataset's kaggle/ folder and are copied
    # into the working directory under their own names. A failure on the first
    # copy skips the second, since both share this try block.
    kaggle_dir = "/kaggle/input/insider-threat-detection/Insider-threat-detection/kaggle"
    for script_name in ("kaggle_run.py", "kaggle_deploy.py"):
        shutil.copy(f"{kaggle_dir}/{script_name}", script_name)
        print(f"Copied {script_name} to working directory")
except Exception as e:
    print(f"Error copying Kaggle scripts: {str(e)}")

# Announce the training phase. The trailing empty entry yields the same blank
# line that a separate print("") would.
print("\n".join((
    "\n=== Starting Insider Threat Detection ML Lifecycle ===",
    "=== Using optimized hyperparameters with improved error handling ===",
    "",
)))

# Wall-clock reference point for the run-time report printed after training.
start_time = time.time()

!python /kaggle/input/insider-threat-detection/Insider-threat-detection/kaggle/kaggle_run.py --debug --sample 0.1

def _format_duration(total_seconds):
    """Render a duration in seconds as '<H>h <M>m <S>s', each part truncated to int."""
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours)}h {int(minutes)}m {int(seconds)}s"


# Elapsed wall-clock time since start_time was recorded.
runtime = time.time() - start_time
print(f"\n=== Training completed in {_format_duration(runtime)} ===")

print("\n=== Final Memory Usage ===")
print_memory_usage()

print("\n=== Running visualization script ===")
try:
    # Importing inside the try means a missing visualize_results module is
    # handled the same way as a failure inside its main().
    from visualize_results import main as visualize_main
    visualize_main()
except Exception as e:
    print(f"Error running visualization script: {e}")

    # Fallback: show the raw metrics table if the report file exists.
    metrics_path = "reports/model_metrics.csv"
    metrics_report_present = os.path.exists(metrics_path)
    if metrics_report_present:
        try:
            metrics_df = pd.read_csv(metrics_path)
            print("\nModel Performance Metrics:")
            # display() is not defined or imported in this file; presumably
            # the notebook environment provides it. A failure is caught below.
            display(metrics_df)
        except Exception as viz_error:
            print(f"Error displaying metrics: {viz_error}")

print("\n=== Displaying Results ===")
try:
    if os.path.exists("visualizations"):
        viz_files = list(Path("visualizations").glob("*.png"))
        if not viz_files:
            print("No visualization files found")
        else:
            print(f"Found {len(viz_files)} visualization files")

            combined_metrics = "visualizations/combined_metrics.png"
            conf_matrices = "visualizations/all_confusion_matrices.png"
            rf_importance = "visualizations/random_forest_feature_importance.png"

            # (image path, figure size, title), shown in this order.
            # Images that are missing on disk are skipped.
            gallery = (
                (combined_metrics, (10, 6), "Model Performance Comparison"),
                (conf_matrices, (12, 8), "Confusion Matrices"),
                (rf_importance, (10, 8), "Random Forest Feature Importance"),
            )
            for image_path, figure_size, title in gallery:
                if not os.path.exists(image_path):
                    continue
                plt.figure(figsize=figure_size)
                img = plt.imread(image_path)
                plt.imshow(img)
                plt.axis("off")  # Hide the axes around the saved image
                plt.title(title)
                plt.show()
except Exception as e:
    print(f"Error displaying visualizations: {str(e)}")

# Closing banner telling the reader where the outputs are.
for closing_line in (
    "\n=== Training and Evaluation Complete ===",
    "Check reports/ directory for detailed evaluation results",
    "Check visualizations/ directory for performance visualizations",
):
    print(closing_line)


=== Initial System Information ===
Memory usage: 3.8%
Available: 30872.0 MB / 32102.9 MB

=== Setting up directories ===
Directory data/raw ready
Directory data/interim ready
Directory data/processed ready
Directory models/trained_models ready
Directory reports ready
Directory predictions ready
Directory deployment ready
Directory visualizations ready

=== Checking for example data ===
Example data not found. Using synthetic data.
Generating synthetic data...
Created synthetic log data: 3000 records
Created synthetic email data: 1500 records
Created synthetic file access data: 1000 records

=== Fixing matplotlib style in evaluate.py ===
Created fixed_evaluate.py

=== Setting up Kaggle scripts ===
Copied kaggle_run.py to working directory
Copied kaggle_deploy.py to working directory

=== Starting Insider Threat Detection ML Lifecycle ===
=== Using optimized hyperparameters with improved error handling ===


🔄 Starting Insider Threat Detection ML Lifecycle on Kaggle
Working directory: /