### Section 1. Loading Essential Libraries & Initialise Project

In [None]:
# Import necessary libraries
import os
import yaml
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from datetime import datetime
import lightgbm as lgb
from sklearn.model_selection import train_test_split


# Custom module imports
from src.eda import EDA  # Load EDA Class
from src.enrich import *  # Import enrichment functions


def initialise_project(project_dir='./dashtban/TeleIoT-Fraud/', config_path='config.yml'):
    """
    Initialise the project: set the working directory, load the YAML
    configuration, and configure pandas settings.

    The function can be re-run in the same session. Because `project_dir` is
    relative by default, a second call is made from *inside* the project
    directory, where the relative path no longer resolves; in that case the
    directory change is skipped as long as `config_path` is present in the
    current working directory.

    Parameters:
        project_dir (str): Directory to switch into before loading the configuration.
        config_path (str): Path of the YAML configuration file, relative to the project directory.

    Returns:
        dict: Configuration loaded from the YAML file.

    Raises:
        FileNotFoundError: If neither the project directory nor the configuration file can be found.
        yaml.YAMLError: If the configuration file is not valid YAML.
    """
    try:
        # Change to the project directory (skipped when we are already inside it)
        if os.path.isdir(project_dir):
            os.chdir(project_dir)
        elif not os.path.exists(config_path):
            raise FileNotFoundError(
                f"Project directory '{project_dir}' not found and no '{config_path}' "
                f"in the current directory '{os.getcwd()}'."
            )
        print(f"Working directory set to: {os.getcwd()}")

        # Load configuration
        if not os.path.exists(config_path):
            raise FileNotFoundError(f"Configuration file '{config_path}' not found in the project directory.")

        # Explicit encoding so the result does not depend on the platform default
        with open(config_path, 'r', encoding='utf-8') as config_file:
            config = yaml.safe_load(config_file)

        # Configure pandas settings
        # NOTE(review): 'mode.use_inf_as_na' is reported as deprecated in recent pandas
        # releases -- confirm it is still available in the installed version.
        pd.set_option('mode.use_inf_as_na', True)
        print("Pandas configuration set: mode.use_inf_as_na = True")

        print("Project successfully initialized.")
        return config

    except FileNotFoundError as e:
        print(f"Error: {e}")
        raise

    except yaml.YAMLError as e:
        print(f"Error loading configuration file: {e}")
        raise

    except Exception as e:
        print(f"An unexpected error occurred during project initialization: {e}")
        raise

# Initialise the project and load configuration.
# Runs when the cell executes; the returned `config` is the parsed YAML configuration
# that the later cells index (config['data'], config['path'], config['reports']).
config = initialise_project()


### Section 2. Processing Scanned Time-Series Files from R Scripts

In [None]:
# Read the scanned dataset produced by the R scripts; its location is stored
# under the 'rdata_scanned' key of the 'data' section of the config
df = pd.read_csv(config['data']['rdata_scanned'])

# Strip every literal 'positions.' occurrence from the column names so the
# columns are easier to reference in the following steps
df.columns = df.columns.str.replace('positions.', '', regex=False)

# Create the EDA helper around the freshly loaded DataFrame
eda = EDA(df)

  df = pd.read_csv(config['data']['rdata_scanned'])


### Stage 3 : Investigating Data & Generating Reports

In [None]:
# Every report of this stage is written below the same output prefix
report_dir = config['path']['report']

# Per-column characteristics of the data before any cleansing
eda.column_statistics(df, report_dir + 'column_statistics_original_data.html')

# Summary statistics of the numerical values
eda.summary_html(df, report_dir + 'summary_statistics_numerical_values.html')

# Geospatial maps derived from the data
eda.generate_maps(df, config['reports']['maps'])

# Random rows without missing values, exported as CSV for manual inspection
complete_rows = eda.select_random_rows_without_missing_values(df)
complete_rows.to_csv(report_dir + 'temp.csv', index=False)

# Rows with missing values, exported as an HTML page for review
eda.select_rows_with_missing_values(df, report_dir + 'rows_with_missing_values.html')

# Correlation table of the original data
eda.generate_correlation_table(df, report_dir + 'summary_numerical_original_cohort.html')

# Correlation heatmap of the original data
eda.generate_correlation_heatmap(df, report_dir + 'df_original_cohort_corr.jpg')

# Add a 'missing' indicator column (threshold = 1% missing)
df = eda.add_missing_indicator_column(df, threshold=0.01, colname='missing')

# Checkpoint: write the refined DataFrame to disk, then read it straight back
refined_path = config['path']['data'] + 'df_refined.pkl'
df.to_pickle(refined_path)
df = pd.read_pickle(refined_path)


### Stage 4 : Enriching Data

In [None]:
# Apply the Enrichment Pipeline to the Scanned R-DataFrame
# NOTE(review): `enrichment_pipeline` is not defined in this file; presumably it is
# provided by the star import from src.enrich -- confirm.
df = enrichment_pipeline(df)

# Save the Refined and Enriched DataFrame to a Pickle File for Future Use
# (written to the path stored under config['data']['refined_enriched'])
df.to_pickle(config['data']['refined_enriched'])

### Stage 5 : Investigating the Enriched Data

In [None]:
# Reload the enriched dataset from its pickle checkpoint
df = pd.read_pickle(config['data']['refined_enriched'])

# Fresh EDA helper bound to the enriched DataFrame
eda = EDA(df)

# Every report of this stage is written below the same output prefix
report_dir = config['path']['report']

# Correlation heatmap over the whole enriched DataFrame
eda.generate_correlation_heatmap(df, report_dir + 'df_enriched_cohort_corr.jpg')

# Columns of interest for the focused correlation analysis
selected_columns = [
    'duration',
    'total_distance',
    'average_speed',
    'duration_minutes',
    'direct_distance',
    'gforce',
    'Signal_Strength',
    'Wavelet_XYZ_1',
    'Wavelet_XYZ_2',
    'Wavelet_XYZ_3',
    'Wavelet_LonLat_1',
    'Wavelet_LonLat_2',
    'Wavelet_LonLat_3',
    'PCA_gforce_1',
    'PCA_speedMph_1',
    'Distinct_formOfWay_Count',
    'incident',
]

# Correlation heatmap restricted to the columns of interest
focused_view = df[selected_columns]
eda.generate_correlation_heatmap(focused_view, report_dir + 'df_enriched_cohort_corr_selected.jpg')

# Per-column characteristics of the enriched data
eda.column_statistics(df, report_dir + 'column_statistics_enriched_data.html')

# Summary statistics of the numerical values in the enriched data
eda.summary_html(df, report_dir + 'summary_statistics_numerical_values_enriched.html')

# Geospatial maps derived from the enriched data
eda.generate_maps(df, config['reports']['maps'])

# Random rows without missing values, exported as CSV
complete_rows = eda.select_random_rows_without_missing_values(df)
complete_rows.to_csv(report_dir + 'temp_enriched.csv', index=False)

# Rows with missing values, exported as an HTML report
eda.select_rows_with_missing_values(df, report_dir + 'rows_with_missing_values_enriched.html')

# Correlation table of the numerical features in the enriched data
eda.generate_correlation_table(df, report_dir + 'summary_numerical_enriched_cohort.html')

# Second whole-cohort correlation heatmap, saved under the '_final' name
eda.generate_correlation_heatmap(df, report_dir + 'df_enriched_cohort_corr_final.jpg')


### Stage 6 : Algorithm Design, Modeling, & Pre-Evaluation

<div>

#### Algorithm 1: Incident Prediction and Risk Assessment

1. **Distance Calculation**:  
   - Compute the distance between all data points and the observed incidents.

2. **Selection of Farthest Points**:  
   - Identify the data points farthest from the incidents and select as many of them as there are observed incidents, so that the resulting dataset is balanced.

3. **Model Training**:  
   - Train a machine learning model to distinguish between incidents and non-incidents.

4. **Probability Update**:  
   - Use the trained model to predict probabilities. Update the index pool by assigning a label of "incident" if the probability is greater than 70%.

5. **Iteration**:  
   - With the updated labels, return to step 2. Continue the process until there are no more data points left to label.

6. **Risk Assessment**:  
   - Evaluate the highest-risk journeys by counting the number of predicted incidents per journey. Assign a severity index for each journey based on the number of predicted incidents.

7. **Classifier Refinement**:  
   - Train a second classifier using the updated labels and evaluate how well the true incidents can be detected.

8. **Refinement of the Process**:  
   - Based on the findings from step 7, refine the procedure and the model for improved incident detection and risk assessment.

</div>

In [None]:
"""
#### Steps 1 and 2 of Algorithm 1
1. **Distance Calculation**:  
   - Compute the distance between all data points and the observed incidents.
2. **Selection of Farthest Points**:  
   - Identify the farthest data point from the incidents and select as many points as there are observed incidents to create a balanced dataset.
"""

def find_distant_points(df, label_column='incident', num_points=12000, sel_cols=None):
    """
    Flags the non-incident rows (label 0) that lie farthest from the observed incidents (label 1).

    Every non-incident row is scored by its largest Euclidean distance to any
    incident row, computed on the feature columns only (the label column is
    excluded from the distance). The `num_points` rows with the highest score
    are flagged in a new 'Is_Highest_Distance' column; the label column itself
    is left untouched.

    Parameters:
        df (DataFrame): The input DataFrame. It is not modified.
        label_column (str): The column name representing the label ('incident').
        num_points (int): The number of distinct non-incident rows to flag. Values larger than
            the number of available non-incident rows are capped; 0 or less flags nothing.
        sel_cols (list, optional): List of columns to consider for distance computation. Default is
            None, which uses all columns. The label column is added automatically if it is missing.

    Returns:
        high_distance_points_df (DataFrame): Numeric copy of the selected columns, without rows that
            contain missing or non-numeric values, plus an 'Is_Highest_Distance' column (1 = flagged).
            Returns None if an error occurs (the error is printed).
    """
    # Local import: numpy is not imported explicitly at the top of this file
    import numpy as np

    try:
        # Restrict to the requested columns, always keeping the label column
        if sel_cols:
            columns = list(sel_cols)
            if label_column not in columns:
                columns.append(label_column)
            df = df[columns]

        # Ensure only numeric values are used; non-numeric entries become NaN and
        # the affected rows are dropped (on a new object, so the caller's frame is untouched)
        numeric_df = df.apply(pd.to_numeric, errors='coerce').dropna()

        # Positions (within numeric_df) of non-incident and incident rows
        non_incident_pos = np.flatnonzero((numeric_df[label_column] == 0).to_numpy())
        incident_pos = np.flatnonzero((numeric_df[label_column] == 1).to_numpy())

        # Check for the presence of both label 0 and label 1 data points
        if len(non_incident_pos) == 0 or len(incident_pos) == 0:
            raise ValueError("No data points with label 0 or label 1 found.")

        # Feature matrix without the label, so the label does not contribute to the distance
        features = numeric_df.drop(columns=[label_column]).to_numpy(dtype=float)
        non_incidents = features[non_incident_pos]
        incidents = features[incident_pos]

        # Pairwise Euclidean distances, shape (n_non_incidents, n_incidents)
        differences = non_incidents[:, np.newaxis, :] - incidents[np.newaxis, :, :]
        distances = np.sqrt((differences ** 2).sum(axis=2))

        # One score per non-incident row: distance to its farthest incident.
        # Ranking rows (not pairs) guarantees `num_points` distinct rows are selected.
        farthest_distance = distances.max(axis=1)

        # Cap the request to what is available; avoids the `[-0:]` "select everything" pitfall
        n_selected = max(0, min(int(num_points), len(non_incident_pos)))
        ranking = np.argsort(farthest_distance, kind='stable')
        top_rows = non_incident_pos[ranking[len(ranking) - n_selected:]]

        # Create the result with all retained columns plus the indicator column
        high_distance_points_df = numeric_df.copy()
        high_distance_points_df['Is_Highest_Distance'] = 0
        if n_selected:
            flag_col = high_distance_points_df.columns.get_loc('Is_Highest_Distance')
            high_distance_points_df.iloc[top_rows, flag_col] = 1

        return high_distance_points_df

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Select the same number of samples as observed incidents
num_samples = df[df['incident'] == 1].shape[0]

# Select features for computing similarity (use predefined columns)
sel_cols = ['duration', 'total_distance', 'average_speed', 'duration_minutes', 'direct_distance', 
            'gforce', 'Signal_Strength', 'Wavelet_XYZ_1', 'Wavelet_XYZ_2', 'Wavelet_XYZ_3',
            'Wavelet_LonLat_1', 'Wavelet_LonLat_2', 'Wavelet_LonLat_3', 'PCA_gforce_1', 
            'PCA_speedMph_1', 'Distinct_formOfWay_Count', 'incident']

# Keep only the first row of each UID so that every UID contributes a single data point
df_1 = df.groupby('UID').head(1)

# Sample a subset of rows to improve computational efficiency.
# The sample size is capped at the number of available rows: asking for more rows
# than exist (without replacement) makes DataFrame.sample raise a ValueError.
df_sample = df_1.sample(n=min(1000, len(df_1)), random_state=42)

# Call the function to get the distant points
distant_df = find_distant_points(df_sample, label_column='incident', num_points=num_samples, sel_cols=sel_cols)

# `distant_df` can now be used for further analysis or modeling.
# Note that find_distant_points returns None when it fails (the error is printed).


In [None]:
#### Step 3 of Algorithm 1: Model Training (LightGBM)
#
# Train a machine learning model to distinguish between incidents and non-incidents.
# In this case, we will train a LightGBM model.
#
#

def train_lightgbm_model(df, target_column='incident', test_size=0.2, random_state=42, params=None, num_round=100, plot_file='feature_importance.png'):
    """
    Trains a LightGBM model to classify incidents (target_column = 1) vs non-incidents (target_column = 0).

    Only rows whose target is 0 or 1 are used. The model is fitted on a train split,
    early-stopped on the held-out split, and then scores every labelled row.

    Parameters:
        df (DataFrame): The input DataFrame containing features and the target column.
        target_column (str): The column name representing the target variable (default is 'incident').
        test_size (float): The fraction of the dataset to be used for testing (default is 0.2).
        random_state (int): The seed for random number generation (default is 42).
        params (dict, optional): Hyperparameters for the LightGBM model. If None, default parameters are used.
        num_round (int): The number of boosting iterations (default is 100).
        plot_file (str): The file name to save the feature importance plot (default is 'feature_importance.png').

    Returns:
        bst (Booster): The trained LightGBM model.
        filtered_df (DataFrame): Copy of the labelled rows with a 'predicted_proba' column added.
            Probabilities of rows that were part of the training split are in-sample estimates.
        On any error the message is printed and (None, None) is returned.
    """
    try:
        # Keep every labelled row: a binary classifier needs both classes, so the
        # frame must not be reduced to incidents only
        filtered_df = df[df[target_column].isin([0, 1])].copy()

        # Separate features (X) and target (y). A 'predicted_proba' column left over
        # from a previous call is dropped so it cannot leak into the features.
        X = filtered_df.drop(columns=[target_column, 'predicted_proba'], errors='ignore')
        y = filtered_df[target_column]

        if y.nunique() < 2:
            raise ValueError(
                f"Column '{target_column}' must contain both classes (0 and 1) to train a binary classifier."
            )

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        # Set default parameters if none are provided; copy caller-supplied
        # parameters so the caller's dict is never modified
        if params is None:
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'binary_logloss',
                'num_leaves': 31,
                'learning_rate': 0.05,
                'feature_fraction': 0.9,
                'bagging_fraction': 0.8,
                'bagging_freq': 5,
                'verbose': 0
            }
        else:
            params = dict(params)

        # Create LightGBM dataset objects for training and testing
        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

        # Train the LightGBM model. Early stopping is passed as a callback: the
        # `early_stopping_rounds` keyword of lgb.train is not accepted by LightGBM >= 4.
        bst = lgb.train(
            params,
            train_data,
            num_boost_round=num_round,
            valid_sets=[test_data],
            callbacks=[lgb.early_stopping(stopping_rounds=10)],
        )

        # Predict the probability of the positive class (incident = 1) for every
        # labelled row, so the new column has exactly one value per row of filtered_df
        filtered_df['predicted_proba'] = bst.predict(X, num_iteration=bst.best_iteration)

        # Plot feature importance
        lgb.plot_importance(bst, figsize=(10, 8), importance_type='split')
        plt.tight_layout()
        plt.savefig(plot_file)
        plt.close()

        return bst, filtered_df

    except Exception as e:
        print(f"An error occurred during model training: {str(e)}")
        return None, None




### Stage 7 : Evaluation & Model Selection 