In [20]:
import os
os.chdir('/Users/dashtban/Ticker-Test/')

import yaml
# Read configuration from YAML file
with open('config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

from src.utils import *
from src.enrich import *
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from datetime import datetime
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Set low_memory to False
pd.set_option('mode.use_inf_as_na', True)


In [21]:
# Load proccesed files from R-studio 
df = pd.read_csv(config['data']['rdata_complete'])
# Remove the  prefix
df.columns = [col.replace('positions.', '') for col in df.columns]

In [None]:
# Stage 1 : Data Investigation

# Column Characteristics before removing and cleansing
column_statistics(df,config['path']['report']+'column_statistics_original_data.html')

# Summary stats
summary_html(df,config['path']['report']+'summary_statistics_numerical_values.html')

# Generate maps base on geospatial data
generate_maps(df,config['reports']['maps'])

# Select randomly some rows
select_random_rows_without_missing_values(df).to_csv(config['path']['report']+'temp.csv', index=False) 
# See some random rows with missing values 
select_rows_with_missing_values(df,config['path']['report']+'rows_with_missing_values.html')

# generate_correlation_table
generate_correlation_table(df,config['path']['report']+'summary_numerical_original_cohort.html')
generate_correlation_heatmap(df,config['path']['report']+'df_original_cohort_corr.jpg')


# Create missing indicator
df = add_missing_indicator_column(df,threshold=.01,colname = 'missing')

# Save the refined data 
df.to_pickle(config['path']['data']+'df_refined.pkl')
df = pd.read_pickle(config['path']['data']+'df_refined.pkl')


In [41]:
# Load Scanned files from R-studio 
df = pd.read_csv(config['data']['rdata_scanned'])
# Remove the  prefix
df.columns = [col.replace('positions.', '') for col in df.columns]

  df = pd.read_csv(config['data']['rdata_scanned'])


In [None]:
### Satge 2: Enrichment
df = enrichment_pipeline(df)
df.to_pickle(config['data']['refined_enriched'])


In [None]:
# generate_correlation_table
df = pd.read_pickle(config['data']['refined_enriched'])

# Generrate the corr plots
generate_correlation_heatmap(df,config['path']['report']+'df_enriched_cohort_corr.jpg')
sel_cols = ['duration','total_distance','average_speed', 'duration_minutes',
            'direct_distance', 'gforce','Signal_Strength', 'Wavelet_XYZ_1', 
            'Wavelet_XYZ_2', 'Wavelet_XYZ_3','Wavelet_LonLat_1', 'Wavelet_LonLat_2',
            'Wavelet_LonLat_3','PCA_gforce_1',"PCA_speedMph_1", 'Distinct_formOfWay_Count','incident']
generate_correlation_heatmap(df[sel_cols],config['path']['report']+'df_enriched_cohort_corr.jpg')


In [27]:
### Satge 3. Modeling

# Algorithm 1.
# 1- Find the distance between all data points to the observed incidents
# 2- Find the farthest one and select as much as the observed one to create a fully balanced one
# 3- Train a model to distiguish between incidents and non-incidents
# 4- Update the index pool with probility produced by the Trainer, (assing as incident if the prob>70%)
# 5- with the updated lables, go to step 2
# 6- continue until there is no data point remained
# 7- Identify the highest risk journeys by evaluating the number of inciednts predicted per journey and assing severity index for each journey
# 8- train another clasifer and see how well the true incidents could be detected,
# 9- refine the procedure based on finding in step 8

{'rdata_complete': './Data/Rdata/processed_data_complete.csv',
 'rdata_events': './Data/Rdata/processed_data_with_events.csv',
 'rdata_noevent': './Data/Rdata/processed_data_no_events.csv',
 'rdata_scanned': '/Users/dashtban/Ticker-Test/Data/Rdata/processed_data_scanned.csv',
 'refined': './Data/df_refined.pkl',
 'refined_enriched': './Data/df_refined_enriched.pkl'}

In [None]:
def find_distant_points(df, label_column='incident', num_points=12000):
    """
    Finds the data points with the highest distance between instances with label 0 and label 1.
    
    Parameters:
        df (DataFrame): The input DataFrame.
        label_column (str): The column name representing the label.
        num_points (int): The number of data points to select.
        
    Returns:
        high_distance_points_df (DataFrame): DataFrame with all columns and an additional column indicating highest distance points.
    """
    try:
        # Convert columns to numeric, converting non-numeric values to NaN
        df = df.apply(pd.to_numeric, errors='coerce')
        
        # Drop rows with missing values
        df.dropna(inplace=True)
        
        # Select data points for label 0
        label_0 = df[df[label_column] == 0].values
        
        # Select data points for label 1
        label_1 = df[df[label_column] == 1].values
        
        # Compute distances between instances with label 0 and label 1
        if len(label_0) == 0 or len(label_1) == 0:
            raise ValueError("No data points with label 0 or label 1 found.")
        
        distances = np.sqrt(((label_0[:, np.newaxis] - label_1) ** 2).sum(axis=2))
        
        # Get indices of top num_points data points with highest distances
        top_indices = np.unravel_index(np.argsort(distances.ravel())[-num_points:], distances.shape)
        
        # Create DataFrame with all columns
        high_distance_points_df = df.copy()
        
        # Add additional column indicating highest distance points
        high_distance_points_df['Is_Highest_Distance'] = 0
        
        # Set value 1 for highest distance points
        high_distance_points_df.iloc[top_indices[0], high_distance_points_df.columns.get_loc(label_column)] = 1
        
        return high_distance_points_df
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None


# Select as much as the observed samples
num_samples = df[df['incident'] == 1].shape[0]

# Select features for computing similarity
sel_cols = ['duration','total_distance','average_speed', 'duration_minutes',
            'direct_distance', 'gforce','Signal_Strength', 'Wavelet_XYZ_1', 
            'Wavelet_XYZ_2', 'Wavelet_XYZ_3','Wavelet_LonLat_1', 'Wavelet_LonLat_2',
            'Wavelet_LonLat_3','PCA_gforce_1',"PCA_speedMph_1", 'Distinct_formOfWay_Count','incident']

# Select top 1 row per UID
df_1 = df.groupby('UID').head(1)
df2 = df_1.sample(n=1000)

# Call the function
d_df = find_distant_points(df2, num_samples)

In [None]:
### Train a LihgtGBM

import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def train_lightgbm_model(df, target_column='incident', test_size=0.2, random_state=42, params=None, num_round=100, plot_file='feature_importance.png'):
    try:
        # Filter DataFrame based on the target column
        filtered_df = df[df[target_column] == 1]

        # Assuming you have features and target
        X = filtered_df.drop(target_column, axis=1)  # Features
        y = filtered_df[target_column]  # Target

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        # Set default parameters if not provided
        if params is None:
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'binary_logloss',
                'num_leaves': 31,
                'learning_rate': 0.05,
                'feature_fraction': 0.9,
                'bagging_fraction': 0.8,
                'bagging_freq': 5,
                'verbose': 0
            }

        # Create dataset for LightGBM
        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test)

        # Train LightGBM model
        bst = lgb.train(params, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=10)

        # Predict probabilities for the target class
        y_pred_proba = bst.predict(X_test)

        # Add predicted probabilities as a new feature in the DataFrame
        filtered_df['predicted_proba'] = y_pred_proba

        # Plot feature importance
        lgb.plot_importance(bst, figsize=(10, 8))
        plt.tight_layout()
        plt.savefig(plot_file)
        plt.close()

        return bst, filtered_df

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None
