In [2]:
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown for
# the rest of the kernel session.
# NOTE(review): this also hides pandas/sklearn deprecation and
# chained-assignment warnings raised by later cells — confirm that is intended.
warnings.filterwarnings("ignore")

## Data Set Up

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

def dataSetUp():
    """Assemble the training table used by every experiment.

    Reads ``./data/train_data.csv`` together with the per-neuron tables
    ``./data/feature_weights.csv`` and ``./data/morph_embeddings.csv``,
    attaches the neuron-level vectors to both the ``pre_`` and ``post_`` side
    of each row, derives two extra columns and integer-encodes the
    categorical columns.

    Returns
    -------
    pandas.DataFrame
        The merged frame with the added ``fw_similarity`` and
        ``projection_group`` columns; ``compartment``, ``pre_brain_area``,
        ``post_brain_area`` and ``projection_group`` hold ``LabelEncoder``
        integer codes.
    """

    def pack_columns(table, pattern, target):
        # Fold every column whose name matches `pattern` (taken in sorted name
        # order) into one np.array per row stored under `target`, then remove
        # the individual columns. `table` is modified in place.
        wide = table.filter(regex=pattern)
        table[target] = wide.sort_index(axis=1).apply(
            lambda row: np.array(row), axis=1
        )
        table.drop(wide.columns, axis=1, inplace=True)

    def cosine(row):
        # Cosine similarity between the pre- and post- feature-weight vectors.
        a = row["pre_feature_weights"]
        b = row["post_feature_weights"]
        return (a * b).sum() / (np.linalg.norm(a) * np.linalg.norm(b))

    # Raw inputs: the pair-level rows and the two per-neuron feature tables.
    data = pd.read_csv("./data/train_data.csv")
    feature_weights = pd.read_csv("./data/feature_weights.csv")
    morph_embeddings = pd.read_csv("./data/morph_embeddings.csv")

    pack_columns(feature_weights, "feature_weight_", "feature_weights")
    pack_columns(morph_embeddings, "morph_emb_", "morph_embeddings")

    # Attach each per-neuron table twice, once per side of the pair. No `on=`
    # is given, so pandas joins on whichever column names the two frames share
    # after the prefix has been applied.
    attachments = (
        (feature_weights, "pre_"),
        (feature_weights, "post_"),
        (morph_embeddings, "pre_"),
        (morph_embeddings, "post_"),
    )
    for table, prefix in attachments:
        prefixed = table.rename(columns=lambda name: prefix + name)
        data = data.merge(prefixed, how="left", validate="m:1", copy=False)

    # Derived features.
    data["fw_similarity"] = data.apply(cosine, axis=1)
    data["projection_group"] = (
        data["pre_brain_area"].astype(str)
        + "->"
        + data["post_brain_area"].astype(str)
    )

    # Replace the non-numerical columns with integer codes; a separate encoder
    # is fitted per column.
    for column in ("compartment", "pre_brain_area", "post_brain_area", "projection_group"):
        data[column] = LabelEncoder().fit_transform(data[column])

    return data

## Feature Engineering

In [4]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Positional column indices, in the same form that select_features() below
# expects for its `excluded_feature_indices` argument.
# NOTE(review): no call visible in this notebook passes this list anywhere —
# submit() reads `excluded_feature_indices` from the model class instead —
# confirm this module-level copy is still needed.
excluded_feature_indices = [0, 30, 31, 32, 33]

def select_features(data, excluded_feature_indices):
  """Return a copy of `data` without the columns at the given positions.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to trim; it is not modified.
  excluded_feature_indices : list of int
      Positional indices into ``data.columns`` of the columns to remove.
  """
  unwanted = data.columns[excluded_feature_indices]
  return data.drop(columns=unwanted)

def train_test_data_set_up(data):
  """Split `data` 80/20 and separate the ``connected`` label from the features.

  The split uses ``random_state=1`` so it is repeatable.

  Returns
  -------
  tuple
      ``(train_x, train_y, test_x, test_y, train_frame, test_frame)`` where the
      last two are the unsplit partitions that still contain the label column.
  """
  label_column = 'connected'
  fit_part, eval_part = train_test_split(data, test_size=0.2, random_state=1)

  def split_xy(frame):
    # Features are every column except the label; the label is returned as-is.
    return frame.drop(columns=label_column), frame[label_column]

  fit_x, fit_y = split_xy(fit_part)
  eval_x, eval_y = split_xy(eval_part)

  return fit_x, fit_y, eval_x, eval_y, fit_part, eval_part

def overSampling(data_x, data_y):
  """Balance the classes by randomly duplicating minority-class rows.

  Uses ``RandomOverSampler(random_state=0)``, so repeated calls on the same
  input give the same resampled output.

  Returns
  -------
  tuple
      ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.
  """
  sampler = RandomOverSampler(random_state=0)
  balanced_x, balanced_y = sampler.fit_resample(data_x, data_y)
  return balanced_x, balanced_y


## Logging

In [5]:
import logging
import datetime
import os

def setup_logging(experiment_name):
    """Return the shared ``'test'`` logger, writing to a fresh per-run file.

    A new log file named
    ``./logging/<experiment_name>_<YYYYmmdd_HHMMSS>_metrics_log.log`` is
    created (the directory is created when missing) and attached to the
    logger at INFO level. No console handler is attached.

    ``logging.getLogger('test')`` returns the same logger object on every
    call, so file handlers attached by earlier calls are detached and closed
    first. Without that, each experiment's metrics would also be appended to
    the log files of every experiment run before it in the same kernel, and
    the old file handles would stay open.

    Parameters
    ----------
    experiment_name : str
        Prefix used in the log file name.

    Returns
    -------
    logging.Logger
        The configured logger.
    """
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_directory = "./logging"  # Replace with your desired path
    os.makedirs(log_directory, exist_ok=True)

    log_filename = f'{log_directory}/{experiment_name}_{current_time}_metrics_log.log'

    logger = logging.getLogger('test')
    logger.setLevel(level=logging.INFO)

    # Detach file handlers left over from previous calls; only FileHandler
    # instances are touched so that any other handler stays in place.
    for stale in list(logger.handlers):
        if isinstance(stale, logging.FileHandler):
            logger.removeHandler(stale)
            stale.close()

    formatter = logging.Formatter('%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(level=logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger

## Metric

In [6]:

from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix

def metric(test_data, logger):
    """Report classification metrics for scored rows at a 0.5 threshold.

    Each metric is written to `logger` at INFO level and also printed.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain the true label in ``connected`` and the predicted
        probability in ``pred``.
    logger : logging.Logger
        Destination for the INFO records.

    Returns
    -------
    None
    """
    actual = test_data['connected']
    # A row counts as a positive prediction when its probability exceeds 0.5.
    predicted = test_data['pred'] > .5

    # Plain accuracy.
    accuracy = accuracy_score(actual, predicted)
    accuracy_line = f'Accuracy: {accuracy}'
    logger.info(accuracy_line)
    print(accuracy_line)

    # ravel() flattens the 2x2 confusion matrix in the order TN, FP, FN, TP.
    tn, fp, fn, tp = confusion_matrix(actual, predicted).ravel()
    matrix_line = f'Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}'
    logger.info(matrix_line)
    print(matrix_line)

    # Sensitivity = true positive rate.
    sensitivity = tp / (tp + fn)
    sensitivity_line = f'Sensitivity: {sensitivity}'
    logger.info(sensitivity_line)
    print(sensitivity_line)

    # Specificity = true negative rate.
    specificity = tn / (tn + fp)
    specificity_line = f'Specificity: {specificity}'
    logger.info(specificity_line)
    print(specificity_line)

    # Balanced accuracy; the logged and printed labels differ on purpose to
    # reproduce the existing output exactly.
    balanced_accuracy = balanced_accuracy_score(actual, predicted)
    logger.info(f'Balanced Accuracy: {balanced_accuracy}')
    print(f'balanced_accuracy: {balanced_accuracy}')


## Pipeline

In [7]:

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# create pipeline
def create_pipe(model, train_data_x, train_data_y, test_data_x, test_data_y, test_data):
  """Fit a scaler + model pipeline and score the held-out rows.

  The pipeline standardises the features with ``StandardScaler`` and then
  applies `model`. After fitting on the training split, the probability of
  the second class (column 1 of ``predict_proba``) is written into
  ``test_data["pred"]``; `test_data` is modified in place. `test_data_y` is
  accepted but not used.

  Returns
  -------
  tuple
      ``(pipe, test_data)``: the fitted pipeline and the frame with ``pred``.
  """
  steps = [("scaler", StandardScaler()), ("model", model)]
  pipe = Pipeline(steps)
  pipe.fit(train_data_x, train_data_y)

  class_probabilities = pipe.predict_proba(test_data_x)
  test_data["pred"] = class_probabilities[:, 1]

  return pipe, test_data


## Experiment

In [8]:
import importlib

In [9]:
def experiment(experiment_name, method):
    """Run one load / fit / evaluate cycle and return the fitted pipeline.

    Parameters
    ----------
    experiment_name : str
        Used to name the metrics log file.
    method : callable
        Called with the prepared frame; the returned object must provide
        ``data_processing()`` (yielding the six split pieces) and a ``model``
        attribute holding the estimator.

    Returns
    -------
    The fitted pipeline returned by ``create_pipe``.
    """
    learner = method(dataSetUp())

    # The fifth element (the full training partition) is not needed here.
    x_train, y_train, x_test, y_test, _train_frame, scored = learner.data_processing()
    pipe, scored = create_pipe(learner.model, x_train, y_train, x_test, y_test, scored)

    # The log file is opened only after fitting has finished.
    run_logger = setup_logging(experiment_name)
    metric(scored, logger=run_logger)

    return pipe

### random forest

In [9]:
import packages.random_forest as random_forest

In [35]:
# Reload the module first so that edits to packages.random_forest are picked
# up without restarting the kernel, then re-import the class from it.
importlib.reload(random_forest)
from packages.random_forest import RandomForest
experiment_name = "random_forests"
# The fitted pipeline is kept because the submission cell further down uses it.
rf_pipe = experiment(experiment_name, RandomForest)

Accuracy: 0.719751392364194
Confusion Matrix: TN=26518, FP=10378, FN=38, TP=233
Sensitivity: 0.8597785977859779
Specificity: 0.71872289679098
0.7892507472884789


### logistic regression

In [10]:
import packages.logistic_regression as logistic_regression

In [137]:
# Reload the module first so that edits to packages.logistic_regression are
# picked up without restarting the kernel, then re-import the class from it.
importlib.reload(logistic_regression)
from packages.logistic_regression import Logistic
experiment_name = "logistic_regression"
# Runs the full load / fit / evaluate cycle and keeps the fitted pipeline.
logistic_pipe = experiment(experiment_name, Logistic)

Accuracy: 0.6925767481905992
Confusion Matrix: TN=25515, FP=11381, FN=45, TP=226
Sensitivity: 0.8339483394833949
Specificity: 0.6915383781439722
0.7627433588136836


#### Naive Bayes

In [13]:
import packages.naive_bayes as naive_bayes

In [16]:
# Reload the module first so that edits to packages.naive_bayes are picked up
# without restarting the kernel, then re-import the class from it.
importlib.reload(naive_bayes)
from packages.naive_bayes import NaiveBayes
experiment_name = "naive_bayes"
# Runs the full load / fit / evaluate cycle and keeps the fitted pipeline.
nb_pipe = experiment(experiment_name, NaiveBayes)

Accuracy: 0.6716980116770254
Confusion Matrix: TN=24762, FP=12134, FN=68, TP=203
Sensitivity: 0.7490774907749077
Specificity: 0.6711296617519514
0.7101035762634296


#### Ensemble

In [21]:
import packages.ensemble as ensemble

In [22]:
# Reload the module first so that edits to packages.ensemble are picked up
# without restarting the kernel, then re-import the class from it.
importlib.reload(ensemble)
from packages.ensemble import Ensemble
experiment_name = "ensemble"
# Runs the full load / fit / evaluate cycle and keeps the fitted pipeline.
ensemble_pipe = experiment(experiment_name, Ensemble)

#### Bagging

In [10]:
import packages.bagging as bagging

In [11]:
# Reload the module first so that edits to packages.bagging are picked up
# without restarting the kernel, then re-import the class from it.
importlib.reload(bagging)
from packages.bagging import Bagging
experiment_name = "bagging"
# Runs the full load / fit / evaluate cycle and keeps the fitted pipeline.
bagging_pipe = experiment(experiment_name, Bagging)

## Submission

In [23]:
def submitData():
    """Assemble the leaderboard table with the same steps as the training table.

    Reads ``data/leaderboard_data.csv`` together with
    ``./data/feature_weights.csv`` and ``./data/morph_embeddings.csv``,
    attaches the per-neuron vectors to the ``pre_`` and ``post_`` side of each
    row, adds ``fw_similarity`` and ``projection_group`` and integer-encodes
    the categorical columns.

    NOTE(review): the ``LabelEncoder`` objects are fitted on the leaderboard
    rows only. An integer code produced here is guaranteed to denote the same
    category as in the training table only if both files contain identical
    sets of values in each encoded column — confirm, or reuse the encoders
    fitted on the training data.

    Returns
    -------
    pandas.DataFrame
        The merged and encoded leaderboard frame.
    """

    def bundle(table, pattern, target):
        # Collapse the columns matching `pattern` (sorted by name) into one
        # np.array per row under `target` and drop the individual columns.
        # `table` is modified in place.
        matched = table.filter(regex=pattern)
        table[target] = matched.sort_index(axis=1).apply(
            lambda row: np.array(row), axis=1
        )
        table.drop(matched.columns, axis=1, inplace=True)

    def weight_cosine(row):
        # Cosine similarity of the pre- and post- feature-weight vectors.
        u = row["pre_feature_weights"]
        v = row["post_feature_weights"]
        return (u * v).sum() / (np.linalg.norm(u) * np.linalg.norm(v))

    lb_data = pd.read_csv("data/leaderboard_data.csv")

    # Per-neuron side tables.
    feature_weights = pd.read_csv("./data/feature_weights.csv")
    morph_embeddings = pd.read_csv("./data/morph_embeddings.csv")
    bundle(feature_weights, "feature_weight_", "feature_weights")
    bundle(morph_embeddings, "morph_emb_", "morph_embeddings")

    # Four left joins, in this order; with no `on=` pandas matches on the
    # column names shared by both frames after prefixing.
    for side_table, side in (
        (feature_weights, "pre_"),
        (feature_weights, "post_"),
        (morph_embeddings, "pre_"),
        (morph_embeddings, "post_"),
    ):
        renamed = side_table.rename(columns=lambda name: side + name)
        lb_data = lb_data.merge(renamed, how="left", validate="m:1", copy=False)

    lb_data["fw_similarity"] = lb_data.apply(weight_cosine, axis=1)

    # Projection group is written as "<pre area>-><post area>".
    lb_data["projection_group"] = (
        lb_data["pre_brain_area"].astype(str)
        + "->"
        + lb_data["post_brain_area"].astype(str)
    )

    # Integer-encode the non-numerical columns, one fresh encoder per column.
    for column in ("compartment", "pre_brain_area", "post_brain_area", "projection_group"):
        lb_data[column] = LabelEncoder().fit_transform(lb_data[column])

    return lb_data

In [32]:
def submit(method, pipe, experiment_name):
    """Score the leaderboard data with a fitted pipeline and write a CSV.

    The result is saved as
    ``./submission_files/<experiment_name>_submission_data.csv`` with the
    columns ``ID`` and ``connected`` (True when the predicted probability is
    above 0.5). The output directory is created when it does not exist, so a
    fresh checkout no longer fails at the final write.

    Parameters
    ----------
    method : callable
        Model class; ``method([])`` must expose ``excluded_feature_indices``
        and ``select_features(frame, indices)``.
    pipe : fitted pipeline
        Must provide ``predict_proba``.
    experiment_name : str
        Prefix of the output file name.

    Returns
    -------
    None
    """
    import os  # local import so this cell does not depend on the logging cell

    output_directory = './submission_files'
    os.makedirs(output_directory, exist_ok=True)

    lb_data = submitData()

    # One throw-away instance supplies both the index list and the selector.
    selector = method([])

    # Indices above 28 are shifted down by one.
    # NOTE(review): presumably this compensates for a column (likely the
    # `connected` label) that sits at position 28 in the training frame and is
    # absent from the leaderboard frame — confirm, and note that an excluded
    # index of exactly 28 is left unchanged.
    shifted_indices = [i - 1 if i > 28 else i for i in selector.excluded_feature_indices]

    # predict on leaderboard data
    predict_X = selector.select_features(lb_data, shifted_indices)
    lb_data["pred"] = pipe.predict_proba(predict_X)[:, 1]

    # create a boolean prediction solution
    lb_data["connected"] = lb_data["pred"] > .5

    submission_data = lb_data.filter(['ID', 'connected'])
    # writing csv files
    submission_data.to_csv(f'{output_directory}/{experiment_name}_submission_data.csv', index=False)

In [34]:
# Inspect the column order and dtypes of the leaderboard frame. The frame is
# rebuilt from the CSV files each time this cell runs.
submitData().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42593 entries, 0 to 42592
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              42593 non-null  int64  
 1   axonal_coor_x                   42593 non-null  int64  
 2   axonal_coor_y                   42593 non-null  int64  
 3   axonal_coor_z                   42593 non-null  int64  
 4   dendritic_coor_x                42593 non-null  int64  
 5   dendritic_coor_y                42593 non-null  int64  
 6   dendritic_coor_z                42593 non-null  int64  
 7   adp_dist                        42593 non-null  float64
 8   post_skeletal_distance_to_soma  42593 non-null  float64
 9   pre_skeletal_distance_to_soma   42593 non-null  float64
 10  pre_oracle                      42593 non-null  float64
 11  pre_test_score                  42593 non-null  float64
 12  pre_rf_x                        

### random forest

In [33]:
# Score the leaderboard data with the random-forest pipeline fitted above and
# write the submission CSV under this experiment name.
experiment_name = "random_forests_2"
submit(RandomForest, rf_pipe, experiment_name)

[0, 10, 29, 30, 31, 32]
