In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Training table: one row per potential synapse (candidate neuron pair).
data = pd.read_csv("./train_data.csv")

# Auxiliary per-neuron tables that get merged onto the pairs later on.
feature_weights, morph_embeddings = (
    pd.read_csv("./feature_weights.csv"),
    pd.read_csv("./morph_embeddings.csv"),
)

In [2]:
# Collapse the feature_weight_i columns into a single column holding one np.array
# per row, then drop the individual columns.
# The guard makes this cell safe to re-run: once the source columns are gone the
# filter would select nothing, so the work is skipped when the collapsed column
# already exists. The frame is rebuilt with drop/assign instead of inplace=True.
if "feature_weights" not in feature_weights.columns:
    # sort_index orders the labels as strings (so "..._10" sorts before "..._2");
    # the resulting order is the same for every row.
    fw_block = feature_weights.filter(regex="feature_weight_").sort_index(axis=1)
    feature_weights = feature_weights.drop(columns=fw_block.columns).assign(
        feature_weights=fw_block.apply(lambda x: np.array(x), axis=1)
    )

# Same treatment for the morph_emb_i columns -> single "morph_embeddings" column.
if "morph_embeddings" not in morph_embeddings.columns:
    morph_block = morph_embeddings.filter(regex="morph_emb_").sort_index(axis=1)
    morph_embeddings = morph_embeddings.drop(columns=morph_block.columns).assign(
        morph_embeddings=morph_block.apply(lambda x: np.array(x), axis=1)
    )

In [3]:
# Attach the per-neuron tables to every candidate pair: each table is merged twice,
# once with its columns prefixed "pre_" and once prefixed "post_".
# validate="m:1" makes pandas check that the right-hand table has unique join keys.
# NOTE(review): no `on=` is passed, so each merge joins on whatever column names the
# two frames share after prefixing - presumably the neuron id; confirm against the
# CSV schema.
data = (
    data.merge(
        feature_weights.add_prefix("pre_"),
        how="left",
        validate="m:1",
        copy=False,
    )
    .merge(
        feature_weights.add_prefix("post_"),
        how="left",
        validate="m:1",
        copy=False,
    )
    .merge(
        morph_embeddings.add_prefix("pre_"),
        how="left",
        validate="m:1",
        copy=False,
    )
    .merge(
        morph_embeddings.add_prefix("post_"),
        how="left",
        validate="m:1",
        copy=False,
    )
)

In [4]:
#cosine similarity function
def row_feature_similarity(row):
    """Cosine similarity between a row's pre- and post-synaptic feature weights.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide "pre_feature_weights" and "post_feature_weights", each a
        1-D array-like of equal length.

    Returns
    -------
    float
        dot(pre, post) / (||pre|| * ||post||), or NaN when the similarity is
        undefined: either vector is missing (a scalar NaN/None, as a left merge
        can leave behind) or has zero norm.
    """
    pre = np.asarray(row["pre_feature_weights"], dtype=float)
    post = np.asarray(row["post_feature_weights"], dtype=float)

    # A missing entry arrives as a scalar (NaN/None), i.e. a 0-d array here;
    # there is no vector to compare, so report NaN instead of raising.
    if pre.ndim == 0 or post.ndim == 0:
        return np.nan

    denom = np.linalg.norm(pre) * np.linalg.norm(post)
    # Zero-length vector: cosine is undefined; avoid the 0/0 RuntimeWarning.
    if denom == 0:
        return np.nan

    return (pre * post).sum() / denom

In [5]:
# Score each candidate pair: row-wise cosine similarity of the pre- vs post-synaptic
# feature-weight vectors, stored as "fw_similarity".
data["fw_similarity"] = data.apply(row_feature_similarity, axis="columns")

In [6]:
# Label each pair with its projection group, written "<pre area>-><post area>".
data["projection_group"] = (
    data["pre_brain_area"]
    .astype(str)
    .str.cat(data["post_brain_area"].astype(str), sep="->")
)

In [7]:
# One-hot encode the projection groups and append the indicator columns to the
# training frame; keep the list of indicator column names for the model inputs.
one_hot = pd.get_dummies(data["projection_group"], prefix="proj_grp")
df = pd.concat([data, one_hot], axis=1)
prj_cols = df.filter(like="proj_grp").columns.tolist()
prj_cols

['proj_grp_AL->AL',
 'proj_grp_AL->RL',
 'proj_grp_AL->V1',
 'proj_grp_RL->AL',
 'proj_grp_RL->RL',
 'proj_grp_RL->V1',
 'proj_grp_V1->AL',
 'proj_grp_V1->RL',
 'proj_grp_V1->V1']

In [10]:
# Build the same engineered features for the leaderboard pairs.
lb_data = pd.read_csv("./leaderboard_data.csv")

# Attach the per-neuron tables, once prefixed "pre_" and once prefixed "post_".
# As in the training merge, no `on=` is given: the join uses the shared column names.
lb_data = (
    lb_data.merge(
        feature_weights.rename(columns=lambda x: "pre_" + x), 
        how="left", 
        validate="m:1",
        copy=False,
    )
    .merge(
        feature_weights.rename(columns=lambda x: "post_" + x),
        how="left",
        validate="m:1",
        copy=False,
    )
    .merge(
        morph_embeddings.rename(columns=lambda x: "pre_" + x),
        how="left",
        validate="m:1",
        copy=False,
    )
    .merge(
        morph_embeddings.rename(columns=lambda x: "post_" + x),
        how="left",
        validate="m:1",
        copy=False,
    )
)

# Cosine similarity of pre-/post-synaptic feature weights, per pair.
lb_data["fw_similarity"] = lb_data.apply(row_feature_similarity, axis=1)

# Projection group label, "<pre area>-><post area>".
lb_data["projection_group"] = (
    lb_data["pre_brain_area"].astype(str)
    + "->"
    + lb_data["post_brain_area"].astype(str)
)

# One-hot encode against the TRAINING indicator columns, not the leaderboard's own.
# prj_cols is (re)derived from the training frame `df` so the model's feature set
# never depends on which groups happen to occur in the leaderboard data:
#   - a group seen in training but absent here becomes an all-zero column
#   - a group present only here is dropped (that row is all zeros)
prj_cols = [col for col in df.columns if 'proj_grp' in col]
one_hot = pd.get_dummies(lb_data['projection_group'], prefix='proj_grp').reindex(
    columns=prj_cols, fill_value=0
)
lb_df = pd.concat([lb_data, one_hot], axis=1)
lb_df = lb_df.drop(["projection_group"], axis=1)
lb_df.head()

Unnamed: 0,ID,axonal_coor_x,axonal_coor_y,axonal_coor_z,dendritic_coor_x,dendritic_coor_y,dendritic_coor_z,adp_dist,post_skeletal_distance_to_soma,pre_skeletal_distance_to_soma,...,fw_similarity,proj_grp_AL->AL,proj_grp_AL->RL,proj_grp_AL->V1,proj_grp_RL->AL,proj_grp_RL->RL,proj_grp_RL->V1,proj_grp_V1->AL,proj_grp_V1->RL,proj_grp_V1->V1
0,0,527499,544761,912282,529457,543459,916958,4783.37,239163.0,178032.0,...,-0.022573,0,0,1,0,0,0,0,0,0
1,1,1269890,471870,837816,1271230,469651,841045,4060.72,294573.0,1193910.0,...,0.35336,0,1,0,0,0,0,0,0,0
2,2,666057,469875,925239,665387,467214,923430,2912.8,4102.75,387714.0,...,0.052183,0,0,1,0,0,0,0,0,0
3,3,696633,453516,925806,696648,453417,921875,3272.83,31123.0,424714.0,...,0.052183,0,0,1,0,0,0,0,0,0
4,4,567525,585921,888426,570714,583676,891264,4441.75,114201.0,250411.0,...,0.009847,0,0,1,0,0,0,0,0,0


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

# Model inputs, named once so that training and prediction use the same columns.
feature_cols = ["fw_similarity", "adp_dist"] + prj_cols

# Rebalance the training rows with random oversampling before fitting.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(df[feature_cols], df["connected"])

# Standardise the inputs, then fit a logistic regression on the resampled rows.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(random_state=2)),
    ]
)
pipe.fit(X_resampled, y_resampled)

# Score the leaderboard pairs. Column 1 of predict_proba is the second class in
# pipe.classes_ (presumably connected == True - confirm the label dtype).
lb_df["pred"] = pipe.predict_proba(lb_df[feature_cols])[:, 1]

# Boolean prediction: call a pair connected when its score exceeds 0.5.
lb_df["connected"] = lb_df["pred"].gt(0.5)

In [20]:
# Spot-check the first rows, which now carry the `pred` and `connected` columns.
lb_df.head(n=5)

Unnamed: 0,ID,axonal_coor_x,axonal_coor_y,axonal_coor_z,dendritic_coor_x,dendritic_coor_y,dendritic_coor_z,adp_dist,post_skeletal_distance_to_soma,pre_skeletal_distance_to_soma,...,proj_grp_AL->RL,proj_grp_AL->V1,proj_grp_RL->AL,proj_grp_RL->RL,proj_grp_RL->V1,proj_grp_V1->AL,proj_grp_V1->RL,proj_grp_V1->V1,pred,connected
0,0,527499,544761,912282,529457,543459,916958,4783.37,239163.0,178032.0,...,0,1,0,0,0,0,0,0,0.057975,False
1,1,1269890,471870,837816,1271230,469651,841045,4060.72,294573.0,1193910.0,...,1,0,0,0,0,0,0,0,0.106834,False
2,2,666057,469875,925239,665387,467214,923430,2912.8,4102.75,387714.0,...,0,1,0,0,0,0,0,0,0.300915,False
3,3,696633,453516,925806,696648,453417,921875,3272.83,31123.0,424714.0,...,0,1,0,0,0,0,0,0,0.232678,False
4,4,567525,585921,888426,570714,583676,891264,4441.75,114201.0,250411.0,...,0,1,0,0,0,0,0,0,0.083065,False


In [21]:
# The submission needs exactly two columns: ID and connected.
# Select them explicitly: DataFrame.filter(items) silently skips labels that are
# missing, which would let a malformed submission through; indexing raises KeyError.
submission_data = lb_df[["ID", "connected"]].copy()

In [22]:
# Display the submission frame as this cell's output.
submission_data

Unnamed: 0,ID,connected
0,0,False
1,1,False
2,2,False
3,3,False
4,4,False
...,...,...
42588,42588,True
42589,42589,False
42590,42590,False
42591,42591,False


In [25]:
# Write the submission to disk; index=False keeps the row index out of the file.
submission_data.to_csv(path_or_buf="./submission_1.csv", index=False)