In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
# Table read from train_data.csv (described in the notebook as one row per
# potential synapse).
data = pd.read_csv("./train_data.csv")

# Additional per-neuron feature tables, read in the same order as before.
feature_weights, morph_embeddings = (
    pd.read_csv(csv_path)
    for csv_path in ("./feature_weights.csv", "./morph_embeddings.csv")
)

In [2]:
def _pack_columns(frame, pattern, packed_name):
    """Collapse the columns of ``frame`` matching ``pattern`` into one column.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding the wide columns to collapse.
    pattern : str
        Regular expression handed to ``DataFrame.filter(regex=...)`` to pick
        the wide columns.
    packed_name : str
        Name of the column that receives one ``np.array`` per row.

    Returns
    -------
    pd.DataFrame
        A new frame with the matching columns removed and ``packed_name``
        appended as the last column. When no column matches (for example
        because this cell already ran), ``frame`` is returned unchanged so a
        re-run cannot overwrite the packed column.
    """
    wide = frame.filter(regex=pattern)
    if wide.shape[1] == 0:
        # Nothing left to pack; keep the existing frame intact.
        return frame
    # Columns are ordered by label before packing so every row uses the same
    # element order (labels are compared as given, not by numeric suffix).
    packed = wide.sort_index(axis=1).apply(lambda row: np.array(row), axis=1)
    return frame.drop(columns=wide.columns).assign(**{packed_name: packed})


# join all feature_weight_i columns into a single np.array column and
# drop the individual feature_weight_i columns
feature_weights = _pack_columns(
    feature_weights, "feature_weight_", "feature_weights"
)

# join all morph_emb_i columns into a single np.array column and
# drop the individual morph_emb_i columns
morph_embeddings = _pack_columns(
    morph_embeddings, "morph_emb_", "morph_embeddings"
)

In [3]:
# Attach each per-neuron table to `data` twice: once with its columns
# prefixed "pre_" and once prefixed "post_". No `on=` is passed, so every
# merge joins on the column names the prefixed table shares with `data`;
# validate="m:1" makes pandas check that the right-hand keys are unique.
for neuron_table in (feature_weights, morph_embeddings):
    for side_prefix in ("pre_", "post_"):
        data = data.merge(
            neuron_table.rename(columns=lambda col: side_prefix + col),
            how="left",
            validate="m:1",
            copy=False,
        )

In [5]:
#cosine similarity function
def row_feature_similarity(row):
    """Cosine similarity between a row's pre- and post- feature weights.

    Parameters
    ----------
    row : mapping-like
        Must provide "pre_feature_weights" and "post_feature_weights". Both
        values are used as NumPy arrays (elementwise product, ``.sum()`` and
        ``np.linalg.norm``).

    Returns
    -------
    The sum of the elementwise product of the two vectors divided by the
    product of their norms.
    """
    pre_vec = row["pre_feature_weights"]
    post_vec = row["post_feature_weights"]
    dot_product = (pre_vec * post_vec).sum()
    norm_product = np.linalg.norm(pre_vec) * np.linalg.norm(post_vec)
    return dot_product / norm_product

In [6]:
# Apply row_feature_similarity to every row of `data` (axis=1) and store the
# resulting per-row value as the "fw_similarity" column.
fw_similarity = data.apply(row_feature_similarity, axis=1)
data["fw_similarity"] = fw_similarity

In [7]:
# Label each row "<pre_brain_area>-><post_brain_area>"; both area columns are
# cast to str first so the concatenation works for any dtype.
pre_area = data["pre_brain_area"].astype(str)
post_area = data["post_brain_area"].astype(str)
data["projection_group"] = pre_area + "->" + post_area

In [45]:
# Display the column labels of `data` after the merges and derived features.
data.columns

Index(['ID', 'axonal_coor_x', 'axonal_coor_y', 'axonal_coor_z',
       'dendritic_coor_x', 'dendritic_coor_y', 'dendritic_coor_z', 'adp_dist',
       'post_skeletal_distance_to_soma', 'pre_skeletal_distance_to_soma',
       'pre_oracle', 'pre_test_score', 'pre_rf_x', 'pre_rf_y', 'post_oracle',
       'post_test_score', 'post_rf_x', 'post_rf_y', 'compartment',
       'pre_brain_area', 'post_brain_area', 'pre_nucleus_x', 'pre_nucleus_y',
       'pre_nucleus_z', 'post_nucleus_x', 'post_nucleus_y', 'post_nucleus_z',
       'pre_nucleus_id', 'post_nucleus_id', 'connected', 'pre_feature_weights',
       'post_feature_weights', 'pre_morph_embeddings', 'post_morph_embeddings',
       'fw_similarity', 'projection_group'],
      dtype='object')

In [53]:
# Interaction features: the feature-weight similarity multiplied by the
# post- and pre-side test scores (post first, matching the column order
# produced before).
similarity = data['fw_similarity']
data['fw_post_interaction'] = similarity * data['post_test_score']
data['fw_pre_interaction'] = similarity * data['pre_test_score']

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Columns fed to the model, grouped by how they are preprocessed.
numeric_cols = ["fw_similarity", "adp_dist", "pre_oracle", "post_oracle", "fw_pre_interaction", "fw_post_interaction"]
cat_cols = ['projection_group', "compartment"]
all_cols = numeric_cols + cat_cols

# Standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros; with the default ('error') a category that appears only in a
# validation fold or in the test split raises at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'  # This will pass through any other columns untouched
)

# imblearn pipeline: preprocessing -> ADASYN sampling -> logistic regression.
pipe = ImbPipeline([
    ('preprocessing', preprocessor),
    ('sampling', ADASYN(random_state=2)),
    ('model', LogisticRegression(random_state=2, max_iter=300))
])

# Hyper-parameters searched for the 'model' step (10 x 2 x 2 = 40 candidates).
param_grid = {
    'model__C': [0.00001, 0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 2, 5, 10],
    'model__penalty': ['l1', 'l2'],
    'model__solver': ['liblinear', 'saga']
    # Add or modify parameters for the 'preprocessing' step if needed
}

# Hold out 20% of the rows for the final evaluation.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=1)

# 5-fold grid search scored by balanced accuracy, using all available cores.
grid_search = GridSearchCV(pipe, param_grid, scoring='balanced_accuracy', cv=5, verbose=2, n_jobs=-1)

grid_search.fit(train_data[all_cols], train_data["connected"])

# Get the best estimator
best_model = grid_search.best_estimator_

# Predict on test data. `assign` builds a new frame instead of writing a
# column into the frame returned by train_test_split, which can trigger
# pandas' SettingWithCopyWarning.
test_data = test_data.assign(pred=best_model.predict(test_data[all_cols]))

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=  10.3s
[CV] END model__C=1e-05, model__penalty=l2, model__solver=liblinear; total time=   6.4s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=  17.0s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total time=  16.8s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=  17.2s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total time=  17.2s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total time=  17.4s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=  17.6s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=  17.8s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total time=  17.9s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total time=  1



[CV] END ..model__C=1, model__penalty=l1, model__solver=saga; total time=  43.4s
[CV] END ..model__C=2, model__penalty=l2, model__solver=saga; total time=  24.8s
[CV] END ..model__C=2, model__penalty=l2, model__solver=saga; total time=  25.7s
[CV] END model__C=5, model__penalty=l1, model__solver=liblinear; total time=   2.1s
[CV] END model__C=5, model__penalty=l1, model__solver=liblinear; total time=   2.4s
[CV] END ..model__C=2, model__penalty=l2, model__solver=saga; total time=  24.6s
[CV] END ..model__C=2, model__penalty=l2, model__solver=saga; total time=  24.6s
[CV] END model__C=5, model__penalty=l1, model__solver=liblinear; total time=   2.1s
[CV] END model__C=5, model__penalty=l1, model__solver=liblinear; total time=   2.2s
[CV] END model__C=5, model__penalty=l1, model__solver=liblinear; total time=   5.1s
[CV] END ..model__C=2, model__penalty=l2, model__solver=saga; total time=  25.4s




[CV] END ..model__C=2, model__penalty=l1, model__solver=saga; total time=  42.5s




[CV] END ..model__C=2, model__penalty=l1, model__solver=saga; total time=  41.7s
[CV] END ..model__C=2, model__penalty=l1, model__solver=saga; total time=  42.7s




[CV] END ..model__C=2, model__penalty=l1, model__solver=saga; total time=  42.0s
[CV] END ..model__C=2, model__penalty=l1, model__solver=saga; total time=  42.8s
[CV] END model__C=5, model__penalty=l2, model__solver=liblinear; total time=   2.0s
[CV] END model__C=5, model__penalty=l2, model__solver=liblinear; total time=   2.0s
[CV] END model__C=5, model__penalty=l2, model__solver=liblinear; total time=   2.5s
[CV] END model__C=5, model__penalty=l2, model__solver=liblinear; total time=   2.1s
[CV] END model__C=5, model__penalty=l2, model__solver=liblinear; total time=   2.1s
[CV] END ..model__C=5, model__penalty=l2, model__solver=saga; total time=  19.6s
[CV] END ..model__C=5, model__penalty=l2, model__solver=saga; total time=  20.8s
[CV] END ..model__C=5, model__penalty=l2, model__solver=saga; total time=  19.6s
[CV] END ..model__C=5, model__penalty=l2, model__solver=saga; total time=  20.5s
[CV] END ..model__C=5, model__penalty=l2, model__solver=saga; total time=  21.4s
[CV] END mode



[CV] END ..model__C=5, model__penalty=l1, model__solver=saga; total time=  41.9s




[CV] END ..model__C=5, model__penalty=l1, model__solver=saga; total time=  41.9s
[CV] END ..model__C=5, model__penalty=l1, model__solver=saga; total time=  42.3s
[CV] END model__C=10, model__penalty=l2, model__solver=liblinear; total time=   2.3s
[CV] END model__C=10, model__penalty=l2, model__solver=liblinear; total time=   2.1s




[CV] END model__C=10, model__penalty=l2, model__solver=liblinear; total time=   2.5s
[CV] END ..model__C=5, model__penalty=l1, model__solver=saga; total time=  42.3s
[CV] END model__C=10, model__penalty=l2, model__solver=liblinear; total time=   1.9s




[CV] END ..model__C=5, model__penalty=l1, model__solver=saga; total time=  43.4s
[CV] END model__C=10, model__penalty=l2, model__solver=liblinear; total time=   2.3s
[CV] END .model__C=10, model__penalty=l2, model__solver=saga; total time=   3.8s
[CV] END .model__C=10, model__penalty=l2, model__solver=saga; total time=   3.6s
[CV] END .model__C=10, model__penalty=l2, model__solver=saga; total time=   3.9s
[CV] END .model__C=10, model__penalty=l2, model__solver=saga; total time=   3.1s
[CV] END .model__C=10, model__penalty=l2, model__solver=saga; total time=  20.2s




[CV] END .model__C=10, model__penalty=l1, model__solver=saga; total time=  38.0s
[CV] END .model__C=10, model__penalty=l1, model__solver=saga; total time=  38.1s
[CV] END .model__C=10, model__penalty=l1, model__solver=saga; total time=  38.1s




[CV] END .model__C=10, model__penalty=l1, model__solver=saga; total time=  37.3s




[CV] END .model__C=10, model__penalty=l1, model__solver=saga; total time=  37.6s


In [61]:
# Show the fitted estimator of the pipeline's final ('model') step.
best_model.named_steps['model']

LogisticRegression(C=0.01, max_iter=300, penalty='l1', random_state=2,
                   solver='saga')

In [63]:
# Ground-truth labels and thresholded predictions, computed once and reused
# by all three metrics below.
y_true = test_data['connected']
y_pred = test_data['pred'] > 0.5

# Compute accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

# Compute balanced accuracy
balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
print(f"Balanced Accuracy: {balanced_accuracy}")

# Display the confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print(conf_matrix)


Accuracy: 0.7098501358732209
Balanced Accuracy: 0.7806008231374595
[[26152 10744]
 [   40   231]]


In [48]:
# NOTE(review): this cell repeats the evaluation cell above. Its execution
# count (48) is lower than that of the model-fitting cell (60), so the saved
# output below appears to come from an earlier fit -- confirm before relying
# on these numbers.
labels = test_data['connected']
thresholded = test_data['pred'] > 0.5

# Compute the three metrics first, then report them in the original order.
accuracy = accuracy_score(labels, thresholded)
balanced_accuracy = balanced_accuracy_score(labels, thresholded)
conf_matrix = confusion_matrix(labels, thresholded)

print(f"Accuracy: {accuracy}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(conf_matrix)


Accuracy: 0.7091236849893723
Balanced Accuracy: 0.7802349298156902
[[26125 10771]
 [   40   231]]


.7379

In [12]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import GridSearchCV
# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline as ImbPipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder


# # Create a column transformer with OneHotEncoder for the categorical column
# numeric_cols = ["fw_similarity", "adp_dist"]
# # and StandardScaler for the numerical columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_cols),
#     ],
#     remainder='passthrough'  # This will pass through any other columns untouched
# )

# # Now, modify your pipeline to include this preprocessor
# pipe = ImbPipeline([
#     ('preprocessing', preprocessor),  # Note how we include the preprocessor here
#     ('sampling', SMOTE(random_state=2)),
#     ('model', LogisticRegression(random_state=2))
# ])

# # Your param_grid remains the same, but you need to adjust the keys accordingly
# param_grid = {
#     'model__C': [0.00001, 0.0001, 0.0005, 0.001, 0.002, 0.003, 0.004, 0.01, 0.1, 1],
#     'model__penalty': ['l1', 'l2'],
#     'model__solver': ['liblinear', 'saga']
#     # Add or modify parameters for the 'preprocessing' step if needed
# }

# train_data, test_data = train_test_split(df, test_size=0.2, random_state=1)

# grid_search = GridSearchCV(pipe, param_grid, scoring='balanced_accuracy', cv=5, verbose=2, n_jobs=-1)

# grid_search.fit(train_data[["fw_similarity", "adp_dist"]], train_data["connected"])

# # Get the best estimator
# best_model = grid_search.best_estimator_

# # Predict on test data
# test_data['pred'] = best_model.predict(test_data[["fw_similarity", "adp_dist"]])

In [13]:
# best_model.steps[2][1]

In [14]:
# # Compute accuracy
# accuracy = accuracy_score(test_data['connected'], test_data['pred'] > 0.5)
# print(f"Accuracy: {accuracy}")

# # Compute balanced accuracy
# balanced_accuracy = balanced_accuracy_score(test_data['connected'], test_data['pred'] > 0.5)
# print(f"Balanced Accuracy: {balanced_accuracy}")

# # Display the confusion matrix
# conf_matrix = confusion_matrix(test_data['connected'], test_data['pred'] > 0.5)
# print(conf_matrix)


ACC WITH PROJ:
Accuracy: 0.6564156375279145
Balanced Accuracy: 0.7372041349695804
[[24175 12721]
 [   49   222]]