In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three CSV inputs in one pass:
#   data             - training data on each potential synapse
#   feature_weights  - additional per-neuron features
#   morph_embeddings - additional per-neuron features
data, feature_weights, morph_embeddings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./train_data.csv",
        "./feature_weights.csv",
        "./morph_embeddings.csv",
    )
)

In [2]:
def _pack_columns(frame, prefix, target):
    """Collapse every column whose name matches ``prefix`` into one array column.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding one column per vector component (e.g. ``feature_weight_0``).
    prefix : str
        Regex passed to ``DataFrame.filter`` to select the component columns.
    target : str
        Name of the new column that holds one ``np.ndarray`` per row.

    Returns
    -------
    pd.DataFrame
        A new frame without the component columns and with ``target`` appended.
        If no column matches ``prefix`` (the cell was already run), ``frame`` is
        returned unchanged so that re-running the cell is harmless.
    """
    # Column labels are sorted as strings, so the component order is the same
    # for every row (note: lexicographic, e.g. "*_10" sorts before "*_2").
    parts = frame.filter(regex=prefix).sort_index(axis=1)
    if parts.shape[1] == 0:
        return frame
    packed = parts.apply(lambda row: np.array(row), axis=1)
    return frame.drop(columns=parts.columns).assign(**{target: packed})


# join all feature_weight_i columns into a single np.array column
# (the individual feature_weight_i columns are removed)
feature_weights = _pack_columns(feature_weights, "feature_weight_", "feature_weights")

# join all morph_emb_i columns into a single np.array column
# (the individual morph_emb_i columns are removed)
morph_embeddings = _pack_columns(morph_embeddings, "morph_emb_", "morph_embeddings")

In [3]:
# Attach the per-neuron tables to every training row, once for the
# pre-synaptic neuron and once for the post-synaptic neuron.
# No `on=` is given, so each merge joins on whatever column names the two
# frames share after prefixing (presumably the pre_/post_ neuron id - verify).
# validate="m:1" makes pandas check that the right-hand keys are unique.
for neuron_table in (feature_weights, morph_embeddings):
    for side in ("pre_", "post_"):
        data = data.merge(
            neuron_table.add_prefix(side),
            how="left",
            validate="m:1",
            copy=False,
        )

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
#cosine similarity function
def row_feature_similarity(row):
    """Cosine similarity between a row's pre- and post- feature weight vectors.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide ``"pre_feature_weights"`` and ``"post_feature_weights"``.
        The two values are multiplied element-wise, so they need matching
        (or broadcastable) shapes.

    Returns
    -------
    float
        ``sum(pre * post) / (norm(pre) * norm(post))``.
        Returns ``0.0`` when either vector has zero norm (instead of the
        NaN + RuntimeWarning that 0/0 would produce). NaN inputs still
        propagate as NaN.
    """
    # asarray lets a scalar NaN be handled like an array, so the result is
    # NaN rather than an AttributeError.
    pre = np.asarray(row["pre_feature_weights"])
    post = np.asarray(row["post_feature_weights"])
    denom = np.linalg.norm(pre) * np.linalg.norm(post)
    if denom == 0:
        # Direction is undefined for a zero vector; treat it as "no similarity".
        return 0.0
    return (pre * post).sum() / denom

In [6]:
# Apply row_feature_similarity to every row to get the cosine similarity
# between the pre- and post- feature weights, then store it as a column.
fw_similarity = data.apply(row_feature_similarity, axis="columns")
data["fw_similarity"] = fw_similarity

In [7]:
# Label each row with its projection group, formatted as "<pre area>-><post area>".
data["projection_group"] = (
    data["pre_brain_area"]
    .astype(str)
    .str.cat(data["post_brain_area"].astype(str), sep="->")
)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Feature / target columns, defined once so that fit and predict always use
# exactly the same inputs.
numeric_cols = ["fw_similarity", "adp_dist"]
categorical_cols = ["projection_group"]
feature_cols = numeric_cols + categorical_cols
target_col = "connected"

# Column transformer: StandardScaler for the numerical columns and
# OneHotEncoder for the categorical column.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        # handle_unknown='ignore': a projection group that is absent from a
        # training fold is encoded as all zeros instead of raising an error
        # during CV scoring or test-time prediction.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough'  # no effect here: every column passed in is listed above
)

# imblearn pipeline: preprocessing -> SMOTE oversampling -> classifier.
# The sampler is only applied while fitting, so validation folds and the test
# set keep their original class balance.
# NOTE(review): SMOTE interpolates the one-hot columns as if they were
# continuous; SMOTENC may be a better fit for the categorical feature - confirm.
pipe = ImbPipeline([
    ('preprocessing', preprocessor),
    ('sampling', SMOTE(random_state=2)),
    ('model', RandomForestClassifier(random_state=2))
])

# Random-forest hyperparameters to search; keys are prefixed with the pipeline
# step name ('model__'). 3 * 4 * 3 * 3 = 108 candidates.
param_grid = {
    'model__n_estimators': [100, 200, 300],  # Number of trees in the forest
    'model__max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'model__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'model__min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
}

# Hold out 20% of the rows for the final evaluation.
# NOTE(review): the split is not stratified on the target; with a heavily
# imbalanced label, consider stratify=data[target_col].
train_data, test_data = train_test_split(data, test_size=0.2, random_state=1)

grid_search = GridSearchCV(pipe, param_grid, scoring='balanced_accuracy', cv=5, verbose=3, n_jobs=-1)

grid_search.fit(train_data[feature_cols], train_data[target_col])

# Get the best estimator (refit on the whole training split by GridSearchCV)
best_model = grid_search.best_estimator_

# Predict on test data. assign() builds a new frame, so we never write into
# the slice returned by train_test_split (avoids SettingWithCopyWarning).
test_data = test_data.assign(pred=best_model.predict(test_data[feature_cols]))

Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

In [None]:
# Display the fitted estimator stored under the pipeline's 'model' step.
best_model.named_steps["model"]

LogisticRegression(C=0.002, random_state=2, solver='saga')

In [None]:
# Score the held-out predictions against the true labels.
y_true = test_data['connected']
# 'pred' is thresholded at 0.5 to obtain a boolean prediction vector.
y_pred = test_data['pred'] > 0.5

accuracy = accuracy_score(y_true, y_pred)
balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)

# Report plain accuracy, balanced accuracy, then the confusion matrix
# (rows: true class, columns: predicted class).
print(f"Accuracy: {accuracy}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(conf_matrix)


Accuracy: 0.6615276993031453
Balanced Accuracy: 0.7379474729808009
[[24366 12530]
 [   50   221]]


In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import GridSearchCV
# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline as ImbPipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder


# # Create a column transformer with OneHotEncoder for the categorical column
# numeric_cols = ["fw_similarity", "adp_dist"]
# # and StandardScaler for the numerical columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_cols),
#     ],
#     remainder='passthrough'  # This will pass through any other columns untouched
# )

# # Now, modify your pipeline to include this preprocessor
# pipe = ImbPipeline([
#     ('preprocessing', preprocessor),  # Note how we include the preprocessor here
#     ('sampling', SMOTE(random_state=2)),
#     ('model', LogisticRegression(random_state=2))
# ])

# # Your param_grid remains the same, but you need to adjust the keys accordingly
# param_grid = {
#     'model__C': [0.00001, 0.0001, 0.0005, 0.001, 0.002, 0.003, 0.004, 0.01, 0.1, 1],
#     'model__penalty': ['l1', 'l2'],
#     'model__solver': ['liblinear', 'saga']
#     # Add or modify parameters for the 'preprocessing' step if needed
# }

# train_data, test_data = train_test_split(data, test_size=0.2, random_state=1)  # was `df`, which is never defined in this notebook

# grid_search = GridSearchCV(pipe, param_grid, scoring='balanced_accuracy', cv=5, verbose=2, n_jobs=-1)

# grid_search.fit(train_data[["fw_similarity", "adp_dist"]], train_data["connected"])

# # Get the best estimator
# best_model = grid_search.best_estimator_

# # Predict on test data
# test_data['pred'] = best_model.predict(test_data[["fw_similarity", "adp_dist"]])

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=   0.1s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=   0.1s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=   0.1s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=   0.2s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=liblinear; total time=   0.2s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total time=   0.2s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total time=   0.2s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total time=   0.2s
[CV] END model__C=1e-05, model__penalty=l2, model__solver=liblinear; total time=   0.1s
[CV] END model__C=1e-05, model__penalty=l2, model__solver=liblinear; total time=   0.1s
[CV] END model__C=1e-05, model__penalty=l1, model__solver=saga; total tim

In [None]:
# best_model.steps[2][1]

LogisticRegression(C=0.0001, random_state=2, solver='liblinear')

In [None]:
# # Compute accuracy
# accuracy = accuracy_score(test_data['connected'], test_data['pred'] > 0.5)
# print(f"Accuracy: {accuracy}")

# # Compute balanced accuracy
# balanced_accuracy = balanced_accuracy_score(test_data['connected'], test_data['pred'] > 0.5)
# print(f"Balanced Accuracy: {balanced_accuracy}")

# # Display the confusion matrix
# conf_matrix = confusion_matrix(test_data['connected'], test_data['pred'] > 0.5)
# print(conf_matrix)


Accuracy: 0.6214114671617295
Balanced Accuracy: 0.7250678980391279
[[22871 14025]
 [   46   225]]


ACC WITH PROJ:
Accuracy: 0.6564156375279145
Balanced Accuracy: 0.7372041349695804
[[24175 12721]
 [   49   222]]