In [None]:
# IPython magic tools
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../../../src/')

import os

from aind_vr_foraging_analysis.utils import parse, processing, plotting_utils as plotting, AddExtraColumns

# Plotting libraries
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches

import seaborn as sns
import pandas as pd
import numpy as np

# Use seaborn's 'talk' plotting context for every figure in this notebook
sns.set_context('talk')

import warnings
# NOTE(review): the filters below silence whole warning categories for the entire notebook,
# including the FutureWarnings pandas uses to announce deprecated behaviour - consider narrowing them.
pd.options.mode.chained_assignment = None  # Ignore SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# NOTE(review): pdf_path and base_path are absolute paths on a mapped Z: drive and are not
# referenced by any cell visible in this notebook; only data_path is used (to read batch_4.csv).
pdf_path = r'Z:\scratch\vr-foraging\sessions'
base_path = r'Z:\scratch\vr-foraging\data'
data_path = r'../../../data/'

# Define exponential function
def exponential_func(x, a, b):
    """Evaluate the exponential curve ``a * exp(b * x)``.

    ``x`` is passed to ``np.exp``, so it may be a scalar or an array;
    ``a`` scales the curve and ``b`` multiplies ``x`` inside the exponent.
    """
    exponent = b * x
    return a * np.exp(exponent)

def format_func(value, tick_number):
    """Render ``value`` as text with no decimal places.

    ``tick_number`` is accepted but not used by the function body.
    """
    return format(value, ".0f")

# NOTE(review): user-specific absolute path; not referenced by any cell visible in this notebook.
results_path = r'C:\Users\tiffany.ona\OneDrive - Allen Institute\Documents\VR foraging\experiments\batch 4 - manipulating cost of travelling and global statistics\results'

import plotting_friction_experiment as friction

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

from scipy.stats import ttest_1samp



In [4]:
def grid_search(X_mouse_scaled, y_mouse):
    """Choose the logistic-regression regularisation strength ``C`` by grid search.

    Parameters
    ----------
    X_mouse_scaled : array-like
        Feature matrix handed to ``GridSearchCV.fit``.
    y_mouse : array-like
        Target labels handed to ``GridSearchCV.fit``.

    Returns
    -------
    The candidate ``C`` (one of 0.001, 0.01, 0.1, 1, 10, 100) with the best mean
    5-fold cross-validated accuracy. The chosen value and its score are also printed.
    """
    # Candidate regularisation strengths
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

    # Named `search` so the local object does not shadow this function's own name
    search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
    search.fit(X_mouse_scaled, y_mouse)

    best_C = search.best_params_['C']
    print(f"The best value for C is: {best_C}")
    print(f"The best cross-validation score is: {search.best_score_:.2f}")

    return best_C

**Load the dataset**

In [None]:
print('Loading')

# Experiments that are kept
kept_experiments = [
    'data_collection', 'friction', 'control',
    'distance_long', 'distance_short',
    'friction_15', 'friction_optimized',
    'distance_extra_long', 'distance_extra_short',
]
# Mice that are removed from the dataset
excluded_mice = [754573, 754572]

summary_df = pd.read_csv(os.path.join(data_path, 'batch_4.csv'), index_col=0)
# cumulative_rewards divided by (visit_number + 1)
summary_df['perceived_reward_probability'] = summary_df['cumulative_rewards'] / (summary_df['visit_number'] + 1)
summary_df = summary_df[summary_df['experiment'].isin(kept_experiments)]
summary_df = summary_df[~summary_df['mouse'].isin(excluded_mice)]

In [6]:
# Keep only the 'RewardSite' rows of the 'data_collection' experiment
in_data_collection = summary_df['experiment'] == "data_collection"
is_reward_site = summary_df['label'] == 'RewardSite'
summary_df = summary_df.loc[in_data_collection & is_reward_site]

### **Run first version of the model**

#### **Run model without any interaction**

In [None]:
features = ['reward_probability', 'consecutive_failures', 'visit_number', 'cumulative_rewards', 'active_patch']

# Regularisation strength, fixed for every session
# (grid_search(X, y), defined earlier in the notebook, can be used to tune it per session instead).
best_C = 1

# Per-session results are collected in lists and assembled once after the loop: growing a
# dataframe with pd.concat on every iteration is quadratic, and concatenating onto an empty
# frame relies on pandas behaviour that is deprecated.
weights_per_session = []
cv_records = []

for (mouse, session), mouse_df in summary_df.groupby(['mouse', 'session']):
    print(f"Mouse: {mouse}, Session: {session}")

    # Select features and target variable
    X_mouse = mouse_df[features]
    y_mouse = mouse_df['has_choice'].astype(int)

    # Standardise inside a pipeline so that, during cross-validation, the scaler is fitted on
    # the training folds only (scaling the whole session first leaks held-out statistics).
    log_reg = LogisticRegression(C=best_C)
    pipeline = make_pipeline(StandardScaler(), log_reg)

    # Perform 5-fold cross-validation
    cv_scores = cross_val_score(pipeline, X_mouse, y_mouse, cv=5)

    # Fit on the full session; the weights therefore refer to the standardised features
    pipeline.fit(X_mouse, y_mouse)
    fitted_log_reg = pipeline.named_steps['logisticregression']

    # One row per regressor for this session
    weights_per_session.append(pd.DataFrame({
        'regressors': features,
        'weights': fitted_log_reg.coef_[0],
        'mouse': mouse,
        'session': session,
    }))
    cv_records.append({'mouse': mouse, 'cv_score': cv_scores.mean(), 'session': session})

    # Print the mean cross-validation score
    print(f"Mean cross-validation score: {cv_scores.mean():.2f}")
    print('\n')

if weights_per_session:
    weights_df = pd.concat(weights_per_session, ignore_index=True)
    # Mouse ids are turned into strings (they are used as grouping labels and panel titles below)
    weights_df['mouse'] = weights_df['mouse'].round(0).astype(str)
else:
    # No session to fit: keep the expected columns so later cells fail less obscurely
    weights_df = pd.DataFrame(columns=['regressors', 'weights', 'mouse', 'session'])

cv_results_df = pd.DataFrame(cv_records, columns=['mouse', 'cv_score', 'session'])

In [None]:
# One panel per mouse; every point is the weight of one regressor in one session
fig, axes = plt.subplots(4, 4, figsize=(20, 14), sharex=True)

# p-value cut-offs and their markers, from most to least stringent
star_levels = ((0.001, '***'), (0.01, '**'), (0.05, '*'))

for (mouse, group), ax in zip(weights_df.groupby('mouse'), axes.flatten()):
    regressor_names = list(group['regressors'].unique())

    # One-sample t-test against zero for each regressor of this mouse; keep
    # (x position, highest weight, stars) for the ones that reach significance
    starred = []
    for position, regressor in enumerate(regressor_names):
        regressor_weights = group.loc[group['regressors'] == regressor, 'weights']
        t_stat, p_value = ttest_1samp(regressor_weights, 0)
        stars = next((marker for cutoff, marker in star_levels if p_value < cutoff), None)
        if stars is not None:
            starred.append((position, regressor_weights.max(), stars))

    # Session-level weights
    sns.swarmplot(data=group, x='regressors', y='weights', hue='regressors',
                  palette='tab10', legend=False, ax=ax)
    ax.set_title(f'Mouse {mouse}')
    ax.set_xlabel('')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    ax.hlines(0, -0.5, len(regressor_names) - 0.5, color='black', linestyle='--')

    # Stars are written slightly above the highest point of the regressor
    for position, top_weight, stars in starred:
        ax.text(position, top_weight + 0.05, stars, ha='center', va='bottom', fontsize=12, color='black')

# Hand-built legend: one patch per regressor, coloured in palette order
legend_labels = weights_df['regressors'].unique()
legend_colors = sns.color_palette('tab10', len(legend_labels))
handles = [
    mpatches.Patch(color=color, label=regressor)
    for regressor, color in zip(legend_labels, legend_colors)
]

# Shared legend anchored below the panels, three columns wide
fig.legend(
    handles=handles,
    title='Features',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
    ncol=3,
    prop={'size': 12},
)

sns.despine()
plt.tight_layout()
plt.subplots_adjust(bottom=0.15)  # leave room at the bottom for the legend
plt.xticks(rotation=45, ha='right')
plt.show()


In [None]:


# Aggregate the weights by mouse and regressor (one mean weight per mouse and regressor).
# Only the 'weights' column is averaged: calling .mean() on the whole grouped frame also tries to
# average 'session', which raises a TypeError in pandas >= 2.0 when that column is not numeric.
aggregated_df = weights_df.groupby(['mouse', 'regressors'], as_index=False)['weights'].mean()

# Perform a one-sample t-test against zero for each regressor (one observation per mouse)
t_test_results = []
for regressor in aggregated_df['regressors'].unique():
    regressor_data = aggregated_df[aggregated_df['regressors'] == regressor]['weights']
    t_stat, p_value = ttest_1samp(regressor_data, 0)
    
    # Determine the significance level
    if p_value < 0.001:
        significance = '***'
    elif p_value < 0.01:
        significance = '**'
    elif p_value < 0.05:
        significance = '*'
    else:
        significance = None

    t_test_results.append({'regressor': regressor, 'p_value': p_value, 'significance': significance})

t_test_results_df = pd.DataFrame(t_test_results)

# Plot
plt.figure(figsize=(5, 6))

# One point per mouse
sns.swarmplot(
    data=aggregated_df, 
    x='regressors', 
    y='weights', 
    hue='regressors', 
    palette='tab10', 
    dodge=True
)

# Annotate significance levels slightly above the highest point of each regressor
for i, row in t_test_results_df.iterrows():
    regressor = row['regressor']
    significance = row['significance']
    if significance:
        x = list(aggregated_df['regressors'].unique()).index(regressor)
        y = aggregated_df[aggregated_df['regressors'] == regressor]['weights'].max() + 0.1
        plt.text(x, y, significance, ha='center', va='bottom', fontsize=12, color='black')

# Add horizontal line at 0
plt.axhline(0, color='black', linestyle='--')

# Customize labels
plt.xlabel('')
plt.xlim(-1, len(aggregated_df['regressors'].unique()))
plt.ylabel('Weight')
plt.xticks(rotation=45, ha='right')
plt.title('Weights Per Regressor \n (Aggregated by Mouse)')

# Legend patches, one per regressor in palette order.
# NOTE(review): `handles` is built but never passed to a legend call in this cell - confirm
# whether a legend was intended here.
handles = []
for regressor, color in zip(aggregated_df['regressors'].unique(), sns.color_palette('tab10', len(aggregated_df['regressors'].unique()))):
    handles.append(mpatches.Patch(color=color, label=regressor))
plt.tight_layout()
sns.despine()
plt.show()


#### **Run model with pairwise interaction terms**

In [None]:
features = ['reward_probability', 'consecutive_failures', 'visit_number', 'cumulative_rewards', 'active_patch']

# Per-session results are collected in lists and assembled once after the loop: growing a
# dataframe with pd.concat on every iteration is quadratic, and concatenating onto an empty
# frame relies on pandas behaviour that is deprecated.
weights_per_session = []
cv_records = []

for (mouse, session), mouse_df in summary_df.groupby(['mouse', 'session']):
    print(f"Mouse: {mouse}, Session: {session}")
    
    # Select features and target variable
    X_mouse = mouse_df[features]
    y_mouse = mouse_df['has_choice'].astype(int)
    
    # Pipeline: main effects + pairwise products -> standardisation -> logistic regression
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    scaler = StandardScaler()
    log_reg = LogisticRegression(C=1, max_iter=1000)
    pipeline = make_pipeline(poly, scaler, log_reg)
    
    # Perform 5-fold cross-validation (the whole pipeline is refitted on each training split)
    cv_scores = cross_val_score(pipeline, X_mouse, y_mouse, cv=5)
    
    # Fit the pipeline on the full session
    pipeline.fit(X_mouse, y_mouse)
    
    # Read the expanded feature names from the fitted step instead of fitting the
    # transformer a second time
    poly_features = pipeline.named_steps['polynomialfeatures'].get_feature_names_out(features)
    log_reg_model = pipeline.named_steps['logisticregression']
    
    # One row per (expanded) regressor for this session
    weights_per_session.append(pd.DataFrame({
        'regressors': poly_features,
        'weights': log_reg_model.coef_[0],
        'mouse': mouse,
        'session': session,
    }))
    cv_records.append({'mouse': mouse, 'cv_score': cv_scores.mean(), 'session': session})

    # Print the mean cross-validation score
    print(f"Mean cross-validation score: {cv_scores.mean():.2f}")
    print('\n')

if weights_per_session:
    weights_df = pd.concat(weights_per_session, ignore_index=True)
    # Mouse ids are turned into strings (they are used as grouping labels and panel titles below)
    weights_df['mouse'] = weights_df['mouse'].round(0).astype(str)
else:
    # No session to fit: keep the expected columns so later cells fail less obscurely
    weights_df = pd.DataFrame(columns=['regressors', 'weights', 'mouse', 'session'])

cv_results_df = pd.DataFrame(cv_records, columns=['mouse', 'cv_score', 'session'])

In [None]:
# One panel per mouse; every point is the weight of one regressor in one session
fig, axes = plt.subplots(2, 5, figsize=(26, 8), sharex=True)

# p-value cut-offs and their markers, from most to least stringent
star_levels = ((0.001, '***'), (0.01, '**'), (0.05, '*'))

for (mouse, group), ax in zip(weights_df.groupby('mouse'), axes.flatten()):
    regressor_names = list(group['regressors'].unique())

    # One-sample t-test against zero for each regressor of this mouse; keep
    # (x position, highest weight, stars) for the ones that reach significance
    starred = []
    for position, regressor in enumerate(regressor_names):
        regressor_weights = group.loc[group['regressors'] == regressor, 'weights']
        t_stat, p_value = ttest_1samp(regressor_weights, 0)
        stars = next((marker for cutoff, marker in star_levels if p_value < cutoff), None)
        if stars is not None:
            starred.append((position, regressor_weights.max(), stars))

    # Session-level weights
    sns.swarmplot(data=group, x='regressors', y='weights', hue='regressors',
                  palette='tab20', legend=False, ax=ax)
    ax.set_title(f'Mouse {mouse}')
    ax.set_xlabel('')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    ax.hlines(0, -0.5, len(regressor_names) - 0.5, color='black', linestyle='--')

    # Stars are written slightly above the highest point of the regressor
    for position, top_weight, stars in starred:
        ax.text(position, top_weight + 0.05, stars, ha='center', va='bottom', fontsize=12, color='black')

# Hand-built legend: one patch per regressor, coloured in palette order
legend_labels = weights_df['regressors'].unique()
legend_colors = sns.color_palette('tab20', len(legend_labels))
handles = [
    mpatches.Patch(color=color, label=regressor)
    for regressor, color in zip(legend_labels, legend_colors)
]

# Shared legend anchored near the bottom of the figure, three columns wide
fig.legend(
    handles=handles,
    title='Features',
    loc='upper center',
    bbox_to_anchor=(0.5, 0.05),
    ncol=3,
    prop={'size': 12},
)

sns.despine()
plt.tight_layout()
plt.subplots_adjust(bottom=0.10)  # leave room at the bottom for the legend
plt.xticks(rotation=45, ha='right')
plt.show()


In [None]:

# Aggregate the weights by mouse and regressor (one mean weight per mouse and regressor).
# Only the 'weights' column is averaged: calling .mean() on the whole grouped frame also tries to
# average 'session', which raises a TypeError in pandas >= 2.0 when that column is not numeric.
aggregated_df = weights_df.groupby(['mouse', 'regressors'], as_index=False)['weights'].mean()

# Perform a one-sample t-test against zero for each regressor (one observation per mouse)
t_test_results = []
for regressor in aggregated_df['regressors'].unique():
    regressor_data = aggregated_df[aggregated_df['regressors'] == regressor]['weights']
    t_stat, p_value = ttest_1samp(regressor_data, 0)
    
    # Determine the significance level
    if p_value < 0.001:
        significance = '***'
    elif p_value < 0.01:
        significance = '**'
    elif p_value < 0.05:
        significance = '*'
    else:
        significance = None

    t_test_results.append({'regressor': regressor, 'p_value': p_value, 'significance': significance})

t_test_results_df = pd.DataFrame(t_test_results)

# Plot
plt.figure(figsize=(10, 6))

# One point per mouse
sns.swarmplot(
    data=aggregated_df, 
    x='regressors', 
    y='weights', 
    hue='regressors', 
    palette='tab20', 
    dodge=True
)

# Annotate significance levels slightly above the highest point of each regressor
# (the text is shifted 0.2 to the left of the category position)
for i, row in t_test_results_df.iterrows():
    regressor = row['regressor']
    significance = row['significance']
    if significance:
        x = list(aggregated_df['regressors'].unique()).index(regressor)
        y = aggregated_df[aggregated_df['regressors'] == regressor]['weights'].max() + 0.1
        plt.text(x-0.2, y, significance, ha='center', va='bottom', fontsize=12, color='black')

# Add horizontal line at 0
plt.axhline(0, color='black', linestyle='--')

# Customize labels
plt.xlabel('')
plt.xlim(-1.5, len(aggregated_df['regressors'].unique()) - 0.5)
plt.ylabel('Weight')
plt.xticks(rotation=45, ha='right')
plt.title('Weights Per Regressor (Aggregated by Mouse)')

# Legend patches, one per regressor in palette order.
# NOTE(review): `handles` is built but never passed to a legend call in this cell - confirm
# whether a legend was intended here.
handles = []
for regressor, color in zip(aggregated_df['regressors'].unique(), sns.color_palette('tab20', len(aggregated_df['regressors'].unique()))):
    handles.append(mpatches.Patch(color=color, label=regressor))
plt.tight_layout()
plt.subplots_adjust(bottom=0.3)  # Extra space at the bottom of the figure
sns.despine()
plt.show()


In [None]:
# Collapse sessions: one mean weight per mouse and regressor
collapsed_df = (
    weights_df
    .groupby(['mouse', 'regressors'], as_index=False)
    .agg({'weights': 'mean'})  # Replace 'mean' with the desired aggregation function
)

# Plot using the collapsed data; both layers are drawn on the axes created here
fig, ax = plt.subplots(figsize=(4, 4))
sns.swarmplot(data=collapsed_df, x='regressors', y='weights', hue='mouse', palette='tab10', ax=ax)
sns.boxplot(data=collapsed_df, x='regressors', y='weights', color='grey', ax=ax)

# Zero line spanning every regressor on the x axis (the number of regressors depends on
# which model produced weights_df, so it is not hard-coded)
n_regressors = collapsed_df['regressors'].nunique()
ax.hlines(0, -0.5, n_regressors - 0.5, color='black', linestyle='--')

ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., ncol=2)
plt.xticks(rotation=45, ha='right')
sns.despine()
plt.show()

#### **Remove variables one at a time to find which ones are needed for the model**

In [None]:
features = ['reward_probability', 'consecutive_failures', 'visit_number', 'cumulative_rewards', 'active_patch']

# Results are accumulated in lists and turned into dataframes once, after the loops
# (growing a dataframe with pd.concat on every iteration is quadratic and relies on
# deprecated pandas behaviour when the starting frame is empty).
weight_rows = []
cv_records = []

# Perform logistic regression excluding one feature at a time
for feature in features:
    # Every feature except the one being left out
    new_features = [name for name in features if name != feature]

    # Mice, and the sessions of each mouse, are visited in their order of first appearance
    for mouse, mouse_sessions in summary_df.groupby('mouse', sort=False):
        for session, mouse_df in mouse_sessions.groupby('session', sort=False):
            print(f"Mouse: {mouse}, Session: {session}")

            # Select features and target variable
            X_mouse = mouse_df[new_features]
            y_mouse = mouse_df['has_choice'].astype(int)

            # The estimator is created here so the cell does not depend on a `log_reg` left in
            # the kernel by an earlier cell; the settings match the interaction-model cell above.
            log_reg = LogisticRegression(C=1, max_iter=1000)
            # Standardise inside a pipeline so that, during cross-validation, the scaler is
            # fitted on the training folds only
            pipeline = make_pipeline(StandardScaler(), log_reg)

            # Perform 5-fold cross-validation
            cv_scores = cross_val_score(pipeline, X_mouse, y_mouse, cv=5)

            # Fit on the full session; the weights refer to the standardised features
            pipeline.fit(X_mouse, y_mouse)
            fitted_log_reg = pipeline.named_steps['logisticregression']

            # The left-out feature has no key here and becomes NaN in weights_df
            weight_rows.append(dict(zip(new_features, fitted_log_reg.coef_[0])))
            cv_records.append({
                'mouse': mouse,
                'cv_score': cv_scores.mean(),
                'feature_removed': feature,
                'session': session,
            })

            # Print the cross-validation scores and their mean
            print(f"Cross-validation scores: {cv_scores}")
            print(f"Mean cross-validation score: {cv_scores.mean():.2f}")
            print('\n')

# One row per fitted model, in the same order in both frames
weights_df = pd.DataFrame(weight_rows, columns=features)
cv_results_df = pd.DataFrame(cv_records, columns=['mouse', 'cv_score', 'feature_removed', 'session'])


In [None]:
# Cross-validation score per mouse, one box per removed feature
fig, ax = plt.subplots(figsize=(12, 4))
sns.boxplot(
    data=cv_results_df,
    x='mouse',
    y='cv_score',
    hue='feature_removed',
    palette='tab10',
    ax=ax,
)
plt.xticks(rotation=45)
sns.despine()
# Legend placed outside the axes, to the right
ax.legend(title='Feature removed', loc='upper left', bbox_to_anchor=(1, 1))