code by Ellie Peters

# imports

In [254]:
#packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing #not needed with current code but can be used to make a*b, a^2, b^2 terms if wanted
from sklearn.preprocessing import StandardScaler, MinMaxScaler#not needed with current code but can be used to scale features
import itertools
import re

#settings to display full dfs 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# read in data

In [255]:
##input required
file ="crossterms_testing" 
sheet = "Sheet1" 
header= [3] #adjust header rows as needed 
################

#read excel to df
inp_df = pd.read_excel(f"{file}.xlsx",sheet,header=header) 
#drop rows containing nan
inp_df = inp_df.dropna(axis=0) #axis=0 drops rows

#make dictionary for x numbers and names
parms_dict = {}
for col in inp_df.columns:
    parms_dict[col]=inp_df.loc[0,col]
#print(parms_dict)

#drop row containing x names
inp_df=inp_df.drop(index=0) #identifies row by index

#check your df
display(inp_df)

Unnamed: 0,ID,substrate,ligand,y_meas,x1,x2,x3,x4,x5,x6
1,24pyr_L1,24pyr,1,-0.0994365,-0.0616538,4.90372,17.1007,17.2711,19.6745,6.54109
3,24pyr_L10,24pyr,10,0.516354,-0.067275,4.63758,4.68636,10.6044,11.934,6.32459
4,24pyr_L11,24pyr,11,0.724141,-0.0668346,4.38261,9.17012,9.55844,11.5916,3.6523
6,24pyr_L16,24pyr,16,-2.03843,-0.0220091,5.31014,0.0,8.21734,8.21734,1.35131
7,24pyr_L17,24pyr,17,-2.14293,-0.0482304,4.28787,0.0,8.27906,8.27906,1.33771
8,24pyr_L62,24pyr,62,-2.53085,-0.0564602,4.80798,0.605367,8.25103,8.27933,1.31261
10,24pyr_L83,24pyr,83,0.217959,-0.061127,3.65858,6.61447,8.60556,10.1432,3.27473
11,24pyr_L90,24pyr,90,0.370499,-0.0644085,3.92963,2.85349,18.9895,19.3598,19.3973
12,24pyr_L104,24pyr,104,-0.0704694,-0.0380879,6.26314,10.1452,17.9989,19.3431,7.01174
13,24pyr_L330,24pyr,330,0.325186,-0.0578396,4.0271,9.26739,14.0324,20.6762,21.3825


# optional: scale the features

skip this section if you want to calculate interaction terms from unscaled features and then scale all features in the modeling script

note: if you scale features now, you do not need to scale in the modeling script.
    
this scales all features and will use the scaled features when calculating the interaction terms

note: scaling before train/test split introduced data leakage

In [241]:
def manual_standard_scaler(feat):
    #does the math "manually" instead of calling a scaler
    u = np.mean(feat)
    s = np.std(feat)
    scaled_feat = (feat-u)/s
    return(scaled_feat)

#replace feature values in inp_df with scaled feature values
for col in inp_df.columns:
    if 'x' in col: #skip the ID and y cols
        inp_df[col] = manual_standard_scaler(inp_df[col])

inp_df

Unnamed: 0,ID,substrate,ligand,y_meas,x1,x2,x3,x4,x5,x6
1,24pyr_L1,24pyr,1,-0.0994365,-0.0616538,4.90372,17.1007,17.2711,19.6745,6.54109
3,24pyr_L10,24pyr,10,0.516354,-0.067275,4.63758,4.68636,10.6044,11.934,6.32459
4,24pyr_L11,24pyr,11,0.724141,-0.0668346,4.38261,9.17012,9.55844,11.5916,3.6523
6,24pyr_L16,24pyr,16,-2.03843,-0.0220091,5.31014,0.0,8.21734,8.21734,1.35131
7,24pyr_L17,24pyr,17,-2.14293,-0.0482304,4.28787,0.0,8.27906,8.27906,1.33771
8,24pyr_L62,24pyr,62,-2.53085,-0.0564602,4.80798,0.605367,8.25103,8.27933,1.31261
10,24pyr_L83,24pyr,83,0.217959,-0.061127,3.65858,6.61447,8.60556,10.1432,3.27473
11,24pyr_L90,24pyr,90,0.370499,-0.0644085,3.92963,2.85349,18.9895,19.3598,19.3973
12,24pyr_L104,24pyr,104,-0.0704694,-0.0380879,6.26314,10.1452,17.9989,19.3431,7.01174
13,24pyr_L330,24pyr,330,0.325186,-0.0578396,4.0271,9.26739,14.0324,20.6762,21.3825


# gather interaction terms

In [242]:
##input required
#set ranges of features to use (use chain if you need to combine nonconsecutive ranges)
#use the number assosicated with the x number. e.g., for x3, use 3
range1_start = 1
range1_end   = 3
range2_start = 5
range2_end   = 6
#set types of interactions
int_types = ['*','/'] #options include multiplication and division. choose one or more. 
    #note: division is term from range 1 divided by term from range 2
################


#get the features assosicated with the ranges set
list1 = list(range(range1_start,range1_end+1))
list2 = list(range(range2_start,range2_end+1))
#add x to the feature numbers
list1 = ['x' + str(item) for item in list1]
list2 = ['x' + str(item) for item in list2]
#make list of cross terms to calc
new_terms = []
for operator in int_types:
    for pair in itertools.product(list1, list2):
        new_terms.append(operator.join(pair))

for term in new_terms:    
    #get elements of the terms
    f1=term
    for i in int_types:
        f1=f1.replace(i,'!')
    t1=f1.split('!')[0]
    t2=f1.split('!')[1]
    #get x names associated with x number 
    t1_name = parms_dict[t1] 
    t2_name = parms_dict[t2]
    
    #add column for new terms
    inp_df[term]=""
    #calc operations selected and add to new col
    if '*' in term: 
        inp_df[term]=inp_df[t1]*inp_df[t2] #gives the same result as using sklearn.preprocessing.polynomialfeatures
        t1t2_name = f"{t1_name}_*_{t2_name}"
    if '/' in term:
        inp_df[term]=inp_df[t1]*inp_df[t2]
        t1t2_name = f"{t1_name}_/_{t2_name}"
    #update parms_dict
    parms_dict[term]=t1t2_name

#add x names back to df
names_row = pd.DataFrame(parms_dict,index =[0])
out_df = pd.concat([names_row, inp_df]).reset_index(drop = True)

display(out_df)

Unnamed: 0,ID,substrate,ligand,y_meas,x1,x2,x3,x4,x5,x6,x1*x5,x1*x6,x2*x5,x2*x6,x3*x5,x3*x6,x1/x5,x1/x6,x2/x5,x2/x6,x3/x5,x3/x6
0,ID,substrate,ligand,DDG‡ measured,vmin_vmin_boltz,sterimol_B1_boltz,visvol_total_delta,visvol_prox_boltz,visvol_prox_max,vbur_max_delta_qvbur_vburminconf,vmin_vmin_boltz_*_visvol_prox_max,vmin_vmin_boltz_*_vbur_max_delta_qvbur_vburmin...,sterimol_B1_boltz_*_visvol_prox_max,sterimol_B1_boltz_*_vbur_max_delta_qvbur_vburm...,visvol_total_delta_*_visvol_prox_max,visvol_total_delta_*_vbur_max_delta_qvbur_vbur...,vmin_vmin_boltz_/_visvol_prox_max,vmin_vmin_boltz_/_vbur_max_delta_qvbur_vburmin...,sterimol_B1_boltz_/_visvol_prox_max,sterimol_B1_boltz_/_vbur_max_delta_qvbur_vburm...,visvol_total_delta_/_visvol_prox_max,visvol_total_delta_/_vbur_max_delta_qvbur_vbur...
1,24pyr_L1,24pyr,1,-0.0994365,-0.0616538,4.90372,17.1007,17.2711,19.6745,6.54109,-1.21301,-0.403283,96.4781,32.0757,336.446,111.857,-1.21301,-0.403283,96.4781,32.0757,336.446,111.857
2,24pyr_L10,24pyr,10,0.516354,-0.067275,4.63758,4.68636,10.6044,11.934,6.32459,-0.802858,-0.425487,55.3447,29.3308,55.9268,29.6393,-0.802858,-0.425487,55.3447,29.3308,55.9268,29.6393
3,24pyr_L11,24pyr,11,0.724141,-0.0668346,4.38261,9.17012,9.55844,11.5916,3.6523,-0.77472,-0.2441,50.8014,16.0066,106.296,33.492,-0.77472,-0.2441,50.8014,16.0066,106.296,33.492
4,24pyr_L16,24pyr,16,-2.03843,-0.0220091,5.31014,0,8.21734,8.21734,1.35131,-0.180856,-0.0297411,43.6352,7.17564,0,0,-0.180856,-0.0297411,43.6352,7.17564,0,0
5,24pyr_L17,24pyr,17,-2.14293,-0.0482304,4.28787,0,8.27906,8.27906,1.33771,-0.399302,-0.0645184,35.4995,5.73593,0,0,-0.399302,-0.0645184,35.4995,5.73593,0,0
6,24pyr_L62,24pyr,62,-2.53085,-0.0564602,4.80798,0.605367,8.25103,8.27933,1.31261,-0.467452,-0.0741102,39.8068,6.311,5.01204,0.794612,-0.467452,-0.0741102,39.8068,6.311,5.01204,0.794612
7,24pyr_L83,24pyr,83,0.217959,-0.061127,3.65858,6.61447,8.60556,10.1432,3.27473,-0.620024,-0.200174,37.1097,11.9808,67.092,21.6606,-0.620024,-0.200174,37.1097,11.9808,67.092,21.6606
8,24pyr_L90,24pyr,90,0.370499,-0.0644085,3.92963,2.85349,18.9895,19.3598,19.3973,-1.24694,-1.24935,76.0768,76.2243,55.243,55.3501,-1.24694,-1.24935,76.0768,76.2243,55.243,55.3501
9,24pyr_L104,24pyr,104,-0.0704694,-0.0380879,6.26314,10.1452,17.9989,19.3431,7.01174,-0.736738,-0.267062,121.149,43.9156,196.24,71.1356,-0.736738,-0.267062,121.149,43.9156,196.24,71.1356


# export 

In [243]:
##input required
out_excelname = 'name'
################

out_df.to_excel(f"{out_excelname}.xlsx")