In [1]:
import pandas as pd
import os

In [2]:
# Read the raw survey export, then work on an independent deep copy so that
# `raw_data` keeps the responses exactly as they were loaded.
raw_csv_path = os.path.join('data', 'raw_responses.csv')
raw_data = pd.read_csv(raw_csv_path)
df = raw_data.copy(deep=True)

In [3]:
# Sanity check: all respondents agree to the terms and conditions, i.e. the
# number of rows answering 'Yes' equals the total number of rows.
agreed_mask = raw_data['I agree to these terms and conditions'] == 'Yes'
print(raw_data.shape)
print(raw_data.loc[agreed_mask].shape)

(483, 85)
(483, 85)


In [4]:
col = 'Which of the following roles best describes you?'

# Job roles are multi-select in the input, so one response may list several
# roles. Collapse them into one role per respondent in `Single_Role`.
#
# Rules are applied top to bottom and a later match overwrites an earlier one,
# so the order below is the precedence order (last match wins).
# Each rule is (text to look for, how to compare, resulting role).
# NOTE(review): 'Principle Investigator' is presumably meant to read
# 'Principal Investigator'; the label is left unchanged because other code may
# already depend on this exact value - confirm before renaming.
role_rules = [
    ('Undergraduate', 'contains', 'Student'),
    ('Postdoctoral', 'contains', 'Postdoctoral Fellow'),
    ('staff', 'contains', 'Faculty/Staff'),
    ('engineer', 'contains', 'Faculty/Staff'),
    ('Image/data analyst', 'equals', 'Faculty/Staff'),
    ('director', 'contains', 'Facility Director/Manager'),
    ('investigator', 'contains', 'Principle Investigator'),
    ('Clinician', 'contains', 'Clinician'),
]

df['Single_Role'] = 'TBD'  # stays 'TBD' when no rule matches
for needle, how, role_label in role_rules:
    if how == 'contains':
        # na=False: a missing answer counts as "no match" instead of putting
        # NaN into the boolean mask, which .loc rejects with a ValueError.
        role_mask = df[col].str.contains(needle, na=False)
    else:
        # Exact match on the whole response (only this single role selected).
        role_mask = df[col] == needle
    df.loc[role_mask, 'Single_Role'] = role_label

In [5]:
col = 'Which of the following roles best describes you?'

# A respondent is a trainee when the (multi-select) role answer mentions an
# 'Undergraduate' or a 'Postdoctoral' role; everyone else is a non trainee.
# na=False: a missing answer counts as "no match" instead of putting NaN into
# the boolean mask, which .loc rejects with a ValueError.
trainee_mask = df[col].str.contains('Undergraduate|Postdoctoral', na=False)
df['Trainee_Status'] = 'Non Trainee'
df.loc[trainee_mask, 'Trainee_Status'] = 'Trainee'

In [6]:
col = 'How do you generally go about solving an image analysis problem? Check the approach(es) you use the most.'

# Having a ', ' pattern inside an option's own text is problematic for parsing
# the multi-select answers, so the parenthetical example list is removed.
# The text is matched literally (regex=False): no regex escaping is needed,
# which avoids the invalid "\(" escape sequences in a normal string literal.
df[col] = df[col].str.replace(" (ChatGPT, BioimageIO, etc)", "", regex=False)


In [7]:
col = 'What image analysis tools have you used before? (check all that apply)'

# Having a ', ' pattern inside an option's own text is problematic for parsing
# the multi-select answers, so each parenthetical example list is removed.
# The snippets are matched literally (regex=False): no escaping of the
# parentheses is needed, and the dots in "e.g." only match real dots instead
# of acting as regex wildcards.
example_lists = [
    " (Columbus, Nikon Elements, Softworx, Zen, etc)",
    " (Imaris, Huygens, Volocity, etc)",
    " (ImageJ, Fiji, QuPath, Icy, CellProfiler, napari, etc)",
    " (Python e.g. scikit-image, MATLAB, etc)",
]
for example_text in example_lists:
    df[col] = df[col].str.replace(example_text, "", regex=False)

# "None" isn't being parsed correctly, so missing values get an explicit label.
# NOTE(review): presumably the CSV reader turns the literal answer "None" into
# a missing value - confirm. Rows that were missing for any other reason are
# relabelled 'No tools' here as well.
df[col] = df[col].fillna(value='No tools')

In [8]:
col = 'What image analysis tools have you used before? (check all that apply).1'

# Having a ', ' pattern inside an option's own text is problematic for parsing
# the multi-select answers, so each parenthetical example list is removed.
# The snippets are matched literally (regex=False): no escaping of the
# parentheses is needed, and the dots in "e.g." only match real dots instead
# of acting as regex wildcards.
example_lists = [
    " (AutoMET, Gatan Digital Micrograph, Aztec, etc)",
    " (Avizo, Imaris, Volocity, etc)",
    " (ImageJ, Fiji, Gwyddion, etc)",
    " (Python e.g. scikit-image, MATLAB, etc)",
]
for example_text in example_lists:
    df[col] = df[col].str.replace(example_text, "", regex=False)

# "None" isn't being parsed correctly, so missing values get an explicit label.
# NOTE(review): presumably the CSV reader turns the literal answer "None" into
# a missing value - confirm. Rows that were missing for any other reason are
# relabelled 'No tools' here as well.
df[col] = df[col].fillna(value='No tools')

In [9]:
# "None" isn't being parsed correctly in the two "use the most" questions
# either, so missing values are replaced with an explicit 'No tools' label.
most_used_cols = [
    'What image analysis tools do you use the most?',
    'What image analysis tools do you use the most?.1',
]
for col in most_used_cols:
    df[col] = df[col].fillna(value='No tools')

In [10]:
# Place scales into bins
# Each entry maps a new column to:
#   (source question, label for scores > 5, label for scores < 3,
#    label for scores of exactly 3, 4 or 5)
scale_bins = {
    'Work': (
        'How would you describe your work?',
        'Analyst', 'Imaging', 'Balanced',
    ),
    'Computation': (
        'How would you rate your computational skills?',
        'High Skill', 'Low Skill', 'Medium Skill',
    ),
    'Comfort': (
        'How would you rate your comfort in developing new computational skills?',
        'High Comfort', 'Low Comfort', 'Medium Comfort',
    ),
    'Manual': (
        'How manual would you say your current typical analysis workflow(s) are? ',
        'Highly Manual', 'Highly Automated', 'Variable Automation',
    ),
}

for binned_col, (question, high_label, low_label, mid_label) in scale_bins.items():
    scores = df[question]
    # 'TBD' remains for any score that falls in none of the three bins
    # (for example a missing answer, for which every comparison is False).
    df[binned_col] = 'TBD'
    df.loc[scores > 5, binned_col] = high_label
    df.loc[scores < 3, binned_col] = low_label
    df.loc[scores.isin([3, 4, 5]), binned_col] = mid_label

In [11]:
# Make column of Physical vs Life Sciences
LS_PS_col = 'The next question will ask you about particular image analysis tools and techniques. Do you want to answer questions about microscopy in the field/area of life sciences or physical sciences?'

# Full answer text -> short category label.
science_labels = {
    'Life sciences: e.g. biology, biomedicine': 'Life Sciences',
    'Physical sciences: e.g. chemistry, geology, materials sciences': 'Physical Sciences',
}
# Any answer that is not one of the two options above (including a missing
# answer) is labelled 'TBD'.
df['Science_Cat'] = df[LS_PS_col].map(science_labels).fillna('TBD')

In [12]:
# Having a ', ' pattern inside an option's own text is problematic for parsing
# the multi-select answers, so the parenthetical list of social networks is
# removed from both questions that offer this option.
# The text is matched literally (regex=False): no regex escaping is needed,
# which avoids the invalid "\(" escape sequences in a normal string literal.
social_examples = " (LinkedIN, BlueSky, Mastodon)"

col = 'Where did you hear about this survey? Please select all that apply.'
df[col] = df[col].str.replace(social_examples, "", regex=False)

col = 'How would you most prefer to be notified about image analysis workshops, sessions, or conferences being planned?'
df[col] = df[col].str.replace(social_examples, "", regex=False)

In [13]:
col = 'Have you put into practice things you have learned at image analysis workshops/tutorials? If not, why not?'

# Classify the free-text answers into 'Yes' / 'No' by keyword search.
# Missing answers become '' so they match no keyword and keep In_Practice == ''.
df[col] = df[col].fillna('')
df['In_Practice'] = ''

# Keywords are joined into one case-sensitive regex alternation and searched
# as substrings, so a short keyword such as 'no' also matches inside longer
# words (e.g. 'know').
pos_list = ['yes','Yes','all the time','Absolutely','YES',"adapated","sometimes","Sometime",'repeated','partially','Si','Always','I have',
            'Of course','tried','Some','Partially','Most','Repeated','depends','Adapted','work in progress',
            'CLIJ','automation','particle','record macros']
neg_list = ['No','no','not','didn\'t',
            'issues','Too much time','had time','lot of time','Lack','time cost','weren\'t right','commercial',
            'didn\'t provide']

# The negative keywords are applied second, so an answer that matches both
# lists ends up labelled 'No'.
for keywords, verdict in ((pos_list, 'Yes'), (neg_list, 'No')):
    pattern = '|'.join(keywords)
    df.loc[df[col].str.contains(pattern, regex=True), 'In_Practice'] = verdict

In [14]:
# Save the cleaned responses into the data folder, without the index column.
cleaned_csv_path = os.path.join('data', 'cleaned_responses.csv')
df.to_csv(cleaned_csv_path, index=False)