# GenAI usage survey: overview and initial cleaning

In [None]:
import pandas as pd

In [None]:
from metadata import QUESTION_ID_MAP

### Read results

In [None]:
# Load the raw survey export. df_orig is kept untouched for reference;
# all cleaning below operates on the independent copy `df`.
df_orig = pd.read_csv('data/results-survey334734.csv')
df = df_orig.copy()

# Report the size of the export and how many responses lack a submission date.
n_responses, n_columns = df.shape
n_not_submitted = df['Date submitted'].isna().sum()
print(f"Number columns: {n_columns}")
print(f"Number responses: {n_responses}")
print(f"Number not submitted: {n_not_submitted}")

### Remove rows where the submission date is missing

In [None]:
# Keep only the responses that have a submission date.
was_submitted = df['Date submitted'].notna()
df = df[was_submitted]
print(f"New number of responses: {len(df)}")

### Remove unneeded columns

In [None]:
# "Last page" is 6 for every response and the consent question is "yes" for
# all of them, so these columns can be removed along with the submission date
# and the start language.
# The consent column label is the full question text exactly as it appears in
# the export (including its tabs and repeated spaces) -- do not reformat it.
consent_column = "First, we would like to obtain your consent to complete the survey. Please read the following text and indicate whether you agree to participate and proceed with the survey.       	I am voluntarily participating in this survey for the BFH Generative AI Lab.  	I am aware that I will not receive any financial compensation for my participation.  	The data from this survey will be used to investigate the use of generative AI in the workplace.  	I am aware that the survey is anonymous and that no conclusions can be drawn about the participants.  	No information will be collected that allows conclusions to be drawn about my identity or my place of work unless I voluntarily provide such information in one of the available comment fields.  	The data collected will be used exclusively for publications in the context of this research project and any follow-up projects.  	I am aware that I cannot be contacted directly to view the final report, as the survey is anonymous.  	The contact person for this survey is Yannis Schmutz (yannis.schmutz@bfh.ch).  [I agree]"
df = df.drop(
    columns=["Date submitted", "Last page", "Start language", consent_column],
)

### Rename questions

In [None]:
# First map the question columns via QUESTION_ID_MAP, then give the
# response-ID column a short snake_case name. The two renames are applied
# in sequence, in this order.
df = (
    df.rename(columns=QUESTION_ID_MAP)
      .rename(columns={'Response ID': 'resp_id'})
)

In [None]:
# Preview the first two rows to check the renamed columns.
df.head(n=2)

In [None]:
# NOTE(review): set_index returns a new DataFrame by default and the result is
# not assigned here, so this cell only displays a re-indexed view; `df` itself
# is left unchanged and resp_id remains a regular column. If resp_id is meant
# to become the index, the result must be assigned -- but the final
# to_csv(..., index=False) call would then omit resp_id from the saved file.
df.set_index('resp_id')

In [None]:
# Show the first two rows of df again.
df.head(n=2)

### Remove students that don't have an employer

In [None]:
# A row is removed only when BOTH conditions hold:
#   * Q10 is "I’m not employed" or was left empty, and
#   * Q5 is "Student".
no_employer = (df.Q10 == "I’m not employed") | df.Q10.isna()
is_student = df.Q5 == "Student"
df = df[~(no_employer & is_student)]

# Display the remaining (rows, columns).
df.shape

## NaN value overview per column

In [None]:
# Count missing values column by column: build the boolean NaN mask and sum
# it down the rows (axis=0), giving one count per column.
nans_per_col = df.isna().sum(axis=0)
print(nans_per_col)

## Save base cleaned version

In [None]:
# Write the cleaned frame to disk; index=False keeps the pandas row index
# out of the file, so only the actual columns are saved.
base_output_path = 'data/base1.csv'
df.to_csv(base_output_path, index=False)