# Data Cleaning and Preparation

Importing necessary packages 

In [None]:
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt

Importing raw survey data

In [None]:
# Read the raw survey export from disk into a DataFrame
file_path = 'Myfilepath'
raw_data = pd.read_csv(file_path)

# Preview both ends of the table (first rows, then last rows) to understand
# the structure of the data
display(raw_data.head(), raw_data.tail())

Listing the columns 


In [None]:
# Show the column labels of the raw data before any cleaning is applied.
# The bare expression is the last statement of the cell, so the notebook
# renders its value (a pandas Index) as the cell output.
raw_data.columns

Check for duplicates 

In [None]:
# Flag every row whose "ResponseId" value occurs more than once.
# keep=False marks all members of a duplicate group (not just the repeats);
# nothing is removed from raw_data here.
duplicates = raw_data.duplicated(subset=["ResponseId"], keep=False)

# Select only the flagged rows for inspection
duplicates_found = raw_data.loc[duplicates]

# Show the (rows, columns) shape of the flagged rows together with a preview
duplicates_found.shape, duplicates_found.head()

Removing unnecessary rows and columns

In [None]:
# The rows labelled 0 and 1 hold metadata and import IDs rather than
# responses, so they are dropped by index label
cleaned_data = raw_data.drop(index=[0, 1])

# Metadata columns (timestamps, IP address, etc.) that are not needed
columns_to_drop = [
    'StartDate',
    'EndDate',
    'Status',
    'IPAddress',
    'Progress',
    'Duration (in seconds)',
    'Finished',
    'RecordedDate',
    'ResponseId',
    'RecipientLastName',
    'RecipientFirstName',
    'RecipientEmail',
    'ExternalReference',
    'LocationLatitude',
    'LocationLongitude',
    'DistributionChannel',
    'UserLanguage',
    'Q2',
    'Prolific-ID',
    'PROLIFIC_PID',
]

# Every listed column must exist; drop raises KeyError otherwise
cleaned_data = cleaned_data.drop(labels=columns_to_drop, axis="columns")

# Preview the first few rows of the cleaned data
cleaned_data.head()

Display the updated column names

In [None]:
# List the remaining column names as a plain Python list, now that the
# unnecessary columns have been removed
cleaned_data.columns.to_list()

Renaming columns for clarity 

In [None]:
# Mapping from the original question codes to descriptive column names.
# The section comments below group entries by the theme of the new names.
# NOTE(review): there is no entry for Q22 (the map goes from Q21 to Q23) —
# confirm that this is intentional.
column_rename_map = {
    # AI knowledge and familiarity
    'Q3': 'AI_Knowledge_Level',
    'Q4': 'AI_Use_Outside_Work',
    'Q5_1': 'Text_Generation_Familiarity',
    'Q5_2': 'Image_Generation_Familiarity',
    'Q5_7': 'Speech_Generation_Familiarity',
    'Q6': 'Confidence_in_AI_Use',

    # AI at work
    'Q7': 'AI_Restrictions_At_Work',
    'Q8': 'AI_Use_At_Work',
    'Q9': 'AI_Tools_Used_At_Work',
    'Q9_9_TEXT': 'Other_AI_Tools_Used',
    'Q10': 'AI_Work_Applications',
    'Q10_11_TEXT': 'Other_AI_Applications',
    'Q11': 'AI_Concerns_In_Role',

    # Learning about AI
    'Q12': 'Interest_in_Learning_AI',
    'Q13': 'Time_Willing_to_Learn_AI',
    'Q14': 'AI_Topics_Interest',
    'Q15': 'Preferred_Learning_Method',
    'Q15_7_TEXT': 'Other_Learning_Methods',
    'Q16': 'Barriers_to_Learning_AI',
    'Q16_9_TEXT': 'Other_Learning_Barriers',

    # Company training
    'Q17': 'Company_Training_Support',
    'Q18': 'Company_AI_Training_Offered',
    'Q19': 'Importance_of_AI_Training',

    # Job satisfaction and perceived impact of AI
    'Q20': 'Job_Satisfaction',
    'Q21': 'AI_Impact_on_Job_Satisfaction',
    'Q23': 'Comfort_with_AI_in_Work',
    'Q24': 'AI_Helping_Complete_Work_Faster',
    'Q25': 'AI_Helping_Reduce_Errors',
    'Q26': 'AI_Job_Risk',
    'Q27': 'AI_Job_Security',

    # Company profile
    'Q28': 'Company_Industry',
    'Q29': 'Company_Department',
    'Q30': 'Company_Size',
    'Q31': 'Company_Revenue',

    # Respondent profile
    'Q32': 'Country_of_Residence',
    'Q33': 'Gender',
    'Q33_4_TEXT': 'Other_Gender_Description',
    'Q34': 'Education_Level',
    'Q35': 'Age',
}

# Rename the columns; codes in the map that are not present in the dataframe
# are ignored by rename's default errors='ignore'
cleaned_data = cleaned_data.rename(columns=column_rename_map)

# List the column names to confirm the renaming
cleaned_data.columns.to_list()

Inspecting the dataset

In [None]:
# Unpack the dataset dimensions: shape is (number of rows, number of columns)
rows, columns = cleaned_data.shape

# Report both counts, one per line
print(
    f"Total number of rows: {rows}",
    f"Total number of columns: {columns}",
    sep="\n",
)

In [None]:
# Print a concise summary of the dataset: index range, each column with its
# non-null count and dtype, and memory usage
cleaned_data.info()

Check missing values 

In [None]:
# Count the missing entries in each column of the cleaned dataset
missing_values = cleaned_data.isna().sum()

# Visual overview first: missingno bar chart for the dataset
msno.bar(cleaned_data)
plt.show()

# Then the per-column missing-value counts as text
print(missing_values)

Drop columns with large amounts of missing data

In [None]:
# Columns judged to have a large amount of missing data; this reuses (and
# overwrites) the columns_to_drop name
columns_to_drop = [
    'Other_AI_Tools_Used',
    'Other_AI_Applications',
    'Other_Learning_Methods',
    'Other_Learning_Barriers',
    'Other_Gender_Description',
]
cleaned_data = cleaned_data.drop(labels=columns_to_drop, axis="columns")

# Report the (rows, columns) shape after dropping the columns
print(cleaned_data.shape)

Deleting rows with missing values 

In [None]:
# Select every row that has a missing value in at least one column
rows_with_missing = cleaned_data.loc[cleaned_data.isna().any(axis="columns")]

# Show those rows for inspection before they are removed
display(rows_with_missing)

In [None]:
# Remove every row that contains at least one missing value
cleaned_data = cleaned_data.dropna(axis=0, how="any")

# Report the (rows, columns) shape after the removal
print(cleaned_data.shape)

# Redraw the missingno bar chart on the remaining data
msno.bar(cleaned_data)
plt.show()

Reset index to ensure row numbering is continuous and consistent

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# index instead of keeping it as a new column
cleaned_data = cleaned_data.reset_index(drop=True)

# Preview the first rows to verify the new indexing
display(cleaned_data.head())