In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [58]:
# Load in the full dataset
df = pd.read_csv("cleaned_data.csv")

# NOTE: this binds a second name to the SAME DataFrame object -- it is not a copy.
# Every in-place edit made through df_subset in the cells below also changes df.
# Use df.copy() here if the original data must be preserved, and make sure every later
# cell then reads the encoded data from df_subset rather than df.
df_subset = df
df_subset.head()

Unnamed: 0,Country,Age,Annual_Salary,Gender,Education,Payment_Method_Credit_Debit,Living_Region,Online_Service_Preference,AI_Enhance_Experience,AI_Satisfaction,...,AI_Tools_Used_Voice_Photo,Payment_Method_COD,Payment_Method_Ewallet,Product_Category_Appliances,Product_Category_Electronics,Product_Category_Groceries,Product_Category_Personal_Care,Product_Category_Clothing,AI_Trust,AI_Usage
0,INDIA,Gen X,Medium High,Female,Master's Degree,NO,Metropolitan,NO,YES,YES,...,YES,YES,NO,YES,YES,YES,YES,YES,Moderate,high
1,INDIA,Gen Z,Low,Male,University Graduate,YES,Metropolitan,NO,YES,YES,...,YES,YES,YES,YES,YES,NO,NO,NO,Moderate,high
2,INDIA,Gen X,Medium High,Male,University Graduate,YES,Rural Areas,YES,YES,YES,...,NO,YES,YES,YES,YES,YES,NO,NO,Moderate,low
3,INDIA,Gen X,High,Male,University Graduate,YES,Rural Areas,YES,YES,YES,...,NO,YES,YES,YES,YES,YES,NO,YES,Moderate,none
4,INDIA,Gen Z,Low,Male,University Graduate,NO,Rural Areas,YES,YES,YES,...,NO,YES,YES,YES,NO,YES,NO,YES,Moderate,low


In [59]:
## This is for binary columns

# YES/NO columns that get converted to 1/0, one name per line and grouped by topic
columns_to_modify = [
    # payment by card and general preferences / AI experience
    'Payment_Method_Credit_Debit',
    'Online_Service_Preference',
    'AI_Enhance_Experience',
    'AI_Satisfaction',
    # AI tools used
    'AI_Tools_Used_Chatbots',
    'AI_Tools_Used_Virtual_Assistant',
    'AI_Tools_Used_Voice_Photo',
    # remaining payment methods
    'Payment_Method_COD',
    'Payment_Method_Ewallet',
    # product categories
    'Product_Category_Appliances',
    'Product_Category_Electronics',
    'Product_Category_Groceries',
    'Product_Category_Personal_Care',
    'Product_Category_Clothing',
]

# Encoder for the YES/NO survey columns
def col_binary(value):
    """Return 1 when ``value`` is exactly the string 'YES', otherwise 0.

    The comparison is case-sensitive, and every other value (including 'NO',
    lower-case 'yes' and non-string values) is encoded as 0.
    """
    if value == 'YES':
        return 1
    return 0

# Encoder for the Gender column
def gender_binary(value):
    """Return 1 when ``value`` is exactly the string 'Male', otherwise 0.

    The comparison is case-sensitive; any value other than 'Male' (not only
    'Female') is encoded as 0.
    """
    if value == 'Male':
        return 1
    return 0


# Encode the YES/NO columns one column at a time with Series.map. This replaces
# DataFrame.applymap, which is deprecated since pandas 2.1 (emits a FutureWarning), and
# behaves the same on older and newer pandas versions.
df_subset[columns_to_modify] = df_subset[columns_to_modify].apply(
    lambda column: column.map(col_binary)
)

# Gender is a single column, so Series.map is applied to it directly.
df_subset['Gender'] = df_subset['Gender'].map(gender_binary)

df_subset.head()

Unnamed: 0,Country,Age,Annual_Salary,Gender,Education,Payment_Method_Credit_Debit,Living_Region,Online_Service_Preference,AI_Enhance_Experience,AI_Satisfaction,...,AI_Tools_Used_Voice_Photo,Payment_Method_COD,Payment_Method_Ewallet,Product_Category_Appliances,Product_Category_Electronics,Product_Category_Groceries,Product_Category_Personal_Care,Product_Category_Clothing,AI_Trust,AI_Usage
0,INDIA,Gen X,Medium High,0,Master's Degree,0,Metropolitan,0,1,1,...,1,1,0,1,1,1,1,1,Moderate,high
1,INDIA,Gen Z,Low,1,University Graduate,1,Metropolitan,0,1,1,...,1,1,1,1,1,0,0,0,Moderate,high
2,INDIA,Gen X,Medium High,1,University Graduate,1,Rural Areas,1,1,1,...,0,1,1,1,1,1,0,0,Moderate,low
3,INDIA,Gen X,High,1,University Graduate,1,Rural Areas,1,1,1,...,0,1,1,1,1,1,0,1,Moderate,none
4,INDIA,Gen Z,Low,1,University Graduate,0,Rural Areas,1,1,1,...,0,1,1,1,0,1,0,1,Moderate,low


In [60]:
## This is for the ordered data
# Integer codes for the Age column, numbered 1..4 in the order the labels are listed.
# NOTE(review): this listing order does not look chronological by generation -- confirm
# the intended ordering before treating Age as an ordinal feature.
age_mapping = {
    label: code
    for code, label in enumerate(['Gen X', 'Gen Z', 'Millennials', 'Baby Boomers'], start=1)
}

# Ordinal codes for the Annual_Salary column: labels listed from lowest to highest
# salary band and paired with the codes 1..4
salary_mapping = dict(zip(['Low', 'Medium', 'Medium High', 'High'], range(1, 5)))

# Ordinal codes for the Education column, given as (label, code) pairs from the
# lowest to the highest qualification
education_mapping = dict([
    ('Highschool Graduate', 1),
    ('University Graduate', 2),
    ("Master's Degree", 3),
    ('Doctorate Degree', 4),
])

# Ordinal mapping for the AI_Trust column (1 = lowest trust, 3 = highest trust).
# The dictionary is defined once; an identical second definition was redundant.
aitrust_mapping = {
    'Low' : 1,
    'Moderate' : 2,
    'High' : 3
}

# Ordinal codes for the AI_Usage column: lower-case labels numbered from 0 ('none')
# up to 3 ('high')
aiusage_mapping = {
    label: code
    for code, label in enumerate(['none', 'low', 'moderate', 'high'])
}
# Apply the mapping to each of the columns.
# Series.map returns NaN for any value that is not a key of the mapping, so each encoded
# column is validated BEFORE it is written back. A label missing from a mapping, or a
# second run of this cell on already-encoded data, now raises instead of silently
# replacing the column with NaN.
ordinal_mappings = {
    'Annual_Salary': salary_mapping,
    'Age': age_mapping,
    'Education': education_mapping,
    'AI_Trust': aitrust_mapping,
    'AI_Usage': aiusage_mapping,
}

for ordinal_column, ordinal_mapping in ordinal_mappings.items():
    encoded_column = df_subset[ordinal_column].map(ordinal_mapping)
    # Values that were already missing stay missing (same as before); only non-null
    # values without a code are treated as an error.
    unmapped_rows = encoded_column.isna() & df_subset[ordinal_column].notna()
    if unmapped_rows.any():
        unknown_labels = df_subset.loc[unmapped_rows, ordinal_column].unique().tolist()
        raise ValueError(f"{ordinal_column} has values with no ordinal code: {unknown_labels}")
    df_subset[ordinal_column] = encoded_column

df_subset.head()

Unnamed: 0,Country,Age,Annual_Salary,Gender,Education,Payment_Method_Credit_Debit,Living_Region,Online_Service_Preference,AI_Enhance_Experience,AI_Satisfaction,...,AI_Tools_Used_Voice_Photo,Payment_Method_COD,Payment_Method_Ewallet,Product_Category_Appliances,Product_Category_Electronics,Product_Category_Groceries,Product_Category_Personal_Care,Product_Category_Clothing,AI_Trust,AI_Usage
0,INDIA,1,3,0,3,0,Metropolitan,0,1,1,...,1,1,0,1,1,1,1,1,2,3
1,INDIA,2,1,1,2,1,Metropolitan,0,1,1,...,1,1,1,1,1,0,0,0,2,3
2,INDIA,1,3,1,2,1,Rural Areas,1,1,1,...,0,1,1,1,1,1,0,0,2,1
3,INDIA,1,4,1,2,1,Rural Areas,1,1,1,...,0,1,1,1,1,1,0,1,2,0
4,INDIA,2,1,1,2,0,Rural Areas,1,1,1,...,0,1,1,1,0,1,0,1,2,1


In [61]:
## Frequency encoding
# Count how many rows each country has in the dataset
country_column = df_subset['Country']
country_freq = country_column.value_counts()

# Replace every country name with that row count
df_subset['Country'] = country_column.map(country_freq)

# Defining the mapping for living_region.
# NOTE(review): the codes 1 / 10 / 11 are hard-coded labels, not computed frequencies
# like the Country encoding -- confirm they are the intended values.
living_region_mapping = {
    'Metropolitan': 1,
    'Suburban Areas': 10,
    'Rural Areas': 11
}

# Series.map returns NaN for values that are not keys of the mapping, so validate the
# encoded column before writing it back. An unknown label, or a second run of this cell
# on already-encoded data, raises instead of silently turning the column into NaN.
living_region_encoded = df_subset['Living_Region'].map(living_region_mapping)
unmapped_regions = living_region_encoded.isna() & df_subset['Living_Region'].notna()
if unmapped_regions.any():
    unknown_regions = df_subset.loc[unmapped_regions, 'Living_Region'].unique().tolist()
    raise ValueError(f"Living_Region has values with no code in living_region_mapping: {unknown_regions}")
df_subset['Living_Region'] = living_region_encoded

df_subset.head()


Unnamed: 0,Country,Age,Annual_Salary,Gender,Education,Payment_Method_Credit_Debit,Living_Region,Online_Service_Preference,AI_Enhance_Experience,AI_Satisfaction,...,AI_Tools_Used_Voice_Photo,Payment_Method_COD,Payment_Method_Ewallet,Product_Category_Appliances,Product_Category_Electronics,Product_Category_Groceries,Product_Category_Personal_Care,Product_Category_Clothing,AI_Trust,AI_Usage
0,165,1,3,0,3,0,1,0,1,1,...,1,1,0,1,1,1,1,1,2,3
1,165,2,1,1,2,1,1,0,1,1,...,1,1,1,1,1,0,0,0,2,3
2,165,1,3,1,2,1,11,1,1,1,...,0,1,1,1,1,1,0,0,2,1
3,165,1,4,1,2,1,11,1,1,1,...,0,1,1,1,1,1,0,1,2,0
4,165,2,1,1,2,0,11,1,1,1,...,0,1,1,1,0,1,0,1,2,1


In [62]:
# Display the dtype of every column, presumably to confirm that the encoding steps left
# only numeric columns
df_subset.dtypes

Country                            int64
Age                                int64
Annual_Salary                      int64
Gender                             int64
Education                          int64
Payment_Method_Credit_Debit        int64
Living_Region                      int64
Online_Service_Preference          int64
AI_Enhance_Experience              int64
AI_Satisfaction                    int64
AI_Tools_Used_Chatbots             int64
AI_Tools_Used_Virtual_Assistant    int64
AI_Tools_Used_Voice_Photo          int64
Payment_Method_COD                 int64
Payment_Method_Ewallet             int64
Product_Category_Appliances        int64
Product_Category_Electronics       int64
Product_Category_Groceries         int64
Product_Category_Personal_Care     int64
Product_Category_Clothing          int64
AI_Trust                           int64
AI_Usage                           int64
dtype: object

In [63]:
# We might need to change these values at a later time.
# The codes are read from df_subset, the frame the encoding cells write to, so this
# check does not depend on df happening to refer to the same object.
df_subset['Living_Region'].unique()


array([ 1, 11, 10], dtype=int64)

In [64]:
# *** Please Note ***
# For These Columns:
# 'Payment_Method_Credit_Debit',
# 'Online_Service_Preference',
# 'AI_Enhance_Experience',
# 'AI_Satisfaction',
# 'AI_Tools_Used_Chatbots',
# 'AI_Tools_Used_Virtual_Assistant',
# 'AI_Tools_Used_Voice_Photo',
# 'Payment_Method_COD',
# 'Payment_Method_Ewallet',
# 'Product_Category_Appliances',
# 'Product_Category_Electronics',
# 'Product_Category_Groceries',
# 'Product_Category_Personal_Care', 
# 'Product_Category_Clothing'

# 1 = Yes | 0 = No
# ___________________________________________________________________________________________-

# For This Column:
# 'Gender'

# 1 = Male | 0 = Female
# ___________________________________________________________________________________________-

# For This Column:
# 'Age'

# 1 = Gen X | 2 = Gen Z | 3 = Millennials | 4 = Baby Boomers
# ___________________________________________________________________________________________

# For This Column:
# 'Annual_Salary'

# 1 = Low | 2 = Medium | 3 = Medium High | 4 = High
# ___________________________________________________________________________________________

# For This Column:
# 'Education'

# 1 = Highschool Graduate | 2 = University Graduate | 3 = Master's Degree | 4 = Doctorate Degree
# ___________________________________________________________________________________________

# For This Column:
# 'AI_Trust'

# 1 = Low | 2 = Moderate | 3 = High
# ___________________________________________________________________________________________

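# For This Column:
# 'AI_Usage'

# 0 = None | 1 = Low | 2 = Moderate | 3 = High
# ___________________________________________________________________________________________

# For This Column:
# 'Living_Region'

# 1 = Metropolitan | 10 = Suburban Areas | 11 = Rural Areas
# ___________________________________________________________________________________________

# For This Column:
# 'Country'

# Each country name is replaced by the number of rows that country has in the dataset
# (frequency encoding)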

In [65]:
# Export the encoded DataFrame to CSV (without the index column).
# df_subset is the frame the encoding cells write to, so it is exported by name instead
# of relying on df referring to the same object.
df_subset.to_csv('encoded_data.csv', index=False)

