# Environment Config and Data Loading

In [1]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/online-payment-intentions/In22-CS3121-Project Dataset.xlsx


In [None]:
# Load the survey workbook from the Kaggle input directory into a DataFrame.
df = pd.read_excel("/kaggle/input/online-payment-intentions/In22-CS3121-Project Dataset.xlsx")

In [None]:
# Work on a copy so that the loaded frame `df` is not modified by the cleaning steps.
df_copy = df.copy()

# Data Overview

In [None]:
df_copy.head()

In [None]:
df_copy.describe()

In [None]:
df_copy.info()

In [None]:
df_copy.isnull().sum()

## Naming Convention

In [None]:
# Normalise headers: trim surrounding whitespace, insert "_" at every
# lower-case -> upper-case boundary (camelCase), then lower-case everything.
# Spaces inside a header are left as they are.
trimmed_names = df_copy.columns.str.strip()
underscored_names = trimmed_names.str.replace(r'(?<=[a-z])(?=[A-Z])', '_', regex=True)
df_copy.columns = underscored_names.str.lower()
df_copy.columns

# Data Preprocessing

## Time Conversion and Validation

In [None]:
#df_copy["timestamp"] = pd.to_datetime(df_copy["timestamp"])

In [None]:
df_copy.loc[529]

Invalid date: September (09) has no 31st day.

In [None]:
df_copy.iloc[529:535]

Rows 529, 530, 531 and 532 have invalid dates; they will be coerced to NaT.

In [None]:
# Parse the timestamp column; values that cannot be parsed become NaT instead of raising.
df_copy['timestamp'] = pd.to_datetime(df_copy['timestamp'], errors='coerce')

In [None]:
df_copy.iloc[529:535]

In [None]:
# df_copy.dtypes

## Gender

In [None]:
df_copy["gender"].isna().sum()

In [None]:
df_copy["gender"].unique()

In [None]:
# Integer codes for the gender answers; any label not listed here becomes NaN.
gender_codes = {
    'Male': 0,
    'Female': 1,
    'Prefer not to say': -1,
}
df_copy['gender'] = df_copy['gender'].map(gender_codes)

In [None]:
df_copy["gender"].unique()

## Age

In [None]:
df_copy["age"].isna().sum()

In [None]:
df_copy["age"].unique()

In [None]:
# Ordinal-encode the age brackets (youngest = 0); any other label becomes NaN.
age_brackets = ['18 - 25', '25 - 35', '35 - 45', '45 - 55']
age_codes = {bracket: rank for rank, bracket in enumerate(age_brackets)}
df_copy['age'] = df_copy['age'].map(age_codes)

## Highest Level of Education

In [None]:
df_copy["highest level of education"].isna().sum()

In [None]:
df_copy["highest level of education"].unique()

In [None]:
df_copy["highest level of education"].value_counts()

The "School" option can be considered invalid: it has a low frequency and its meaning is ambiguous.

In [None]:
df_copy.loc[df_copy['highest level of education'] == 'School ']


Assuming "School" refers to basic education, I'm merging it with "Grade 8 pass"

In [None]:
# Fold the answer 'School ' (note the trailing space in the raw value) into 'Grade 8 pass'.
df_copy['highest level of education'] = df_copy['highest level of education'].replace('School ', 'Grade 8 pass')

In [None]:
# Ordinal-encode education so that a larger code means a higher qualification.
# "Master's Degree or higher" is the open-ended top bracket, so it must receive the
# largest code; a postgraduate diploma ranks between a bachelor's and a master's degree.
# Labels that are not listed become NaN.
education_rank = {
    'Grade 8 pass': 0,
    'High school': 1,
    'Diploma / Certificate Course': 2,
    "Bachelor's Degree": 3,
    "Postgraduate Diploma": 4,
    "Master's Degree or higher": 5,
}
df_copy['highest level of education'] = df_copy['highest level of education'].map(education_rank)

## Marital Status

In [None]:
df_copy["marital status"].unique()

In [None]:
df_copy["marital status"].isna().sum()

In [None]:
# Binary-encode marital status (Single = 0, Married = 1); other labels become NaN.
marital_codes = {label: code for code, label in enumerate(['Single', 'Married'])}
df_copy["marital status"] = df_copy["marital status"].map(marital_codes)

## have used online shopping platforms before

In [None]:
df_copy["have used online shopping platforms before"].isna().sum()

In [None]:
df_copy["have used online shopping platforms before"].unique()

In [None]:
# Binary-encode the answer (No = 0, Yes = 1); other labels become NaN.
used_before_codes = {label: code for code, label in enumerate(['No', 'Yes'])}
df_copy["have used online shopping platforms before"] = (
    df_copy["have used online shopping platforms before"].map(used_before_codes)
)

## have you made online purchases during crisis time?

In [None]:
df_copy["have you made online purchases during crisis time?"].isna().sum()

In [None]:
df_copy["have you made online purchases during crisis time?"].unique()

In [None]:
# Binary-encode the answer (No = 0, Yes = 1); other labels become NaN.
crisis_purchase_codes = {label: code for code, label in enumerate(['No', 'Yes'])}
df_copy["have you made online purchases during crisis time?"] = (
    df_copy["have you made online purchases during crisis time?"].map(crisis_purchase_codes)
)

## professional background

In [None]:
df_copy["professional background"].isna().sum()

In [None]:
df_copy["professional background"].unique()

The values include both "Unemployed" and "Jobless", which are not strictly the same thing. This distinction matters because unemployment rates only count people who are actively seeking work, while joblessness includes everyone without a job, even if they’re not looking for one.

In [None]:
df_copy["professional background"].value_counts()

In [None]:
df_copy[df_copy["professional background"] == "Jobless"]

In [None]:
df_copy[df_copy["professional background"] == "Unemployed"]

Although there is a subtle difference between unemployed and jobless, in our context, it is not considerable due to the low frequency, hence merging into one category.

In [None]:
# Merge the 'Jobless' answers into the 'Unemployed' category.
df_copy['professional background'] = df_copy['professional background'].replace('Jobless', 'Unemployed')

In [None]:
# Replace each category with an integer code assigned in order of first appearance;
# the array of unique labels returned by pd.factorize is discarded.
# NOTE(review): the codes are nominal, not ordinal, and missing values (if any) get -1.
df_copy['professional background'], _ = pd.factorize(df_copy['professional background'])

## general e-commerce platforms

In [None]:
df_copy["general e-commerce platforms"].isna().sum()

In [None]:
df_copy["general e-commerce platforms"].value_counts()

In [None]:
df_copy["general e-commerce platforms"].unique()

In [None]:
# Lower-case, trim surrounding whitespace, then drop any trailing commas.
normalised_platforms = (
    df_copy["general e-commerce platforms"]
    .str.lower()
    .str.strip()
    .str.rstrip(',')
)
df_copy['general e-commerce platforms'] = normalised_platforms

In [None]:
# Drop every row whose answer contains "purchase", "books" or "clothing" (regex alternation).
# na=False makes missing answers evaluate to False, so after negation those rows are kept.
df_copy = df_copy[~df_copy["general e-commerce platforms"].str.contains("purchase|books|clothing", na=False)]

In [None]:
# Impute missing platform answers with the most frequent answer.
mode_value = df_copy["general e-commerce platforms"].mode()[0]

# Fill through DataFrame.fillna with a per-column dict and rebind the result.
# Calling fillna(inplace=True) on a column selection is chained assignment: it emits
# warnings on recent pandas and silently leaves the frame unchanged under Copy-on-Write,
# which would later make the comma-splitting step fail on NaN.
df_copy = df_copy.fillna({"general e-commerce platforms": mode_value})

In [None]:
# Spelling variants of platform names -> canonical name.
# Because the replacement below runs with regex=True, every key is interpreted as a
# regular expression: "." matches any character and a key can match inside a longer,
# comma-separated answer rather than only a whole cell value.
platform_mapping = {
    "ali express.com": "ali-express",
    "aliexpress.com": "ali-express",
    "ali  express" : "ali-express",
    "ali express": "ali-express",
    "aliexpress": "ali-express",
    "ebay": "ebay",
    "daraz.lk": "daraz",
    "kapruka.com": "kapruka",
    "ikman.lk": "ikman",
    "keels website": "keels",
    "takas.lk": "takas",
    "wishique": "wishque",
    "vishq": "wishque",
    "wow.lk" : "wow"
}

# Rebind to an explicit copy before assigning to a column.
# NOTE(review): presumably this avoids SettingWithCopyWarning after the row filter — confirm.
df_copy = df_copy.copy()
df_copy["general e-commerce platforms"] = df_copy["general e-commerce platforms"].replace(platform_mapping, regex=True)

In [None]:
# One-off fix (exact whole-value match): strip the leftover ".com" from this specific answer.
df_copy["general e-commerce platforms"] = df_copy["general e-commerce platforms"].replace("kapruka, ali-express.com", "kapruka, ali-express")

In [None]:
df_copy["general e-commerce platforms"].value_counts()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def _split_platforms(answer):
    """Split a comma-separated answer into trimmed, non-empty platform names."""
    return [part.strip() for part in answer.split(',') if part.strip() != '']


df_copy['platforms_list'] = df_copy['general e-commerce platforms'].apply(_split_platforms)

# One indicator column per distinct platform, aligned on the frame's index.
mlb = MultiLabelBinarizer()
platform_matrix = mlb.fit_transform(df_copy['platforms_list'])
one_hot_encoded = pd.DataFrame(platform_matrix, index=df_copy.index, columns=mlb.classes_)

df_copy = df_copy.join(one_hot_encoded)


In [None]:
df_copy.columns

## specialty online stores / automobile

In [None]:
df_copy["specialty online stores / automobile"].isna().sum()

In [None]:
df_copy["specialty online stores / automobile"].unique()

In [None]:
df_copy["specialty online stores / automobile"].value_counts()

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

specialty_col = "specialty online stores / automobile"

# Standardise the text: lower-case and trim surrounding whitespace.
df_copy[specialty_col] = df_copy[specialty_col].str.lower().str.strip()

# Remove any trailing commas.
df_copy[specialty_col] = df_copy[specialty_col].str.rstrip(",")

# Collapse the free-text "no experience" answer (typo kept as it appears in the data)
# onto the placeholder "x" so that it is caught by the whole-value replacement below.
df_copy[specialty_col] = df_copy[specialty_col].replace(
    "no experience in this flatform", "x", regex=True
)

# Replace answers that indicate no participation with one explicit category.
df_copy[specialty_col] = df_copy[specialty_col].replace(
    ["no", "none", "n/a", "x"], "zero specialty online stores / automobile"
)

# Fill missing values with an explicit "unknown" category.
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form warns on recent pandas and is a no-op under
# Copy-on-Write, which would make the split below fail on NaN.
df_copy[specialty_col] = df_copy[specialty_col].fillna(
    "specialty online stores / automobile unknown"
)

# Split each comma-separated answer into a list of trimmed, non-empty store names.
df_copy["specialty_platform_list"] = df_copy[specialty_col].apply(
    lambda x: [platform.strip() for platform in x.split(",") if platform.strip() != ""]
)

# One-hot encode the list of stores using MultiLabelBinarizer.
mlb = MultiLabelBinarizer()
specialty_encoded = pd.DataFrame(
    mlb.fit_transform(df_copy["specialty_platform_list"]),
    columns=mlb.classes_,
    index=df_copy.index
)

# pd.concat keeps columns whose names already exist, so duplicate labels are possible.
df_copy = pd.concat([df_copy, specialty_encoded], axis=1)

df_copy.columns

## online pharmacies

In [None]:
df_copy["online pharmacies"].isna().sum()

In [None]:
# Give missing answers an explicit "unknown" category.
df_copy["online pharmacies"] = df_copy["online pharmacies"].fillna("online pharmacy unknown")

In [None]:
df_copy["online pharmacies"].unique()

In [None]:
# Lower-case the answers and trim surrounding whitespace.
df_copy["online pharmacies"] = df_copy["online pharmacies"].str.lower().str.strip()

In [None]:
# Whole-value replacements (no regex): a cell is changed only when it equals a listed string.
# Answers meaning "none" are merged into one category. After the preceding strip, the
# entry 'no ' (trailing space) can no longer occur, so it is harmless.
df_copy["online pharmacies"] = df_copy["online pharmacies"].replace(["no", "none", "n/a", 'no ', "didn't order any pharmacy items online"], "no online pharmacies")
# "nearby" and "pharmacy" both become "nearby pharmacy".
df_copy["online pharmacies"] = df_copy["online pharmacies"].replace(["nearby",'pharmacy'], "nearby pharmacy")
# The same rename for the one multi-pharmacy answer that ends in "nearby".
df_copy["online pharmacies"] = df_copy["online pharmacies"].replace("healthgurd.lk, ceymed.lk, nearby", "healthgurd.lk, ceymed.lk, nearby pharmacy")

Didn't order any pharmacy items online                                1\
No                                                                          1\
None                                                                       2\
No                                                                        4

Nearby                                                                     2\
nearby                                                                    98


Inconsistent entries

In [None]:
df_copy["online pharmacies"].value_counts()

In [None]:
# Split each answer on ", " (comma followed by a space) into a list of pharmacy names.
df_copy["pharmacy_list"] = df_copy["online pharmacies"].str.split(", ")

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One indicator column per distinct pharmacy, aligned on the frame's index.
mlb = MultiLabelBinarizer()
pharmacy_matrix = mlb.fit_transform(df_copy["pharmacy_list"])
df_encoded = pd.DataFrame(pharmacy_matrix, index=df_copy.index, columns=mlb.classes_)

df_copy = df_copy.join(df_encoded)

df_copy.columns

## fashion and beauty retailers

In [None]:
df_copy['fashion and beauty retailers'].isna().sum()

In [None]:
# Give missing answers an explicit "unknown" category.
df_copy['fashion and beauty retailers'] = df_copy['fashion and beauty retailers'].fillna("unknown fashion and beauty retailers")

In [None]:
df_copy['fashion and beauty retailers'].unique()

In [None]:
df_copy['fashion and beauty retailers'].value_counts()

### Standardize Formatting

In [None]:
# Lower-case the answers and trim surrounding whitespace.
df_copy["fashion and beauty retailers"] = df_copy["fashion and beauty retailers"].str.lower().str.strip()

### Handle Missing or Irrelevant Values

In [None]:
 # Merge answers meaning "not used" into one category (whole-value match).
 df_copy["fashion and beauty retailers"] = df_copy["fashion and beauty retailers"].replace(["didn’t used","n/a"] , "no fashion and beauty retailers")

In [None]:
 # Normalise every spelling of the retailer to "kelly felder".
 # The column is already lower-cased and stripped, so whole-value replacement against
 # capitalised or space-padded variants can never match. A regex substitution catches
 # "kellf felder" and "kellyfelder" both as a whole answer and inside a comma-separated one.
 df_copy["fashion and beauty retailers"] = df_copy["fashion and beauty retailers"].str.replace(
     r"kell[fy]\s*felder", "kelly felder", regex=True
 )

In [None]:
 # One-off fix (whole-value match): correct the misspelt "kellf felder" in this specific answer.
 df_copy["fashion and beauty retailers"] = df_copy["fashion and beauty retailers"].replace("kapruka.com/fashion, gflock, kellf felder" , "kapruka.com/fashion, gflock, kelly felder")

In [None]:
 # One-off fix (whole-value match): turn the "/" separator into a comma and rename "gflock.lk" to "gflock".
 df_copy["fashion and beauty retailers"] = df_copy["fashion and beauty retailers"].replace('fashionbug.lk, nolimit.lk / gflock.lk', "fashionbug.lk, nolimit.lk, gflock")

In [None]:
 # One-off fix (whole-value match): drop the trailing comma from this specific answer.
 df_copy["fashion and beauty retailers"] = df_copy["fashion and beauty retailers"].replace('kapruka.com/fashion, fashionbug.lk,', "kapruka.com/fashion, fashionbug.lk")

In [None]:
 # One-off fix (whole-value match): drop the trailing comma from "odel.lk,".
 df_copy["fashion and beauty retailers"] = df_copy["fashion and beauty retailers"].replace('odel.lk,', "odel.lk")

In [None]:
 # One-off fix (whole-value match): split "kellyfelder" into "kelly felder" in this specific answer.
 df_copy["fashion and beauty retailers"] = df_copy["fashion and beauty retailers"].replace('kapruka.com/fashion, kellyfelder', "kapruka.com/fashion, kelly felder")

In [None]:
# Split each answer on ", " (comma followed by a space) into a list of retailer names.
df_copy["retailer_list"] = df_copy["fashion and beauty retailers"].str.split(", ")

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One indicator column per distinct retailer, aligned on the frame's index.
mlb = MultiLabelBinarizer()
retailer_matrix = mlb.fit_transform(df_copy["retailer_list"])
df_encoded = pd.DataFrame(retailer_matrix, index=df_copy.index, columns=mlb.classes_)

df_copy = df_copy.join(df_encoded)

In [None]:
df_copy.columns

## grocery delivery services

In [None]:
df_copy['grocery delivery services'].isna().sum()

In [None]:
# Give missing answers an explicit "unknown" category.
df_copy['grocery delivery services'] = df_copy['grocery delivery services'].fillna("unknown grocery delivery services")

In [None]:
df_copy['grocery delivery services'].unique()

In [None]:
df_copy['grocery delivery services'].value_counts()

In [None]:
# Lower-case the answers and trim surrounding whitespace.
df_copy["grocery delivery services"] = df_copy["grocery delivery services"].str.lower().str.strip()

In [None]:
# Replace the answer "n/a" with an explicit "not used" category (whole-value match).
df_copy["grocery delivery services"] = df_copy["grocery delivery services"].replace(["n/a"], "grocery delivery services not used")

In [None]:
# One-off fix (whole-value match): rename "glomark" to "glomark.lk" in this specific answer.
df_copy["grocery delivery services"] = df_copy["grocery delivery services"].replace(["ubereats.com, pickmefoods.com, keellssuper.lk, arpico.lk, glomark"], "ubereats.com, pickmefoods.com, keellssuper.lk, arpico.lk, glomark.lk")

In [None]:
# One-off fix (whole-value match): map "cargillis food city" to "cargillsonline.com" in this specific answer.
df_copy["grocery delivery services"] = df_copy["grocery delivery services"].replace(["ubereats.com, cargillis food city"], "ubereats.com, cargillsonline.com")

In [None]:
# One-off fix (whole-value match): collapse the answer "keellssuper.lk, keels" to "keellssuper.lk".
df_copy["grocery delivery services"] = df_copy["grocery delivery services"].replace(["keellssuper.lk, keels"], "keellssuper.lk")

In [None]:
def _split_services(answer):
    """Split an answer on ", " into trimmed, non-empty service names."""
    return [item.strip() for item in answer.split(", ") if item.strip() != ""]


df_copy["grocery_delivery_list"] = df_copy["grocery delivery services"].apply(_split_services)

# One indicator column per distinct service, aligned on the frame's index.
mlb = MultiLabelBinarizer()
grocery_matrix = mlb.fit_transform(df_copy["grocery_delivery_list"])
one_hot_encoded = pd.DataFrame(grocery_matrix, index=df_copy.index, columns=mlb.classes_)

# Append the indicator columns to the right of the existing columns.
df_copy = pd.concat([df_copy, one_hot_encoded], axis=1)

In [None]:
df_copy.columns

## Level of Agreement

In [None]:
df_copy['please indicate your level of agreement with the following statements. [it is easy to navigate through the online platform to find the products or services i need.]'].isna().sum()

In [None]:
df_copy['please indicate your level of agreement with the following statements. [it is easy to navigate through the online platform to find the products or services i need.]'].unique()

In [None]:
df_copy['please indicate your level of agreement with the following statements. [it is easy to navigate through the online platform to find the products or services i need.]'].value_counts()

In [None]:
# Collect every Likert-scale question: their headers all share the same prefix.
agreement_prefix = "please indicate your level of agreement"
level_of_agreement_columns = [
    name for name in df_copy.columns if name.startswith(agreement_prefix)
]
print(len(level_of_agreement_columns))
level_of_agreement_columns

In [None]:
# Gather the distinct sets of answer labels used across the agreement columns.
uniques = []
for col in level_of_agreement_columns:
    observed_labels = set(df_copy[col].unique().tolist())
    if observed_labels not in uniques:
        uniques.append(observed_labels)

print(len(uniques))

for label_set in uniques:
    print(label_set)

In [None]:
# Lower-case and trim every agreement answer, then fix the misspelt "srongly disagree".
for col in level_of_agreement_columns:
    cleaned_answers = df_copy[col].str.lower().str.strip()
    df_copy[col] = cleaned_answers.replace(["srongly disagree"], "strongly disagree")
    

In [None]:
# For each agreement column, print the number of missing answers and the sorted list of distinct labels.
for col in level_of_agreement_columns:
    print(f"null count-{df_copy[col].isna().sum()}, categories-{sorted(df_copy[col].unique().tolist())}")

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# same order for each agreement column
# Every agreement column shares the same five-point scale, ordered from 0 to 4.
likert_scale = ['strongly disagree', 'disagree', 'neutral', 'agree', 'strongly agree']
categories_order = [likert_scale for _ in level_of_agreement_columns]

# Ordinal-encode all agreement columns in a single pass.
encoder = OrdinalEncoder(categories=categories_order)
encoded_answers = encoder.fit_transform(df_copy[level_of_agreement_columns])
df_copy[level_of_agreement_columns] = encoded_answers

df_copy.head()


In [None]:
df_copy.info()

##

## reliability and validity of the survey questions

### Cronbach’s alpha coefficient

In [None]:
import numpy as np

def cronbach_alpha(df_items):
    """Compute Cronbach's alpha for a set of scale items.

    Parameters
    ----------
    df_items : pandas.DataFrame
        One column per item and one row per respondent. Rows containing any
        missing value are dropped before the computation (listwise deletion).

    Returns
    -------
    float
        ``k / (k - 1) * (1 - sum(item variances) / variance(total score))``
        using sample variances (``ddof=1``). NaN is returned when alpha is
        undefined: fewer than two items, or a total score with zero (or
        undefined) variance.
    """
    complete_rows = df_items.dropna(axis=0)
    n_items = complete_rows.shape[1]
    # With a single item, k / (k - 1) would divide by zero.
    if n_items < 2:
        return float("nan")

    total_var = complete_rows.sum(axis=1).var(ddof=1)
    # `not (x > 0)` also catches NaN, e.g. when fewer than two complete rows remain.
    if not total_var > 0:
        return float("nan")

    item_var_sum = complete_rows.var(ddof=1).sum()
    return (n_items / (n_items - 1)) * (1 - item_var_sum / total_var)

In [None]:
# Construct "Ease of Navigation": its two Likert items.
ease_nav_items = [
    'please indicate your level of agreement with the following statements. [it is easy to navigate through the online platform to find the products or services i need.]',
    'please indicate your level of agreement with the following statements. [the online platform provides clear and intuitive navigation options.]',
]
ease_nav = df_copy.loc[:, ease_nav_items]
alpha_ease_nav = cronbach_alpha(ease_nav)
print("Cronbach's Alpha for Ease of Navigation:", alpha_ease_nav)

In [None]:
# Construct "Clarity of Instructions": the two items that ask about the platform's
# instructions. (The previous selection used the "easy to learn" / "became proficient"
# statements, which measure the learning curve, so the reported alpha did not belong
# to the construct named in the variable and in the printed label.)
ins_clarity = df_copy[[
    'please indicate your level of agreement with the following statements. [the instructions provided on the online platform are clear and easy to understand.]',
    'please indicate your level of agreement with the following statements. [i can easily follow the instructions given on the online platform for making online purchases.]'
]]
alpha_ins_clarity = cronbach_alpha(ins_clarity)
print("Cronbach's Alpha for Clarity of Instructions:", alpha_ins_clarity)

In [None]:
# Construct "Learning Curve": the two items that ask how easily and quickly the
# respondent learned to use the platform. (The previous selection used the statements
# about the clarity of the instructions, so the reported alpha did not belong to the
# construct named in the variable and in the printed label.)
learning_curve = df_copy[[
    'please indicate your level of agreement with the following statements. [it was easy for me to learn how to use the online platform for making online purchases.]',
    'please indicate your level of agreement with the following statements. [i quickly became proficient in using the online platform for making online purchases during a crisis.]'
]]
alpha_learning_curve = cronbach_alpha(learning_curve)
print("Cronbach's Alpha for Learning Curve:", alpha_learning_curve)


In [None]:
# Construct "Response Time": its two Likert items.
response_time_items = [
    'please indicate your level of agreement with the following statements. [the online platform responds quickly to my actions, such as loading pages and processing transactions.]',
    'please indicate your level of agreement with the following statements. [i don\'t experience delays or long waiting times when using the online platform for making online purchases during a crisis.]',
]
response_time = df_copy.loc[:, response_time_items]
alpha_response_time = cronbach_alpha(response_time)
print("Cronbach's Alpha for Response Time:", alpha_response_time)


In [None]:
# Construct "Error Handling": its two Likert items.
error_handling_items = [
    'please indicate your level of agreement with the following statements. [the online platform effectively handles errors or mistakes, such as providing clear error messages and easy recovery options.]',
    'please indicate your level of agreement with the following statements. [i rarely encounter errors or issues when using the online platform for making online purchases during a crisis.]',
]
error_handling = df_copy.loc[:, error_handling_items]
alpha_error_handling = cronbach_alpha(error_handling)
print("Cronbach's Alpha for Error Handling:", alpha_error_handling)


In [None]:
# Construct "Product Availability": its two Likert items.
product_availability_items = [
    'please indicate your level of agreement with the following statements. [the online platform offers a wide range of products and services that meet my needs during a crisis.]',
    'please indicate your level of agreement with the following statements. [i can find the products or services i need on the online platform during a crisis.]',
]
product_availability = df_copy.loc[:, product_availability_items]
alpha_product_availability = cronbach_alpha(product_availability)
print("Cronbach's Alpha for Product Availability:", alpha_product_availability)


In [None]:
# Construct "Convenience": its two Likert items.
convenience_items = [
    'please indicate your level of agreement with the following statements. [the online platform makes it easy to search for products and complete transactions during a crisis.]',
    'please indicate your level of agreement with the following statements. [using the online platform for making purchases during a crisis is convenient and saves time.]',
]
convenience = df_copy.loc[:, convenience_items]
alpha_convenience = cronbach_alpha(convenience)
print("Cronbach's Alpha for Convenience:", alpha_convenience)


In [None]:
# Construct "Cost-effectiveness": its two Likert items.
cost_effectiveness_items = [
    'please indicate your level of agreement with the following statements. [the online platform offers competitive prices, discounts, or cost-saving benefits during a crisis.]',
    'please indicate your level of agreement with the following statements. [i perceive that using the online platform for making purchases during a crisis can help me save money.]',
]
cost_effectiveness = df_copy.loc[:, cost_effectiveness_items]
alpha_cost_effectiveness = cronbach_alpha(cost_effectiveness)
print("Cronbach's Alpha for Cost-effectiveness:", alpha_cost_effectiveness)


In [None]:
# Construct "Information Accessibility": its two Likert items.
information_accessibility_items = [
    'please indicate your level of agreement with the following statements. [the online platform provides detailed and accurate product information during a crisis.]',
    'please indicate your level of agreement with the following statements. [i can easily access user reviews, ratings, and other relevant information to support my purchase decisions.]',
]
information_accessibility = df_copy.loc[:, information_accessibility_items]
alpha_information_accessibility = cronbach_alpha(information_accessibility)
print("Cronbach's Alpha for Information Accessibility:", alpha_information_accessibility)


In [None]:
# Construct "Personalization": its two Likert items.
personalization_items = [
    'please indicate your level of agreement with the following statements. [the online platform tailors recommendations, suggestions, or personalized offers based on my preferences.]',
    'please indicate your level of agreement with the following statements. [i feel that the online platform understands my needs and preferences during a crisis.]',
]
personalization = df_copy.loc[:, personalization_items]
alpha_personalization = cronbach_alpha(personalization)
print("Cronbach's Alpha for Personalization:", alpha_personalization)


In [None]:
# Construct "Privacy Protection": its two Likert items.
privacy_protection_items = [
    'please indicate your level of agreement with the following statements. [the online platform takes appropriate measures to protect my privacy.]',
    'please indicate your level of agreement with the following statements. [i trust that my personal information will be handled securely by the online platform.]',
]
privacy_protection = df_copy.loc[:, privacy_protection_items]
alpha_privacy_protection = cronbach_alpha(privacy_protection)
print("Cronbach's Alpha for Privacy Protection:", alpha_privacy_protection)


In [None]:
# Construct "Payment Security": its two Likert items.
payment_security_items = [
    'please indicate your level of agreement with the following statements. [the online platform provides secure payment methods to protect against fraudulent activities during a crisis.]',
    'please indicate your level of agreement with the following statements. [i trust that my payment details are handled securely by the online platform.]',
]
payment_security = df_copy.loc[:, payment_security_items]
alpha_payment_security = cronbach_alpha(payment_security)
print("Cronbach's Alpha for Payment Security:", alpha_payment_security)


In [None]:
# Construct "Transparent Policies": its two Likert items.
transparent_policies_items = [
    'please indicate your level of agreement with the following statements. [the online platform provides clear and easily accessible policies regarding data handling, privacy, and security.]',
    'please indicate your level of agreement with the following statements. [i feel confident in the online platform\'s transparency regarding its data protection practices.]',
]
transparent_policies = df_copy.loc[:, transparent_policies_items]
alpha_transparent_policies = cronbach_alpha(transparent_policies)
print("Cronbach's Alpha for Transparent Policies:", alpha_transparent_policies)


In [None]:
# Construct "Word of Mouth and Recommendations": its two Likert items.
word_of_mouth_items = [
    'please indicate your level of agreement with the following statements. [i am influenced by recommendations and opinions from family and friends when making online purchases during a crisis.]',
    'please indicate your level of agreement with the following statements. [i consider the experiences and suggestions shared by people i know before making online purchases during a crisis.]',
]
word_of_mouth = df_copy.loc[:, word_of_mouth_items]
alpha_word_of_mouth = cronbach_alpha(word_of_mouth)
print("Cronbach's Alpha for Word of Mouth and Recommendations:", alpha_word_of_mouth)


In [None]:
# Construct "Online Reviews and Ratings": its two Likert items.
online_reviews_ratings_items = [
    'please indicate your level of agreement with the following statements. [i consider online reviews and ratings when making purchasing decisions during a crisis.]',
    'please indicate your level of agreement with the following statements. [positive reviews and high ratings increase my confidence in making online purchases during a crisis.]',
]
online_reviews_ratings = df_copy.loc[:, online_reviews_ratings_items]
alpha_online_reviews_ratings = cronbach_alpha(online_reviews_ratings)
print("Cronbach's Alpha for Online Reviews and Ratings:", alpha_online_reviews_ratings)


In [None]:
# Construct "Social Proof": its two Likert items.
social_proof_items = [
    'please indicate your level of agreement with the following statements. [observing others making online purchases during a crisis influences my own intention to make similar purchases.]',
    'please indicate your level of agreement with the following statements. [i am more likely to make online purchases during a crisis if i see others doing the same]',
]
social_proof = df_copy.loc[:, social_proof_items]
alpha_social_proof = cronbach_alpha(social_proof)
print("Cronbach's Alpha for Social Proof:", alpha_social_proof)


In [None]:
# Construct "Normative Pressure": its two Likert items.
normative_pressure_items = [
    'please indicate your level of agreement with the following statements. [social norms and expectations regarding online shopping during a crisis influence my own intention to make online purchases.]',
    'please indicate your level of agreement with the following statements. [i feel pressure to make online purchases during a crisis due to the expectations of others.]',
]
normative_pressure = df_copy.loc[:, normative_pressure_items]
alpha_normative_pressure = cronbach_alpha(normative_pressure)
print("Cronbach's Alpha for Normative Pressure:", alpha_normative_pressure)


In [None]:
# Construct "Information Sharing": its two Likert items.
information_sharing_items = [
    'please indicate your level of agreement with the following statements. [i am willing to share my own online shopping experiences, recommendations, or opinions with others during a crisis.]',
    'please indicate your level of agreement with the following statements. [sharing information about my online purchases during a crisis is important for influencing others\' purchase decisions.]',
]
information_sharing = df_copy.loc[:, information_sharing_items]
alpha_information_sharing = cronbach_alpha(information_sharing)
print("Cronbach's Alpha for Information Sharing:", alpha_information_sharing)


In [None]:
# Construct "Intention to Purchase Online": its two Likert items.
intention_to_purchase_items = [
    'please indicate your level of agreement with the following statements. [overall, i have a positive attitude toward online shopping during a crisis.]',
    'please indicate your level of agreement with the following statements. [i believe that online shopping is a practical and efficient way to make purchases during a crisis.]',
]
intention_to_purchase = df_copy.loc[:, intention_to_purchase_items]
alpha_intention_to_purchase = cronbach_alpha(intention_to_purchase)
print("Cronbach's Alpha for Intention to Purchase Online:", alpha_intention_to_purchase)


In [None]:
# Construct "Social Media Posts/Influence": its two Likert items.
social_media_influence_items = [
    'please indicate your level of agreement with the following statements. [during a crisis, social media platforms, influencers, and online communities influence my online purchase decisions.]',
    'please indicate your level of agreement with the following statements. [i am likely to make online purchases during a crisis based on what i see or learn from social media platforms.]',
]
social_media_influence = df_copy.loc[:, social_media_influence_items]
alpha_social_media_influence = cronbach_alpha(social_media_influence)
print("Cronbach's Alpha for Social Media Posts/Influence:", alpha_social_media_influence)


### Inter-item correlation

In [None]:
# Display name of each construct -> the DataFrame holding that construct's items.
# The insertion order of this dict is the order in which the heatmaps are drawn.
variables = {
    "Ease of Navigation": ease_nav,
    "Clarity of Instructions": ins_clarity,
    "Learning Curve": learning_curve,
    "Error Handling": error_handling,
    "Response Time": response_time,
    "Product Availability": product_availability,
    "Convenience": convenience,
    "Cost-effectiveness": cost_effectiveness,
    "Information Accessibility": information_accessibility,
    "Personalization": personalization,
    "Privacy Protection": privacy_protection,
    "Payment Security": payment_security,
    "Transparent Policies": transparent_policies,
    "Word of Mouth and Recommendations": word_of_mouth,
    "Social Media Posts/Influence": social_media_influence,
    "Online Reviews and Ratings": online_reviews_ratings,
    "Social Proof": social_proof,
    "Normative Pressure": normative_pressure,
    "Information Sharing": information_sharing,
    "Intention to Purchase Online": intention_to_purchase,
}


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one Pearson inter-item correlation heatmap per construct.
for var_name, df_subset in variables.items():
    corr = df_subset.corr(method='pearson')
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
    ax.set_title(f"Inter-item Correlation: {var_name}")
    fig.tight_layout()
    plt.show()


# EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Independent (deep) copy for the EDA section so column renames/drops below
# do not touch df_copy.
df_copy2 = df_copy.copy(deep=True)

In [None]:
# Disambiguate the two columns that are both called 'amazon'.
# NOTE(review): assumes the first occurrence is the e-commerce platform and the
# second the specialty/automobile one — confirm against the column order.
amazon_indices = [pos for pos, label in enumerate(df_copy2.columns) if label == 'amazon']
if len(amazon_indices) != 2:
    print("Expected 2 'amazon' columns, found:", len(amazon_indices))
else:
    first_pos, second_pos = amazon_indices
    new_columns = df_copy2.columns.tolist()
    new_columns[first_pos] = 'amazon_ecommerce'
    new_columns[second_pos] = 'amazon_automotive'
    df_copy2.columns = new_columns
    print("Renamed 'amazon' columns to 'amazon_ecommerce' and 'amazon_automotive'")

In [None]:
# Two-column overview (name, dtype) of every column currently in df_copy2.
column_info = pd.DataFrame(
    {"Column Name": df_copy2.columns, "Data Type": df_copy2.dtypes.to_numpy()}
)
# Lift the row-display limit (global pandas option) so the full listing prints.
pd.options.display.max_rows = None
print(column_info)

In [None]:
# Remove the timestamp and the raw "*_list" helper columns from the EDA frame.
df_copy2 = df_copy2.drop(
    columns=[
        'timestamp',
        'platforms_list',
        'specialty_platform_list',
        'pharmacy_list',
        'retailer_list',
        'grocery_delivery_list',
    ]
)

In [None]:
# Rebuild the (name, dtype) overview now that the helper columns are gone.
column_info = pd.DataFrame(
    {"Column Name": df_copy2.columns, "Data Type": df_copy2.dtypes.to_numpy()}
)
# Lift the row-display limit (global pandas option) so the full listing prints.
pd.options.display.max_rows = None
print(column_info)

In [None]:
# Persist the column overview next to the notebook (working directory).
column_info.to_excel(excel_writer="column_info.xlsx", index=False)

**Unique values in 'general e-commerce platforms':** <br>
['daraz', 'ikman', 'kapruka', 'wow', 'takas', 'wishque', 'ali-express', 'strong.lk', 'ebay', 'keels', 'amazon', 'pickme', 'lassana.com', 'instagram stores', 'shein']

**Unique values in 'specialty online stores / automobile':** <br>
['specialty online stores / automobile unknown', 'patpat.lk', 'riyasewana.lk', 'lankavechicle.com', 'autolanka.com', 'pricelanka.lk', 'patpat.lk', 'toyota', 'zero specialty online stores / automobile', 'ikman.lk', 'amazon']

**Unique values in 'online pharmacies':** <br>
['online pharmacy unknown', 'epharma.lk', 'onlinepharmacy.lk', 'healthgurd.lk', 'ceymed.lk', 'union chemist', 'healthnet.lk', 'no online pharmacies', 'nearby pharmacy']

**Unique values in 'fashion and beauty retailers':** <br>
['unknown fashion and beauty retailers', 'kapruka.com/fashion', 'fashionbug.lk', 'odel.lk', 'thilakawardana.lk', 'noorareed.lk', 'midnightdivas.com', 'kelly felder', 'zigzag.lk', 'moose', 'chenara dodge', 'dsi', 'no fashion and beauty retailers', 'nills', 'spring and summer', 'nolimit.lk, gflock', 'moods', 'amanthe', 's&s', 'carlo clothing', 'tharshana and insta shops', 'kynd fashion, carnage', 'mimosa']

**Unique values in 'grocery delivery services':** <br>
['ubereats.com', 'pickmefoods.com', 'keellssuper.lk', 'arpico.lk', 'unknown grocery delivery services', 'glomark.lk', 'cinnamonhotel.com', 'cargillsonline.com', 'grocery delivery services not used']

In [None]:
# Column names that the gender analysis below treats as binary features:
# the two yes/no usage questions followed by one indicator column per
# platform, listed in this order: general e-commerce, specialty/automobile,
# online pharmacies, fashion & beauty retailers, grocery delivery.
# NOTE(review): 'carnage,' carries a trailing comma inside the string —
# presumably this mirrors the column name as generated upstream; confirm.
binary_features = [
    'have used online shopping platforms before', 'have you made online purchases during crisis time?',
    'ali-express', 'amazon_ecommerce', 'daraz', 'ebay', 'ikman', 'instagram stores', 'kapruka', 'keels', 
    'lassana.com', 'pickme', 'shein', 'strong.lk', 'takas', 'wishque', 'wow',
    'amazon_automotive', 'autolanka.com', 'ikman.lk', 'lankavechicle.com', 'patpat.lk', 'pricelanka.lk', 
    'riyasewana.lk', 'specialty online stores / automobile unknown', 'toyota',
    'zero specialty online stores / automobile', 'ceymed.lk', 'epharma.lk', 
    'healthgurd.lk', 'healthnet.lk', 'nearby pharmacy', 'no online pharmacies', 
    'online pharmacy unknown', 'onlinepharmacy.lk', 'union chemist', 'amanthe', 
    'carlo clothing', 'carnage,', 'chenara dodge', 'dsi', 'fashionbug.lk', 'gflock',
    'kapruka.com/fashion', 'kelly felder', 'kynd fashion', 'midnightdivas.com', 'mimosa', 'moods',
    'moose', 'nills', 'no fashion and beauty retailers', 'nolimit.lk', 'noorareed.lk', 'odel.lk',
    's&s', 'spring and summer', 'tharshana and insta shops', 'thilakawardana.lk', 
    'unknown fashion and beauty retailers', 'zigzag.lk', 'arpico.lk', 
    'cargillsonline.com', 'cinnamonhotel.com', 'glomark.lk', 'grocery delivery services not used', 
    'keellssuper.lk', 'pickmefoods.com', 'ubereats.com', 'unknown grocery delivery services'
]

# Full column names of the "level of agreement" survey statements. The gender
# analysis below runs Kruskal-Wallis and Spearman tests on each of them and
# labels their axis 'Agreement Score (1-5)'.
# The names must match the DataFrame columns exactly (lower-cased, including
# punctuation), so do not reformat or re-wrap these strings.
other_features = [
    'please indicate your level of agreement with the following statements. [it is easy to navigate through the online platform to find the products or services i need.]',
    'please indicate your level of agreement with the following statements. [the online platform provides clear and intuitive navigation options.]',
    'please indicate your level of agreement with the following statements. [it was easy for me to learn how to use the online platform for making online purchases.]',
    'please indicate your level of agreement with the following statements. [i quickly became proficient in using the online platform for making online purchases during a crisis.]',
    'please indicate your level of agreement with the following statements. [the instructions provided on the online platform are clear and easy to understand.]',
    'please indicate your level of agreement with the following statements. [i can easily follow the instructions given on the online platform for making online purchases.]',
    'please indicate your level of agreement with the following statements. [the online platform responds quickly to my actions, such as loading pages and processing transactions.]',
    'please indicate your level of agreement with the following statements. [i don\'t experience delays or long waiting times when using the online platform for making online purchases during a crisis.]',
    'please indicate your level of agreement with the following statements. [the online platform effectively handles errors or mistakes, such as providing clear error messages and easy recovery options.]',
    'please indicate your level of agreement with the following statements. [i rarely encounter errors or issues when using the online platform for making online purchases during a crisis.]',
    'please indicate your level of agreement with the following statements. [the online platform offers a wide range of products and services that meet my needs during a crisis.]',
    'please indicate your level of agreement with the following statements. [i can find the products or services i need on the online platform during a crisis.]',
    'please indicate your level of agreement with the following statements. [using the online platform for making purchases during a crisis is convenient and saves time.]',
    'please indicate your level of agreement with the following statements. [the online platform makes it easy to search for products and complete transactions during a crisis.]',
    'please indicate your level of agreement with the following statements. [the online platform offers competitive prices, discounts, or cost-saving benefits during a crisis.]',
    'please indicate your level of agreement with the following statements. [i perceive that using the online platform for making purchases during a crisis can help me save money.]',
    'please indicate your level of agreement with the following statements. [the online platform provides detailed and accurate product information during a crisis.]',
    'please indicate your level of agreement with the following statements. [i can easily access user reviews, ratings, and other relevant information to support my purchase decisions.]',
    'please indicate your level of agreement with the following statements. [the online platform tailors recommendations, suggestions, or personalized offers based on my preferences.]',
    'please indicate your level of agreement with the following statements. [i feel that the online platform understands my needs and preferences during a crisis.]',
    'please indicate your level of agreement with the following statements. [the online platform takes appropriate measures to protect my privacy.]',
    'please indicate your level of agreement with the following statements. [i trust that my personal information will be handled securely by the online platform.]',
    'please indicate your level of agreement with the following statements. [the online platform provides secure payment methods to protect against fraudulent activities during a crisis.]',
    'please indicate your level of agreement with the following statements. [i trust that my payment details are handled securely by the online platform.]',
    'please indicate your level of agreement with the following statements. [the online platform provides clear and easily accessible policies regarding data handling, privacy, and security.]',
    'please indicate your level of agreement with the following statements. [i feel confident in the online platform\'s transparency regarding its data protection practices.]',
    'please indicate your level of agreement with the following statements. [i am influenced by recommendations and opinions from family and friends when making online purchases during a crisis.]',
    'please indicate your level of agreement with the following statements. [i consider the experiences and suggestions shared by people i know before making online purchases during a crisis.]',
    'please indicate your level of agreement with the following statements. [during a crisis, social media platforms, influencers, and online communities influence my online purchase decisions.]',
    'please indicate your level of agreement with the following statements. [i am likely to make online purchases during a crisis based on what i see or learn from social media platforms.]',
    'please indicate your level of agreement with the following statements. [i consider online reviews and ratings when making purchasing decisions during a crisis.]',
    'please indicate your level of agreement with the following statements. [positive reviews and high ratings increase my confidence in making online purchases during a crisis.]',
    'please indicate your level of agreement with the following statements. [observing others making online purchases during a crisis influences my own intention to make similar purchases.]',
    'please indicate your level of agreement with the following statements. [i am more likely to make online purchases during a crisis if i see others doing the same]',
    'please indicate your level of agreement with the following statements. [social norms and expectations regarding online shopping during a crisis influence my own intention to make online purchases.]',
    'please indicate your level of agreement with the following statements. [i feel pressure to make online purchases during a crisis due to the expectations of others.]',
    'please indicate your level of agreement with the following statements. [i am willing to share my own online shopping experiences, recommendations, or opinions with others during a crisis.]',
    'please indicate your level of agreement with the following statements. [sharing information about my online purchases during a crisis is important for influencing others\' purchase decisions.]',
    'please indicate your level of agreement with the following statements. [overall, i have a positive attitude toward online shopping during a crisis.]',
    'please indicate your level of agreement with the following statements. [i believe that online shopping is a practical and efficient way to make purchases during a crisis.]',
    'please indicate your level of agreement with the following statements. [i am concerned about the security of my personal and financial information when shopping online during a crisis.]',
    'please indicate your level of agreement with the following statements. [i am cautious about the reliability and authenticity of products or services offered by online platforms during a crisis.]',
    'please indicate your level of agreement with the following statements. [overall, i am satisfied with online purchase during a crisis?]',
    'please indicate your level of agreement with the following statements. [i have changed my online shopping behavior during a crisis compared to non-crisis periods?]',
    'please indicate your level of agreement with the following statements. [it was convenient for me to do online shopping during a crisis compared to traditional in-store shopping.]',
    'please indicate your level of agreement with the following statements. [i believe that online shopping during a crisis offers better value for money compared to traditional shopping methods.]'
]

## Gender

In [None]:
# Distinct gender codes present in the EDA frame (order of first appearance).
pd.unique(df_copy2["gender"])

In [None]:
# Decode the numeric gender codes back to readable labels for plotting.
gender_map = {0: 'Male', 1: 'Female', -1: 'Prefer not to say'}
df_copy2['gender_label'] = df_copy2['gender'].map(gender_map)

# Pin the bar order to the map's order. Without an explicit `order`, seaborn
# places string categories in order of first appearance in the data, so the
# positional palette below (skyblue / lightpink / gray) could end up on the
# wrong gender. This also matches how the Age plot passes `order`.
gender_order = list(gender_map.values())

# Countplot using seaborn
plt.figure(figsize=(6, 4))
sns.countplot(
    data=df_copy2,
    x='gender_label',
    order=gender_order,
    palette=['skyblue', 'lightpink', 'gray'],
)
plt.title('Gender Distribution')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# for feature in binary_features:
#     print(f"{feature}: {df_copy2[feature].unique()}")

In [None]:
# Folder that receives the saved EDA figures; created on demand and left
# untouched if it already exists.
output_dir = '/kaggle/working/plots'
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
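# NOTE: this commented-out cell is superseded by the following cell, which runs the same
# analysis but sanitizes plot filenames and writes everything under output_dir.
# from scipy.stats import chi2_contingency, pointbiserialr, kruskal, spearmanr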
# from statsmodels.stats.multitest import multipletests
# import warnings
# warnings.filterwarnings('ignore')

# # 1. Association: Gender vs. Binary Features (Chi-squared Test)
# chi2_results = []
# for feature in binary_features:
#     contingency_table = pd.crosstab(df_copy2['gender_label'], df_copy2[feature])
#     chi2, p, _, _ = chi2_contingency(contingency_table)
#     chi2_results.append({'Feature': feature, 'Chi2': chi2, 'p-value': p})

# # Adjust p-values for multiple testing
# chi2_df = pd.DataFrame(chi2_results)
# chi2_df['p-adjusted'] = multipletests(chi2_df['p-value'], method='bonferroni')[1]

# # Plot significant binary features (p-adjusted < 0.05)
# significant_binary = chi2_df[chi2_df['p-adjusted'] < 0.05]['Feature']
# for feature in significant_binary:
#     plt.figure(figsize=(8, 6))
#     sns.countplot(data=df_copy2, x='gender_label', hue=feature, palette='Set2')
#     plt.title(f'Gender vs. {feature} Usage')
#     plt.xlabel('Gender')
#     plt.ylabel('Count')
#     plt.legend(['Not Used', 'Used'])
#     plt.tight_layout()
#     plt.savefig(f'gender_vs_{feature}_stacked_bar.png')
#     plt.close()

# # 2. Correlation: Gender vs. Binary Features (Point-biserial Correlation)
# # Filter Male and Female only (exclude Prefer not to say)
# df_binary = df_copy2[df_copy2['gender'] != -1].copy()
# df_binary['gender_binary'] = df_binary['gender']  # 0: Male, 1: Female
# pb_results = []
# for feature in binary_features:
#     corr, p = pointbiserialr(df_binary['gender_binary'], df_binary[feature])
#     pb_results.append({'Feature': feature, 'Point-biserial': corr, 'p-value': p})

# # Adjust p-values
# pb_df = pd.DataFrame(pb_results)
# pb_df['p-adjusted'] = multipletests(pb_df['p-value'], method='bonferroni')[1]

# # Plot point-biserial correlations
# plt.figure(figsize=(10, 6))
# sns.barplot(data=pb_df, x='Feature', y='Point-biserial', palette='coolwarm')
# plt.title('Point-biserial Correlation: Gender vs. Binary Features')
# plt.xlabel('Feature')
# plt.ylabel('Correlation Coefficient')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.savefig('gender_vs_binary_correlations.png')
# plt.close()

# # 3. Association: Gender vs. Other Features (Kruskal-Wallis Test)
# kw_results = []
# for feature in other_features:
#     groups = [df_copy2[df_copy2['gender'] == g][feature] for g in [-1, 0, 1]]
#     stat, p = kruskal(*groups)
#     kw_results.append({'Feature': feature, 'Kruskal Stat': stat, 'p-value': p})

# # Adjust p-values
# kw_df = pd.DataFrame(kw_results)
# kw_df['p-adjusted'] = multipletests(kw_df['p-value'], method='bonferroni')[1]

# # Plot significant other features (p-adjusted < 0.05)
# significant_other = kw_df[kw_df['p-adjusted'] < 0.05]['Feature']
# for feature in significant_other:
#     plt.figure(figsize=(8, 6))
#     sns.boxplot(data=df_copy2, x='gender_label', y=feature, palette='Set3')
#     plt.title(f'Gender vs. {feature}')
#     plt.xlabel('Gender')
#     plt.ylabel('Agreement Score (1-5)')
#     plt.tight_layout()
#     plt.savefig(f'gender_vs_{feature}_boxplot.png')
#     plt.close()

# # 4. Correlation: Gender vs. Other Features (Spearman’s Rank Correlation)
# sp_results = []
# for feature in other_features:
#     corr, p = spearmanr(df_copy2['gender'], df_copy2[feature])
#     sp_results.append({'Feature': feature, 'Spearman': corr, 'p-value': p})

# # Adjust p-values
# sp_df = pd.DataFrame(sp_results)
# sp_df['p-adjusted'] = multipletests(sp_df['p-value'], method='bonferroni')[1]

# # Plot Spearman correlations
# plt.figure(figsize=(10, 6))
# sns.barplot(data=sp_df, x='Feature', y='Spearman', palette='coolwarm')
# plt.title('Spearman Correlation: Gender vs. Other Features')
# plt.xlabel('Feature')
# plt.ylabel('Correlation Coefficient')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.savefig('gender_vs_other_correlations.png')
# plt.close()

# # 5. Pair Plot for Selected Features
# # Select top features (e.g., first binary and first other feature, adjust based on significance)
# selected_features = binary_features[:1] + other_features[:1]
# sns.pairplot(df_copy2, vars=selected_features, hue='gender_label', palette='Set1', diag_kind='hist')
# plt.suptitle('Pair Plot of Selected Features by Gender', y=1.02)
# plt.tight_layout()
# plt.savefig('gender_pair_plot.png')
# plt.close()

# # Save results to CSV for reference
# chi2_df.to_csv('chi2_results.csv', index=False)
# pb_df.to_csv('pointbiserial_results.csv', index=False)
# kw_df.to_csv('kruskal_results.csv', index=False)
# sp_df.to_csv('spearman_results.csv', index=False)

# print("EDA completed. Results saved to CSV files and plots saved as PNG files.")

In [None]:
from scipy.stats import chi2_contingency, pointbiserialr, kruskal, spearmanr
from statsmodels.stats.multitest import multipletests
import warnings
import re
import os

# Silence every warning category for the rest of the session (global effect).
warnings.simplefilter('ignore')

# Folder that receives the saved figures; created if missing.
output_dir = '/kaggle/working/plots'
os.makedirs(name=output_dir, exist_ok=True)

def sanitize_filename(name):
    """Turn a feature/column name into a string usable as a file name.

    Steps, in order:
      1. '/' and '.' become '_'.
      2. Any character that is not a word character, whitespace or '-'
         becomes '_'.
      3. Each space becomes '_' (other whitespace such as tabs is kept).

    Args:
        name: The raw feature name.

    Returns:
        The sanitized string; its length equals the input's length.
    """
    flattened = name.replace('/', '_').replace('.', '_')
    cleaned = re.sub(r'[^\w\s.-]', '_', flattened)
    return cleaned.replace(' ', '_')

# Association: Gender vs. Binary Features (Chi-squared Test)
# Cross-tabulate the gender label against each binary feature and record the
# test statistic and raw p-value per feature.
chi2_results = []
for feature in binary_features:
    gender_by_feature = pd.crosstab(df_copy2['gender_label'], df_copy2[feature])
    statistic, p_value, _dof, _expected = chi2_contingency(gender_by_feature)
    chi2_results.append({'Feature': feature, 'Chi2': statistic, 'p-value': p_value})

# Bonferroni correction across all tested features; element [1] of the
# multipletests result holds the corrected p-values.
chi2_df = pd.DataFrame(chi2_results)
chi2_correction = multipletests(chi2_df['p-value'], method='bonferroni')
chi2_df['p-adjusted'] = chi2_correction[1]

# Save a grouped count plot for every feature that stays significant
# (p-adjusted < 0.05). A failure on one feature is reported and skipped so
# the remaining plots are still produced.
significant_binary = chi2_df.loc[chi2_df['p-adjusted'] < 0.05, 'Feature']
for feature in significant_binary:
    try:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(data=df_copy2, x='gender_label', hue=feature, palette='Set2', ax=ax)
        ax.set_title(f'Gender vs. {feature} Usage')
        ax.set_xlabel('Gender')
        ax.set_ylabel('Count')
        ax.legend(['Not Used', 'Used'])
        fig.tight_layout()
        filename = f"{output_dir}/gender_vs_{sanitize_filename(feature)}_stacked_bar.png"
        fig.savefig(filename)
        plt.close(fig)
        print(f"Saved plot: {filename}")
    except Exception as e:
        print(f"Failed to save plot for {feature}: {e}")
        plt.close()

# Correlation: Gender vs. Binary Features (Point-biserial Correlation)
# Only rows coded Male (0) or Female (1) are used; code -1 is filtered out so
# gender is a two-level variable.
df_binary = df_copy2.loc[df_copy2['gender'].ne(-1)].copy()
df_binary['gender_binary'] = df_binary['gender']  # 0: Male, 1: Female

pb_results = []
for feature in binary_features:
    coefficient, p_value = pointbiserialr(df_binary['gender_binary'], df_binary[feature])
    pb_results.append({'Feature': feature, 'Point-biserial': coefficient, 'p-value': p_value})

# Bonferroni correction; element [1] of the result holds the corrected p-values.
pb_df = pd.DataFrame(pb_results)
pb_correction = multipletests(pb_df['p-value'], method='bonferroni')
pb_df['p-adjusted'] = pb_correction[1]

# One bar per feature showing its correlation with gender. Plotting errors
# are reported instead of aborting the rest of the cell.
try:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=pb_df, x='Feature', y='Point-biserial', palette='coolwarm', ax=ax)
    ax.set_title('Point-biserial Correlation: Gender vs. Binary Features')
    ax.set_xlabel('Feature')
    ax.set_ylabel('Correlation Coefficient')
    plt.xticks(rotation=45)
    fig.tight_layout()
    filename = f"{output_dir}/gender_vs_binary_correlations.png"
    fig.savefig(filename)
    plt.close(fig)
    print(f"Saved plot: {filename}")
except Exception as e:
    print(f"Failed to save point-biserial correlation plot: {e}")
    plt.close()

# Association: Gender vs. Other Features (Kruskal-Wallis Test)
# For each agreement item, compare its scores across the three gender codes
# (-1, 0, 1).
kw_results = []
for feature in other_features:
    scores_by_gender = []
    for code in (-1, 0, 1):
        scores_by_gender.append(df_copy2.loc[df_copy2['gender'] == code, feature])
    h_stat, p_value = kruskal(*scores_by_gender)
    kw_results.append({'Feature': feature, 'Kruskal Stat': h_stat, 'p-value': p_value})

# Bonferroni correction; element [1] of the result holds the corrected p-values.
kw_df = pd.DataFrame(kw_results)
kw_correction = multipletests(kw_df['p-value'], method='bonferroni')
kw_df['p-adjusted'] = kw_correction[1]

# Save a box plot for every item that stays significant (p-adjusted < 0.05).
# A failure on one item is reported and skipped.
significant_other = kw_df.loc[kw_df['p-adjusted'] < 0.05, 'Feature']
for feature in significant_other:
    try:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.boxplot(data=df_copy2, x='gender_label', y=feature, palette='Set3', ax=ax)
        ax.set_title(f'Gender vs. {feature}')
        ax.set_xlabel('Gender')
        ax.set_ylabel('Agreement Score (1-5)')
        fig.tight_layout()
        filename = f"{output_dir}/gender_vs_{sanitize_filename(feature)}_boxplot.png"
        fig.savefig(filename)
        plt.close(fig)
        print(f"Saved plot: {filename}")
    except Exception as e:
        print(f"Failed to save plot for {feature}: {e}")
        plt.close()

# Correlation: Gender vs. Other Features (Spearman's Rank Correlation)
# Gender code -1 ('Prefer not to say') is a label, not a rank: leaving it in
# would make the rank correlation treat it as "below Male". Those rows are
# excluded here, the same way the point-biserial step filters gender != -1,
# so the coefficient compares Male (0) against Female (1) only.
gender_known = df_copy2[df_copy2['gender'] != -1]

sp_results = []
for feature in other_features:
    corr, p = spearmanr(gender_known['gender'], gender_known[feature])
    sp_results.append({'Feature': feature, 'Spearman': corr, 'p-value': p})

# Adjust p-values (Bonferroni); element [1] holds the corrected p-values.
sp_df = pd.DataFrame(sp_results)
sp_df['p-adjusted'] = multipletests(sp_df['p-value'], method='bonferroni')[1]

# Plot Spearman correlations, one bar per agreement item. Plotting errors are
# reported instead of aborting the rest of the cell.
try:
    plt.figure(figsize=(10, 6))
    sns.barplot(data=sp_df, x='Feature', y='Spearman', palette='coolwarm')
    plt.title('Spearman Correlation: Gender vs. Other Features')
    plt.xlabel('Feature')
    plt.ylabel('Correlation Coefficient')
    plt.xticks(rotation=45)
    plt.tight_layout()
    filename = f"{output_dir}/gender_vs_other_correlations.png"
    plt.savefig(filename)
    plt.close()
    print(f"Saved plot: {filename}")
except Exception as e:
    print(f"Failed to save Spearman correlation plot: {e}")
    plt.close()

# # Pair Plot for Selected Features
# # Select top features (e.g., first binary and first other feature, adjust based on significance)
# selected_features = binary_features[:1] + other_features[:1]
# try:
#     sns.pairplot(df_copy2, vars=selected_features, hue='gender_label', palette='Set1', diag_kind='hist')
#     plt.suptitle('Pair Plot of Selected Features by Gender', y=1.02)
#     plt.tight_layout()
#     filename = f"{output_dir}/gender_pair_plot.png"
#     plt.savefig(filename)
#     plt.close()
#     print(f"Saved plot: {filename}")
# except Exception as e:
#     print(f"Failed to save pair plot: {e}")
#     plt.close()

# Write each result table to its own CSV file (no index column) for reference.
result_tables = (
    ('/kaggle/working/chi2_results.csv', chi2_df),
    ('/kaggle/working/pointbiserial_results.csv', pb_df),
    ('/kaggle/working/kruskal_results.csv', kw_df),
    ('/kaggle/working/spearman_results.csv', sp_df),
)
for csv_path, table in result_tables:
    table.to_csv(csv_path, index=False)

print("EDA completed. Results saved to CSV files in /kaggle/working/ and plots saved as PNG files in /kaggle/working/plots/")

## Age

In [None]:
# Distinct age-bracket codes present in the data (order of first appearance).
pd.unique(df_copy["age"])

In [None]:
# Decode the numeric age codes back to their bracket labels for plotting.
# NOTE(review): this adds 'age_group' to df_copy itself, whereas the Gender
# section works on df_copy2 — confirm that mutating df_copy is intended.
age_map = {0: '18 - 25', 1: '25 - 35', 2: '35 - 45', 3: '45 - 55'}
df_copy['age_group'] = df_copy['age'].map(age_map)

# Count plot with the bars in ascending bracket order (the map's order).
age_order = list(age_map.values())
fig, ax = plt.subplots(figsize=(7, 4))
sns.countplot(data=df_copy, x='age_group', order=age_order, palette='pastel', ax=ax)
ax.set_title('Age Group Distribution')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
plt.xticks(rotation=0)
fig.tight_layout()
plt.show()