## Note
* **A detailed analysis is given in the report.**

In [5]:
import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/online-payment-intentions-preprocessed/df_copy - with platform columns (shortened column names).csv
/kaggle/input/online-payment-intentions-preprocessed/df_copy - without platform columns.csv
/kaggle/input/online-payment-intentions-preprocessed/df_copy - with platform columns.csv
/kaggle/input/online-payment-intentions/In22-CS3121-Project Dataset.xlsx


In [6]:
# Load the preprocessed survey export (the variant with shortened column names).
preprocessed_csv_path = "/kaggle/input/online-payment-intentions-preprocessed/df_copy - with platform columns (shortened column names).csv"
df_copy = pd.read_csv(filepath_or_buffer=preprocessed_csv_path)

In [7]:
# Work on an independent (deep) copy so the loaded frame `df_copy` stays untouched.
df_copy2 = df_copy.copy(deep=True)

# Descriptive Data Analysis

In [8]:
# Drop the five `*_list` columns before the descriptive analysis.
list_columns = ['platforms_list', 'specialty_platform_list', 'pharmacy_list', 'retailer_list', 'grocery_delivery_list']
df_copy2 = df_copy2.drop(columns=list_columns)

In [9]:
# Tabulate every column name of df_copy2 next to its pandas dtype.
dtype_by_column = df_copy2.dtypes
column_info = pd.DataFrame({
    "Column Name": dtype_by_column.index,
    "Data Type": dtype_by_column.to_numpy()
})
# Lift the row limit (a session-wide pandas option) so the whole table is printed.
pd.set_option('display.max_rows', None)
print(column_info)

                                           Column Name Data Type
0                                           Unnamed: 0     int64
1                                               gender     int64
2                                                  age     int64
3                           highest level of education     int64
4                                       marital status     int64
5           have used online shopping platforms before     int64
6                              professional background     int64
7                         general e-commerce platforms    object
8                 specialty online stores / automobile    object
9                                    online pharmacies    object
10                        fashion and beauty retailers    object
11                           grocery delivery services    object
12   it is easy to navigate through the online plat...     int64
13   the online platform provides clear and intuiti...     int64
14   it was easy for me t

In [10]:
# Save the column/dtype table to the working directory, without the index column.
column_info.to_excel(excel_writer="column_info.xlsx", index=False)

**Age:** <br>
age_map = {0: '18-25', 1: '25-35', 2: '35-45', 3: '45-55'} <br><br>
**Highest level of education:** <br>
education_map = {
    0: 'Grade 8 pass', 1: 'High school', 2: 'Diploma / Certificate Course',
    3: "Bachelor's Degree", 4: "Master's Degree or higher", 5: "Postgraduate Diploma"
} <br> <br>
**Marital status:** <br>
marital_stat_map = {0: 'Single', 1: 'Married'} <br><br>
**Professional Background:** <br>
profession_map = {0: 'Employed (full-time)', 1: 'Employed (part-time)', 2: 'Student', 3: 'Unemployed', 4: 'Self-employed / Entrepreneur'}

In [11]:
# Nominal (binary): yes/no questions. The nominal-statistics code labels these
# columns with a {0: 'No', 1: 'Yes'} table.
binary_features = [
    'have used online shopping platforms before', 'have you made online purchases during crisis time?',
]

# Nominal platform-group columns. Each raw answer is a comma-separated string of
# platform names or a sentinel text; the EDA collapses every answer to
# "used", "not used", or "unknown".
platform_groups = [
    'general e-commerce platforms', 'specialty online stores / automobile', 'online pharmacies', 
    'fashion and beauty retailers', 'grocery delivery services'
]

# Ordinal questionnaire items coded 0-4 (labelled 'Strongly Disagree' .. 'Strongly Agree'
# by the ordinal-statistics code); no interval between levels is specified.
# Each string must match a df_copy2 column name exactly, including punctuation.
other_features = [
    'it is easy to navigate through the online platform to find the products or services i need',
    'the online platform provides clear and intuitive navigation options',
    'it was easy for me to learn how to use the online platform for making online purchases',
    'i quickly became proficient in using the online platform for making online purchases during a crisis',
    'the instructions provided on the online platform are clear and easy to understand',
    'i can easily follow the instructions given on the online platform for making online purchases',
    'the online platform responds quickly to my actions, such as loading pages and processing transactions.',
    'i don\'t experience delays or long waiting times when using the online platform for making online purchases during a crisis',
    'the online platform effectively handles errors or mistakes, such as providing clear error messages and easy recovery options',
    'i rarely encounter errors or issues when using the online platform for making online purchases during a crisis',
    'the online platform offers a wide range of products and services that meet my needs during a crisis',
    'i can find the products or services i need on the online platform during a crisis',
    'using the online platform for making purchases during a crisis is convenient and saves time',
    'the online platform makes it easy to search for products and complete transactions during a crisis',
    'the online platform offers competitive prices, discounts, or cost-saving benefits during a crisis',
    'i perceive that using the online platform for making purchases during a crisis can help me save money',
    'the online platform provides detailed and accurate product information during a crisis',
    'i can easily access user reviews, ratings, and other relevant information to support my purchase decisions',
    'the online platform tailors recommendations, suggestions, or personalized offers based on my preferences',
    'i feel that the online platform understands my needs and preferences during a crisis',
    'the online platform takes appropriate measures to protect my privacy',
    'i trust that my personal information will be handled securely by the online platform',
    'the online platform provides secure payment methods to protect against fraudulent activities during a crisis',
    'i trust that my payment details are handled securely by the online platform',
    'the online platform provides clear and easily accessible policies regarding data handling, privacy, and security',
    'i feel confident in the online platform\'s transparency regarding its data protection practices',
    'i am influenced by recommendations and opinions from family and friends when making online purchases during a crisis',
    'i consider the experiences and suggestions shared by people i know before making online purchases during a crisis',
    'during a crisis, social media platforms, influencers, and online communities influence my online purchase decisions',
    'i am likely to make online purchases during a crisis based on what i see or learn from social media platforms',
    'i consider online reviews and ratings when making purchasing decisions during a crisis',
    'positive reviews and high ratings increase my confidence in making online purchases during a crisis',
    'observing others making online purchases during a crisis influences my own intention to make similar purchases',
    'i am more likely to make online purchases during a crisis if i see others doing the same',
    'social norms and expectations regarding online shopping during a crisis influence my own intention to make online purchases',
    'i feel pressure to make online purchases during a crisis due to the expectations of others',
    'i am willing to share my own online shopping experiences, recommendations, or opinions with others during a crisis',
    'sharing information about my online purchases during a crisis is important for influencing others\' purchase decisions',
    'overall, i have a positive attitude toward online shopping during a crisis',
    'i believe that online shopping is a practical and efficient way to make purchases during a crisis',
    'i am concerned about the security of my personal and financial information when shopping online during a crisis',
    'i am cautious about the reliability and authenticity of products or services offered by online platforms during a crisis',
    'overall, i am satisfied with online purchase during a crisis?',
    'i have changed my online shopping behavior during a crisis compared to non-crisis periods?',
    'it was convenient for me to do online shopping during a crisis compared to traditional in-store shopping',
    'i believe that online shopping during a crisis offers better value for money compared to traditional shopping methods'
]

In [12]:
# Show the distinct raw answers recorded in each platform-group column.
for group_column in platform_groups:
    distinct_answers = df_copy2[group_column].unique()
    print("Unique values of", group_column, ":\n", distinct_answers, "\n")

Unique values of general e-commerce platforms :
 ['daraz, ikman' 'daraz, kapruka, ikman' 'daraz, wow' 'daraz, kapruka'
 'kapruka, ikman, takas, wishque' 'daraz, kapruka, ikman, takas, wow'
 'kapruka, ali-express' 'daraz' 'daraz, kapruka, ikman, wishque'
 'daraz, ikman, wow' 'ikman' 'kapruka' 'daraz, ali-express'
 'daraz, strong.lk' 'ali-express, ebay' 'kapruka, ikman' 'daraz, keels'
 'daraz, ikman, takas, wow' 'daraz, keels, wishque'
 'daraz, kapruka, ikman, amazon ,ebay' 'kapruka, ikman, takas, wow'
 'kapruka, ikman, takas' 'daraz, kapruka, ikman, takas'
 'daraz, kapruka, pickme' 'daraz, kapruka, wow' 'ebay' 'daraz, wishque'
 'daraz, kapruka, lassana.com' 'daraz, takas, wow'
 'daraz, kapruka, ikman, wow' 'daraz, ikman, takas'
 'daraz, instagram stores' 'shein'] 

Unique values of specialty online stores / automobile :
 ['specialty online stores / automobile unknown' 'patpat.lk'
 'patpat.lk, riyasewana.lk' 'lankavechicle.com' 'riyasewana.lk'
 'autolanka.com' 'autolanka.com, riyasewana.

In [13]:
# Both lists are derived from the feature lists defined earlier in the notebook
# (`platform_groups`, `binary_features`, `other_features`) instead of repeating
# every column name, so the questionnaire strings exist in one place only and
# the two groupings cannot drift apart. Element order is the same as before.

# Nominal features: three coded demographics, the five platform-group columns,
# then the two yes/no questions.
nominal = ['marital status', 'gender', 'professional background'] + platform_groups + binary_features

# Ordinal features: the two ordered demographics followed by every 0-4 questionnaire item.
ordinal = ['age', 'highest level of education'] + other_features

# Descriptive Analysis: Numeric

## Nominal Data

In [14]:
# Sentinel answers per platform group: the exact text that marks a group as
# "not used" or "unknown". A None entry never equals a stripped string, so the
# group it belongs to has no such sentinel.
not_used_values = dict([
    ('general e-commerce platforms', None),
    ('specialty online stores / automobile', 'zero specialty online stores / automobile'),
    ('online pharmacies', 'no online pharmacies'),
    ('fashion and beauty retailers', 'no fashion and beauty retailers'),
    ('grocery delivery services', 'grocery delivery services not used'),
])

unknown_values = dict([
    ('general e-commerce platforms', None),
    ('specialty online stores / automobile', 'specialty online stores / automobile unknown'),
    ('online pharmacies', 'online pharmacy unknown'),
    ('fashion and beauty retailers', 'unknown fashion and beauty retailers'),
    ('grocery delivery services', 'unknown grocery delivery services'),
])

# Integer code -> display label for the coded nominal columns.
marital_stat_map = dict(enumerate(['Single', 'Married']))
profession_map = dict(enumerate([
    'Employed (full-time)', 'Employed (part-time)', 'Student', 'Unemployed', 'Self-employed / Entrepreneur',
]))
gender_map = {**dict(enumerate(['Male', 'Female'])), -1: 'Prefer not to say'}
binary_map = dict(enumerate(['No', 'Yes']))

# Function to categorize platform group responses
def categorize_platform_response(value, group):
    """Classify one raw platform-group answer as 'used', 'not used' or 'unknown'.

    value: raw cell content from a platform-group column (may be missing).
    group: name of the platform-group column; used to look up that group's
        sentinel texts in `unknown_values` and `not_used_values`.

    Missing values and the group's "unknown" sentinel give 'unknown'; the
    group's "not used" sentinel gives 'not used'; any other truthy value gives
    'used'; a falsy value such as an empty string gives 'unknown'.
    """
    if pd.isna(value):
        return 'unknown'

    # Sentinels are compared against the whitespace-stripped text of the answer.
    cleaned = str(value).strip()
    if group in unknown_values and cleaned == unknown_values[group]:
        return 'unknown'
    if group in not_used_values and cleaned == not_used_values[group]:
        return 'not used'

    return 'used' if value else 'unknown'

results = []

# Label tables for the coded nominal columns. Any other non-platform feature is
# treated as a yes/no question and labelled through `binary_map`.
label_maps = {
    'marital status': marital_stat_map,
    'professional background': profession_map,
    'gender': gender_map,
}

for feature in nominal:
    if feature in platform_groups:
        # Collapse the raw answer to used / not used / unknown and keep that
        # status as an extra `<feature>_status` column on df_copy2.
        status_column = f'{feature}_status'
        df_copy2[status_column] = df_copy2[feature].apply(
            lambda raw, group=feature: categorize_platform_response(raw, group)
        )
        data = df_copy2[status_column].dropna()
        categories = ['used', 'not used', 'unknown']
    else:
        mapping = label_maps.get(feature, binary_map)
        # Codes that are absent from the mapping become NaN and are dropped.
        data = df_copy2[feature].dropna().map(mapping).dropna()
        categories = list(mapping.values())

    total = len(data)

    # Absolute and percentage frequency, in the fixed category order.
    frequency = data.value_counts().reindex(categories, fill_value=0)
    normalized_frequency = (frequency / total * 100).round(2)

    # Mode: every tied value is listed, unless all categories tie.
    mode_values = data.mode().tolist()
    mode = (
        'All values equally frequent'
        if len(mode_values) == len(categories)
        else ', '.join(map(str, mode_values))
    )

    # One output row per category of the feature.
    results.extend(
        {
            'Feature': feature,
            'Category': cat,
            'Frequency': frequency.get(cat, 0),
            'Normalized Frequency (%)': normalized_frequency.get(cat, 0),
            'Mode': mode
        }
        for cat in categories
    )

results_df = pd.DataFrame(results)

# Write the summary table as an Excel sheet under the working directory.
output_dir = '/kaggle/working/numeric/nominal'
os.makedirs(output_dir, exist_ok=True)
output_xlsx = f'{output_dir}/nominal_data_statistics.xlsx'
results_df.to_excel(output_xlsx, index=False)

print(f"Saved nominal data statistics to: {output_xlsx}")

Saved nominal data statistics to: /kaggle/working/numeric/nominal/nominal_data_statistics.xlsx


## Ordinal Data

In [15]:
import warnings

# Install a global "ignore" filter: no warning is shown from here on.
warnings.filterwarnings(action='ignore')

# Integer code -> display label for the ordered features (codes run upwards from 0).
age_map = dict(enumerate(['18-25', '25-35', '35-45', '45-55']))
education_map = dict(enumerate([
    'Grade 8 pass', 'High school', 'Diploma / Certificate Course',
    "Bachelor's Degree", "Master's Degree or higher", "Postgraduate Diploma",
]))
agreement_mapping = dict(enumerate([
    'Strongly Disagree',
    'Disagree',
    'Neutral',
    'Agree',
    'Strongly Agree',
]))

results = []

for feature in ordinal:
    data = df_copy2[feature].dropna()

    # Pick the code -> label table for this feature; every questionnaire item
    # uses the 0-4 agreement scale.
    if feature == 'age':
        mapping = age_map
    elif feature == 'highest level of education':
        mapping = education_map
    else:
        mapping = agreement_mapping
    categories = list(mapping.keys())

    # Codes missing from the mapping are kept as-is rather than dropped.
    mapped_data = data.map(lambda x: mapping.get(x, x)).dropna()

    # Frequency in code order, then re-indexed by display label. From here on
    # every frequency series is indexed by LABEL, not by integer code.
    frequency = data.value_counts().reindex(categories, fill_value=0).rename(index=mapping)

    total = len(data)

    # Normalized Frequency
    normalized_frequency = (frequency / total * 100).round(2)

    # Cumulative Frequency
    cumulative_frequency = frequency.cumsum()

    # Normalized Cumulative Frequency
    normalized_cumulative_frequency = (cumulative_frequency / total * 100).round(2)

    # Mode. Values are passed through str() because unmapped codes stay
    # numeric and would make a plain ', '.join() raise TypeError.
    mode_values = mapped_data.mode().tolist()
    if not mode_values:
        mode = 'No mode'
    elif len(mode_values) == len(categories):
        mode = 'All values equally frequent'
    else:
        mode = ', '.join(map(str, mode_values))

    # Median. An empty column has no median (np.median would return NaN and
    # int(NaN) raises), so it is reported as 'N/A'.
    if data.empty:
        median = 'N/A'
    else:
        median_value = float(np.median(data))
        lower_idx = int(np.floor(median_value))
        upper_idx = int(np.ceil(median_value))
        if lower_idx == upper_idx:
            # Whole-number median: report its label when the code is known.
            median = mapping.get(lower_idx, 'N/A')
        elif lower_idx in mapping and upper_idx in mapping:
            # Median falls between two adjacent codes.
            median = f"{mapping[lower_idx]} to {mapping[upper_idx]}"
        else:
            median = 'N/A'

    for cat in categories:
        cat_label = mapping[cat]
        # Look values up by label, which is how the series are indexed. Using
        # the integer code only worked through pandas' deprecated positional
        # fallback; without that fallback every lookup returns the default 0.
        results.append({
            'Feature': feature,
            'Category': cat_label,
            'Frequency': frequency.get(cat_label, 0),
            'Normalized Frequency (%)': normalized_frequency.get(cat_label, 0),
            'Cumulative Frequency': cumulative_frequency.get(cat_label, 0),
            'Normalized Cumulative Frequency (%)': normalized_cumulative_frequency.get(cat_label, 0),
            'Mode': mode,
            'Median': median
        })

results_df = pd.DataFrame(results)

# Write the summary table as an Excel sheet under the working directory.
output_dir = '/kaggle/working/numeric/ordinal'
os.makedirs(output_dir, exist_ok=True)
output_xlsx = f'{output_dir}/ordinal_data_statistics.xlsx'
results_df.to_excel(output_xlsx, index=False)
print(f"Saved ordinal data statistics to: {output_xlsx}")

Saved ordinal data statistics to: /kaggle/working/numeric/ordinal/ordinal_data_statistics.xlsx


# Descriptive Analysis: Visual

## Nominal

In [16]:
import matplotlib.pyplot as plt
import zipfile
import os

# Folder that receives the nominal-feature charts; created on first use.
output_dir = '/kaggle/working/visual/nominal'
os.makedirs(name=output_dir, exist_ok=True)

# Integer code -> display label for the coded nominal columns.
marital_stat_map = dict(zip((0, 1), ('Single', 'Married')))
profession_map = dict(zip(
    range(5),
    ('Employed (full-time)', 'Employed (part-time)', 'Student', 'Unemployed', 'Self-employed / Entrepreneur'),
))
gender_map = dict(zip((0, 1, -1), ('Male', 'Female', 'Prefer not to say')))
binary_map = dict(zip((0, 1), ('No', 'Yes')))

# Columns holding the comma-separated platform answers, one per platform group.
platform_groups = [
    'general e-commerce platforms',
    'specialty online stores / automobile',
    'online pharmacies',
    'fashion and beauty retailers',
    'grocery delivery services',
]

# Exact sentinel text marking a group as "not used" / "unknown".
# None never equals a stripped string, so that group has no sentinel.
not_used_values = dict(zip(platform_groups, (
    None,
    'zero specialty online stores / automobile',
    'no online pharmacies',
    'no fashion and beauty retailers',
    'grocery delivery services not used',
)))

unknown_values = dict(zip(platform_groups, (
    None,
    'specialty online stores / automobile unknown',
    'online pharmacy unknown',
    'unknown fashion and beauty retailers',
    'unknown grocery delivery services',
)))

def categorize_platform_response(value, group):
    """Map a raw platform-group answer to 'used', 'not used' or 'unknown'.

    value: raw cell content from the `group` column (may be missing).
    group: platform-group column name; selects the sentinel texts to compare
        against in `unknown_values` and `not_used_values`.

    Returns 'unknown' for a missing value, for the group's "unknown" sentinel
    and for a falsy value; 'not used' for the group's "not used" sentinel;
    'used' for everything else.
    """
    if pd.isna(value):
        return 'unknown'

    answer = str(value).strip()
    # The "unknown" sentinel is checked before the "not used" one.
    for sentinel_table, status in ((unknown_values, 'unknown'), (not_used_values, 'not used')):
        if group in sentinel_table and answer == sentinel_table[group]:
            return status

    if value:
        return 'used'
    return 'unknown'

required_columns = [
    'marital status', 'gender', 'professional background',
    'have used online shopping platforms before',
    'have you made online purchases during crisis time?',
] + platform_groups
missing_columns = [col for col in required_columns if col not in df_copy2.columns]


def _save_pie_chart(counts, title, legend_title, file_name, **pie_kwargs):
    """Draw `counts` as a pie chart and save it as `file_name` inside `output_dir`.

    counts: pandas Series of category counts; its index supplies the wedge
        labels and the legend entries.
    title: chart title.
    legend_title: heading shown above the legend entries.
    file_name: name of the PNG written into the module-level `output_dir`.
    pie_kwargs: extra keyword arguments forwarded to `plt.pie`.
    """
    plt.figure(figsize=(10, 8))
    plt.pie(
        counts.values, labels=counts.index, autopct='%1.1f%%', startangle=90,
        textprops={'fontsize': 16}, wedgeprops={'edgecolor': 'black', 'linewidth': 0.5},
        **pie_kwargs
    )
    # Every pie chart now uses the same title padding (the marital-status chart
    # previously had none).
    plt.title(title, fontsize=20, pad=20)
    plt.legend(counts.index, title=legend_title, loc="best", fontsize=14)
    plt.axis('equal')
    plt.savefig(os.path.join(output_dir, file_name))
    # Close the figure so charts produced in a loop do not accumulate in memory.
    plt.close()


if missing_columns:
    print(f"Warning: Missing columns in df_copy2: {missing_columns}")
else:
    # All required columns exist from here on, so no per-chart column checks
    # are needed.
    # NOTE: this overwrites the integer codes in df_copy2 with display labels.
    # Code that maps the integer codes itself (such as the nominal statistics
    # computed earlier in the notebook) has to run before this cell.
    df_copy2 = df_copy2.replace({
        'marital status': marital_stat_map,
        'gender': gender_map,
        'professional background': profession_map,
        'have used online shopping platforms before': binary_map,
        'have you made online purchases during crisis time?': binary_map
    })

    # Pie charts for the low-cardinality columns:
    # (column, chart title, legend title, output file name)
    pie_specs = [
        ('marital status', 'Marital Status Distribution',
         "Marital Status", 'marital_status_pie_chart.png'),
        ('gender', 'Gender Distribution',
         "Gender", 'gender_pie_chart.png'),
        ('have used online shopping platforms before', 'Have Used Online Shopping Platforms Before',
         'Online Shopping', 'used_platforms_pie_chart.png'),
        ('have you made online purchases during crisis time?', 'Online Purchases During Crisis',
         "Online Purchases", 'crisis_purchases_pie_chart.png'),
    ]
    for column, title, legend_title, file_name in pie_specs:
        _save_pie_chart(df_copy2[column].value_counts(), title, legend_title, file_name)

    # Bar Chart for professional background
    prof_counts = df_copy2['professional background'].value_counts()
    plt.figure(figsize=(12, 10))
    bars = plt.bar(prof_counts.index, prof_counts.values, color='skyblue', label='Count')

    # Print each bar's count just above the bar.
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 5, yval, ha='center', va='bottom')

    plt.title('Professional Background Distribution', fontsize=14)
    plt.xlabel('Profession', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(rotation=35, ha='right', fontsize=12)
    plt.legend(handles=[bars[0]], labels=['Count'], fontsize=12)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'professional_background_bar_chart.png'))
    plt.close()

    # One usage pie chart per platform group. Only statuses that actually occur
    # get a wedge, so a zero-count status is not drawn.
    for platform in platform_groups:
        status_counts = df_copy2[platform].apply(
            lambda x: categorize_platform_response(x, platform)
        ).value_counts()
        _save_pie_chart(
            status_counts,
            f'{platform} Usage Distribution',
            "platform",
            f'{platform.replace(" ", "_").replace("/", "_")}_pie_chart.png',
            pctdistance=0.85,
        )

In [17]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os
import ast

# Charts from this cell are saved into the nominal visual folder; create it if it is missing.
output_dir = '/kaggle/working/visual/nominal'
os.makedirs(name=output_dir, mode=0o777, exist_ok=True)

# Per-group sentinel texts as (group, "not used" text, "unknown" text).
# None means the group has no sentinel of that kind.
sentinel_rows = [
    ('general e-commerce platforms', None, None),
    ('specialty online stores / automobile',
     'zero specialty online stores / automobile', 'specialty online stores / automobile unknown'),
    ('online pharmacies', 'no online pharmacies', 'online pharmacy unknown'),
    ('fashion and beauty retailers',
     'no fashion and beauty retailers', 'unknown fashion and beauty retailers'),
    ('grocery delivery services',
     'grocery delivery services not used', 'unknown grocery delivery services'),
]

not_used_values = {group: not_used_text for group, not_used_text, _unknown_text in sentinel_rows}

unknown_values = {group: unknown_text for group, _not_used_text, unknown_text in sentinel_rows}

def categorize_platform_response(value, group):
    """Reduce a raw platform-group answer to 'used', 'not used' or 'unknown'.

    value: raw cell content from the `group` column (may be missing).
    group: platform-group column name, the key into `unknown_values` and
        `not_used_values`.

    A missing value, the group's "unknown" sentinel, or a falsy value yields
    'unknown'; the group's "not used" sentinel yields 'not used'; any other
    value yields 'used'.
    """
    if pd.isna(value):
        return 'unknown'

    # Compare the whitespace-stripped text against this group's sentinels.
    text = str(value).strip()
    is_unknown = group in unknown_values and text == unknown_values[group]
    is_not_used = group in not_used_values and text == not_used_values[group]

    if is_unknown:
        return 'unknown'
    if is_not_used:
        return 'not used'
    return 'used' if value else 'unknown'

# Tag Cloud for individual platform popularity
def extract_platforms(text, sentinels=None):
    """Split one raw platform-group answer into individual platform names.

    text: raw cell content, a comma-separated string of platform names
        (may be missing).
    sentinels: optional collection of answer texts that mean "no platform".
        When omitted, every non-None value of the module-level
        `unknown_values` and `not_used_values` tables is used.

    Returns a list of stripped, non-empty platform names. The list is empty
    for a missing value and for a sentinel answer.

    The raw columns store the long sentinel texts (for example
    'no online pharmacies'), not the short status labels 'unknown' /
    'not used'. Checking only the short labels let the sentinel texts through
    as if they were platform names, so both forms are filtered here.
    """
    if pd.isna(text):
        return []

    if sentinels is None:
        sentinels = {
            sentinel
            for table in (unknown_values, not_used_values)
            for sentinel in table.values()
            if sentinel is not None
        }

    cleaned = str(text).strip()
    if cleaned in ('unknown', 'not used') or cleaned in sentinels:
        return []

    return [name for name in (part.strip() for part in cleaned.split(',')) if name]

# Overall tag cloud: every platform name mentioned in any platform group.
all_platforms = []
for feature in platform_groups:
    platforms = df_copy2[feature].apply(extract_platforms)
    all_platforms.extend([p for sublist in platforms for p in sublist if p])

# Count whole platform names rather than joining them into one text.
# WordCloud.generate() re-tokenises its input on word boundaries, which
# splits multi-word names apart and drops stop words, so the rendered sizes
# would no longer reflect how often each platform was mentioned.
platform_frequencies = pd.Series(all_platforms, dtype='object').value_counts().to_dict()

if platform_frequencies:
    wordcloud = WordCloud(
        width=800, height=400, background_color='white', min_font_size=10
    ).generate_from_frequencies(platform_frequencies)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title('Popularity of Individual Platforms Across All Groups', fontsize=16, pad=20)
    plt.axis('off')
    plt.savefig(os.path.join(output_dir, 'platform_tag_cloud.png'))
    plt.close()
else:
    # WordCloud raises ValueError when it is given no words at all.
    print("Warning: No platform names found, skipping platform_tag_cloud.png.")

# One tag cloud per platform group, saved as '<group>_tag_cloud.png'.
for platform in platform_groups:
    # Platform names mentioned in the current group only
    group_platform_names = [
        name
        for names in df_copy2[platform].apply(extract_platforms)
        for name in names
        if name
    ]

    # Size each name by its number of mentions. Passing counts keeps
    # multi-word names intact, whereas WordCloud.generate() would split a
    # joined text back into single words.
    name_frequencies = pd.Series(group_platform_names, dtype='object').value_counts().to_dict()
    if not name_frequencies:
        # WordCloud raises ValueError on an empty vocabulary; skip this
        # group instead of aborting the remaining ones.
        print(f"Warning: No platform names found for {platform}, skipping tag cloud.")
        continue

    # Generate word cloud
    wordcloud = WordCloud(
        width=800, height=400, background_color='white', min_font_size=10
    ).generate_from_frequencies(name_frequencies)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Popularity of Individual Platforms in {platform}', fontsize=16, pad=20)
    plt.axis('off')
    plt.savefig(os.path.join(output_dir, f'{platform.replace(" ", "_").replace("/", "_")}_tag_cloud.png'))
    plt.close()

# Stacked bar chart of the used / not used / unknown tallies per platform group.
status_order = ['used', 'not used', 'unknown']

# For each group: store the per-response label in '<group>_status', then
# count the labels in a fixed order so every group shares the same rows.
categorized_data = pd.DataFrame()
for group in platform_groups:
    status_column = f'{group}_status'
    df_copy2[status_column] = df_copy2[group].apply(categorize_platform_response, args=(group,))
    categorized_data[group] = df_copy2[status_column].value_counts().reindex(status_order, fill_value=0)

# Rows (statuses) become the x positions; each group is one stacked segment.
ax = categorized_data.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set_title('Distribution of Platform Usage Status', fontsize=16)
ax.set_xlabel('Status', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=0)
ax.legend(title='Platform Group', fontsize=14)
ax.figure.savefig(os.path.join(output_dir, 'platform_usage_stacked_bar.png'))
plt.close(ax.figure)

In [18]:
# Bundle the nominal-feature figures into one archive inside output_dir.
zip_filename = 'plots.zip'
zip_path = os.path.join(output_dir, zip_filename)

# Figures expected in output_dir; any that are missing are reported and skipped.
nominal_plot_files = [
    'marital_status_pie_chart.png',
    'gender_pie_chart.png',
    'used_platforms_pie_chart.png',
    'crisis_purchases_pie_chart.png',
    'professional_background_bar_chart.png',
    'general_e-commerce_platforms_pie_chart.png',
    'specialty_online_stores___automobile_pie_chart.png',
    'online_pharmacies_pie_chart.png',
    'fashion_and_beauty_retailers_pie_chart.png',
    'grocery_delivery_services_pie_chart.png',
    'general_e-commerce_platforms_tag_cloud.png',
    'specialty_online_stores___automobile_tag_cloud.png',
    'online_pharmacies_tag_cloud.png',
    'fashion_and_beauty_retailers_tag_cloud.png',
    'grocery_delivery_services_tag_cloud.png',
    'platform_tag_cloud.png',
    'platform_usage_stacked_bar.png',
]

with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for plot_name in nominal_plot_files:
        plot_path = os.path.join(output_dir, plot_name)
        if not os.path.exists(plot_path):
            print(f"Warning: {plot_name} not found, skipping.")
            continue
        # Store under the bare file name so the archive has no folder prefix.
        archive.write(plot_path, plot_name)

print(f"Plots have been zipped into {zip_path}")

Plots have been zipped into /kaggle/working/visual/nominal/plots.zip


## Ordinal

In [20]:
""" 
Group related features into categories (usability, security, satisfaction) and 
create a clustered bar chart to compare the distribution of responses across a 
few selected representative features within each category.  
"""

# From here on figures go to the folder for ordinal features; create it if
# needed (an already existing folder is not an error).
output_dir = '/kaggle/working/visual/ordinal'
os.makedirs(name=output_dir, exist_ok=True)

# Clustered Bar Charts for Agreement Features
def _as_category(pairs):
    """Turn (feature name, short chart label) pairs into the features/labels dict."""
    features, labels = zip(*pairs)
    return {'features': list(features), 'labels': list(labels)}


# Each category lists three features together with the label shown in the
# chart legend; keeping them as pairs makes the feature/label match explicit.
categories = {
    'Usability': _as_category([
        ('it is easy to navigate through the online platform to find the products or services i need',
         'Ease of Navigation'),
        ('using the online platform for making purchases during a crisis is convenient and saves time',
         'Convenience'),
        ('the online platform responds quickly to my actions, such as loading pages and processing transactions.',
         'Platform Speed'),
    ]),
    'Security': _as_category([
        ('the online platform provides secure payment methods to protect against fraudulent activities during a crisis',
         'Secure Payments'),
        ('i trust that my personal information will be handled securely by the online platform',
         'Trust in Security'),
        ('i am concerned about the security of my personal and financial information when shopping online during a crisis',
         'Security Concerns'),
    ]),
    'Satisfaction': _as_category([
        ('overall, i am satisfied with online purchase during a crisis?',
         'Overall Satisfaction'),
        ('i have changed my online shopping behavior during a crisis compared to non-crisis periods?',
         'Behavior Change'),
        ('i consider online reviews and ratings when making purchasing decisions during a crisis',
         'Influence of Reviews'),
    ]),
}

# One clustered bar chart per category: a cluster per response option, one
# bar per feature inside each cluster.
bar_colors = ['skyblue', 'lightgreen', 'salmon']

for category, info in categories.items():
    features, labels = info['features'], info['labels']

    # Skip the whole category when any of its columns is absent.
    if not all(f in df_copy2.columns for f in features):
        print(f"Warning: One or more features for {category} category not found in df_copy2.")
        continue

    # Response options in mapping order; they define the bar clusters.
    responses = list(agreement_mapping.values())

    # Per feature: how many answers fall on each response option (0 if none).
    counts = []
    for feature in features:
        tally = df_copy2[feature].map(agreement_mapping).value_counts()
        counts.append([tally.get(option, 0) for option in responses])

    x = np.arange(len(agreement_mapping))
    width = 0.25
    fig, ax = plt.subplots(figsize=(12, 8))
    for i, (label, count) in enumerate(zip(labels, counts)):
        # Shift every feature's bars by one bar width inside the cluster.
        ax.bar(x + i * width, count, width, label=label, color=bar_colors[i])

    ax.set_title(f'{category} Responses', fontsize=16)
    ax.set_xlabel('Response', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    # x + width is the middle bar of a three-bar cluster.
    ax.set_xticks(x + width)
    ax.set_xticklabels(responses, rotation=45, ha='right', fontsize=12)
    ax.legend(title='Feature', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=14, title_fontsize=14)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, f'{category.lower()}_clustered_bar_chart.png'))
    plt.close(fig)


# Bar Chart for age
# Count and sort on the integer age code BEFORE translating it to a label.
# Sorting after the mapping orders the label strings alphabetically, which
# is not the order of an ordinal variable.
# NOTE(review): assumes the integer codes increase with age - confirm against age_map.
age_code_counts = df_copy2['age'].value_counts().sort_index()
age_code_counts.index = age_code_counts.index.map(age_map)
# Codes sharing a label are summed; codes missing from age_map (NaN label)
# are dropped, as they were when mapping happened before counting.
age_counts = age_code_counts.groupby(level=0, sort=False).sum()

plt.figure(figsize=(12, 8))
bars = plt.bar(age_counts.index, age_counts.values, color='lightblue')
# Write each count just above its bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.5, yval, ha='center', va='bottom')
plt.title('Age Distribution', fontsize=16)
plt.xlabel('Age Group', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'age_bar_chart.png'))
plt.close()

# Bar Chart for highest level of education
# Count and sort on the integer education code BEFORE translating it to a
# label. Sorting after the mapping orders the label strings alphabetically,
# which is not the order of an ordinal variable.
# NOTE(review): assumes the integer codes increase with education level - confirm against education_map.
edu_code_counts = df_copy2['highest level of education'].value_counts().sort_index()
edu_code_counts.index = edu_code_counts.index.map(education_map)
# Codes sharing a label are summed; codes missing from education_map (NaN
# label) are dropped, as they were when mapping happened before counting.
edu_counts = edu_code_counts.groupby(level=0, sort=False).sum()

plt.figure(figsize=(12, 8))
bars = plt.bar(edu_counts.index, edu_counts.values, color='lightcoral')
# Write each count just above its bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.5, yval, ha='center', va='bottom')
plt.title('Highest Level of Education Distribution', fontsize=16)
plt.xlabel('Education Level', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'education_bar_chart.png'))
plt.close()

# Bundle the ordinal-feature figures into one archive inside output_dir.
zip_filename = 'plots.zip'
zip_path = os.path.join(output_dir, zip_filename)

# Figures expected in output_dir; any that are missing are reported and skipped.
ordinal_plot_files = [
    'usability_clustered_bar_chart.png',
    'security_clustered_bar_chart.png',
    'satisfaction_clustered_bar_chart.png',
    'age_bar_chart.png',
    'education_bar_chart.png',
]

with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for plot_name in ordinal_plot_files:
        plot_path = os.path.join(output_dir, plot_name)
        if not os.path.exists(plot_path):
            print(f"Warning: {plot_name} not found, skipping.")
            continue
        # Store under the bare file name so the archive has no folder prefix.
        archive.write(plot_path, plot_name)

print(f"Plots have been zipped into {zip_path}")

Plots have been zipped into /kaggle/working/visual/ordinal/plots.zip


In [2]:
# Every line of this cell is commented out, so running it does nothing.
# If re-enabled, it deletes each file and sub-directory directly under
# /kaggle/working (which includes the 'visual' plot folders written earlier),
# printing an error for any entry that cannot be removed.
# NOTE(review): the cell still carries recorded output although the code is
# disabled - presumably left over from an earlier run; confirm and clear it.
# import os
# import shutil

# # Define the working directory
# working_dir = '/kaggle/working'

# # Remove all files and subdirectories
# for item in os.listdir(working_dir):
#     item_path = os.path.join(working_dir, item)
#     try:
#         if os.path.isfile(item_path):
#             os.unlink(item_path)
#         elif os.path.isdir(item_path):
#             shutil.rmtree(item_path)
#     except Exception as e:
#         print(f"Error removing {item_path}: {e}")

# print(f"All files and directories in {working_dir} have been cleared.")

All files and directories in /kaggle/working have been cleared.
