## Analysing responses from the market research survey

In [None]:
# import libraries
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
pio.renderers.default = 'notebook'

### User A = Don't use Amenity Data


In [None]:
# read the User A survey responses (comma-separated) into a dataframe
userA_path = "userA_responses.csv"
userA_data = pd.read_csv(userA_path, sep=",")

# preview the first five rows
userA_data.head(5)

#### Basic demographic data
0. Number of respondents in user A
1. County percentages
2. Sector percentages

In [None]:
# number of rows in the User A responses
userA_count = userA_data.shape[0]
print("Number of respondents in User A:",userA_count)

In [None]:
# TODO: check whether these counts are duplicated
# NOTE: multi-answer columns are no longer exploded during preprocessing;
#       that step was removed there and now happens further down

# colour palettes for the county and sector charts
county_colors = px.colors.qualitative.Set1
sector_colors = px.colors.qualitative.Set2

# responses per county / per sector
countyA_count = userA_data["county"].value_counts()
sectorA_count = userA_data["sector"].value_counts()

# ---- pie chart 1: county ----
pie_data_countyA = countyA_count.rename_axis('County').reset_index(name='Count')
fig_countyA = px.pie(
    pie_data_countyA,
    names='County',
    values='Count',
    title="User A County Distribution",
    color_discrete_sequence=county_colors,
)

# label the slices and pull out every county holding more than 5% of responses
fig_countyA.update_traces(
    textinfo='label+percent',
    textposition='inside',
    pull=[0.07 if n > 0.05 * countyA_count.sum() else 0 for n in countyA_count.values],
)

# vertical legend placed to the right of the plot
fig_countyA.update_layout(
    legend={
        "title": "Counties",
        "orientation": "v",
        "x": 1.1,
        "y": 1,
        "xanchor": "left",
        "yanchor": "top",
        "font": {"size": 10},
        "bgcolor": "rgba(255, 255, 255, 0.8)",
    }
)
fig_countyA.show()

# ---- pie chart 2: sector ----
pie_data_sectorA = sectorA_count.rename_axis('Sector').reset_index(name='Count')
fig_sectorA = px.pie(
    pie_data_sectorA,
    names='Sector',
    values='Count',
    title="User A Sector Distribution",
    color_discrete_sequence=sector_colors,
)

# label the slices and pull out every sector holding more than 10% of responses
fig_sectorA.update_traces(
    textinfo='label+percent',
    textposition='inside',
    pull=[0.07 if n > 0.10 * sectorA_count.sum() else 0 for n in sectorA_count.values],
)

# vertical legend placed to the right of the plot
fig_sectorA.update_layout(
    legend={
        "title": "Sectors",
        "orientation": "v",
        "x": 1.1,
        "y": 1,
        "xanchor": "left",
        "yanchor": "top",
        "font": {"size": 10},
        "bgcolor": "rgba(255, 255, 255, 0.8)",
    }
)
fig_sectorA.show()

#### Technology usage
1. Which device is mostly used day-to-day?
2. How often do you use digital tools for navigation?

In [None]:
## explode necessary columns here

def explode_multiple_answers(data, column_names, delimiter=';'):
    """
    Split multi-answer survey columns into one row per individual answer.

    For every name in ``column_names`` that exists in ``data`` a new column
    ``<name>_exploded`` is added. If any cell of the source column contains
    ``delimiter``, the cells are split on it and the frame is exploded so each
    answer sits on its own row; otherwise the new column is a plain copy of
    the source column.

    Parameters:
    - data: DataFrame containing the survey data (it is not modified)
    - column_names: Iterable of column names to process; names that are not
      columns of ``data`` are skipped
    - delimiter: The literal string separating multiple values (default is ';')

    Returns:
    - A new DataFrame with the added ``*_exploded`` columns.

    Notes:
    - Each exploded column multiplies the rows, so exploding several columns
      yields one row per combination of answers and the original index values
      repeat. Count distinct respondent ids, not rows, when aggregating.
    """
    # Work on a copy so the caller's frame never picks up helper columns.
    result = data.copy()

    for column_name in column_names:
        if column_name not in result.columns:
            continue

        exploded_name = column_name + '_exploded'
        column = result[column_name]

        try:
            # regex=False: the delimiter is plain text, not a pattern.
            # na=False: missing answers simply do not contain the delimiter.
            has_delimiter = bool(
                column.str.contains(delimiter, regex=False, na=False).any()
            )
        except AttributeError:
            # Non-text column (e.g. a question nobody answered, read as float
            # NaN) has no .str accessor; there is nothing to split.
            has_delimiter = False

        if has_delimiter:
            # Split the values in the column by the delimiter, then explode
            # the resulting lists into one row per answer.
            result[exploded_name] = column.str.split(delimiter, regex=False)
            result = result.explode(exploded_name)
        else:
            # No delimiter found: keep the original values in the new column.
            result[exploded_name] = column

    return result

# User A columns that may hold several answers in one cell
userAcols_to_explode = [
    "device_personal",
    "why_impractical_demo_personal",
    "other_amenity_personal",
    "other_feature_personal",
]

# adds a "<column>_exploded" column for each of the columns above
users_A_expl = explode_multiple_answers(data=userA_data, column_names=userAcols_to_explode)


In [None]:
# devices that keep their own label; every other answer is grouped under "Other"
device_list = ["Laptop","Smartphone","Desktop computer","Tablet"]

deviceA_answers = users_A_expl["device_personal_exploded"]
users_A_expl["device_personal_exploded"] = deviceA_answers.where(
    deviceA_answers.isin(device_list), "Other"
)

# show the resulting counts per device label
users_A_expl["device_personal_exploded"].value_counts()

In [None]:
# plot device count
# Count distinct respondent ids rather than rows: users_A_expl was exploded on
# several multi-answer columns, so a respondent appears once per combination
# of answers and a plain row count would inflate the device totals.
deviceA_count = users_A_expl.groupby("device_personal_exploded")['id'].nunique()

# create a df for plotting & sort values descending (most common device first)
bar_data_deviceA = pd.DataFrame({'Device': deviceA_count.index, 'Count': deviceA_count.values})
bar_data_deviceA = bar_data_deviceA.sort_values(by='Count', ascending=False)

# plot the bar chart
device_colors = px.colors.qualitative.Set3 
fig_deviceA = px.bar(bar_data_deviceA, x='Device', y='Count', title="User A Preferred Device")

# customizing layout: fixed figure size and legend styling
fig_deviceA.update_layout(
    width=800,
    height=500,
    legend=dict(
        title="Devices",
        orientation="h",
        x=1.1,
        y=1,
        xanchor="left",
        yanchor="top",
        font=dict(size=6),
        bgcolor="rgba(255, 255, 255, 0.8)"
    )
)

# show the plot
fig_deviceA.show()

In [None]:
# tool frequency plot - responses per frequency option & colour palette
toolfrequencyA_count = userA_data["freq_tool_personal"].value_counts()
toolfrequency_colors = px.colors.qualitative.Set3

# two-column frame (option, count) for plotly express
pie_data_toolfreqA = toolfrequencyA_count.rename_axis('Tool Frequency').reset_index(name='Count')
fig_toolfreqA = px.pie(
    pie_data_toolfreqA,
    names='Tool Frequency',
    values='Count',
    title="User A Tool Usage Frequency Distribution",
    color_discrete_sequence=toolfrequency_colors,
)

# pull out slices holding more than 20% of responses & label every slice
fig_toolfreqA.update_traces(
    pull=[0.05 if n > 0.2 * toolfrequencyA_count.sum() else 0 for n in toolfrequencyA_count.values],
    textinfo='label+percent',
    textposition='inside',
)

# vertical legend placed to the right of the plot
fig_toolfreqA.update_layout(
    legend={
        "title": "Frequency",
        "orientation": "v",
        "x": 1.1,
        "y": 1,
        "xanchor": "left",
        "yanchor": "top",
        "font": {"size": 10},
        "bgcolor": "rgba(255, 255, 255, 0.8)",
    }
)

# display pie chart
fig_toolfreqA.show()

#### How useful our product is
1. Usefulness distribution
2. Reasons why impractical

In [None]:
# pie chart - distribution of how useful our product would be
product_usefulA_count = userA_data["demo_useful_personal"].value_counts()
product_useful_colors = px.colors.qualitative.Set3 # change colors

# create df & plot
pie_data_produseA = pd.DataFrame({'Product Usefulness': product_usefulA_count.index, 'Count': product_usefulA_count.values})
fig_produseA = px.pie(pie_data_produseA, names='Product Usefulness', values='Count', title="User A Thoughts on Product Usefulness", color_discrete_sequence=product_useful_colors)

# pull out slices holding more than 20% of responses; show labels & fix layout
fig_produseA.update_traces(
    pull=[0.05 if count > 0.2 * product_usefulA_count.sum() else 0 for count in product_usefulA_count.values]
)
fig_produseA.update_traces(textinfo='label+percent', textposition='inside')
fig_produseA.update_layout(
    legend=dict(
        # the legend lists usefulness ratings, not frequencies
        title="Usefulness",
        orientation="v",
        x=1.1,
        y=1,
        xanchor="left",
        yanchor="top",
        font=dict(size=10),
        bgcolor="rgba(255, 255, 255, 0.8)"
    )
)

# display pie chart
fig_produseA.show()

In [None]:
# table - why impractical

# ratings that count as "impractical"
impractical_ratings_A = ["Somewhat impractical", "Extremely impractical"]
usefulness_A = userA_data['demo_useful_personal']

# parent = the usefulness rating; child = the stated reason when the rating is
# an impractical one, otherwise the rating itself
userA_data['parent'] = usefulness_A
userA_data['child'] = userA_data['why_impractical_demo_personal'].where(
    usefulness_A.isin(impractical_ratings_A), usefulness_A
)

# unique ids per (parent, child) pair, keeping only the impractical ratings
userA_impractical = (
    userA_data.groupby(['parent', 'child'])['id']
    .nunique()
    .reset_index(name='count')
    .loc[lambda pairs: pairs['parent'].isin(impractical_ratings_A)]
)

# two-column table: rating and reason
fig_impracticalA = go.Figure(data=[go.Table(
    header={"values": ['Product Usefulness', 'Why']},
    cells={"values": [userA_impractical['parent'], userA_impractical['child']]},
)])

# centred title
fig_impracticalA.update_layout(
    title={"text": "Product Usefulness and Reasons for Impracticality", "x": 0.5}
)

# display table
fig_impracticalA.show()

#### User emails

In [None]:
# retrieve emails from contact column
def retrieve_emails(dataframe,col_name):
    """
    Collect the entries of a contact column that look like e-mail addresses.

    Parameters:
    - dataframe: DataFrame (or any mapping of column name to values) holding the responses
    - col_name: Name of the column to scan

    Returns:
    - A list of the string values containing "@", in their original order.
      Blank cells (NaN/None) and other non-string values are skipped.
    """
    # The isinstance check keeps `"@" in val` from raising TypeError on
    # missing cells, which pandas represents as float NaN.
    return [val for val in dataframe[col_name] if isinstance(val, str) and "@" in val]

# e-mail addresses left by User A respondents
userA_contact = retrieve_emails(userA_data,"contact_personal")
print(userA_contact)

# save to txt file, one address per line
# 'w' (not 'x') so re-running this cell overwrites the previous export instead
# of failing with FileExistsError; the with-block closes the file itself
with open('userA_emails.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{email}\n" for email in userA_contact)

# reported only after the file has been closed
print("File written successfully")

### User B = Use Amenity Data

In [None]:
# read the User B survey responses (comma-separated) into a dataframe
userB_path = "userB_responses.csv"
userB_data = pd.read_csv(userB_path, sep=",")

# preview the first five rows
userB_data.head(5)

#### Basic demographic data
0. Number of respondents
1. County percentages
2. Sector percentages

In [None]:
# number of rows in the User B responses
userB_count = userB_data.shape[0]
print("Number of respondents in User B:",userB_count)

In [None]:
# county & sector plot - responses per county / per sector
countyB_count = userB_data["county"].value_counts()
sectorB_count = userB_data["sector"].value_counts()

# colours: county_colors / sector_colors are reused from earlier in the notebook

# ---- pie chart 1: county ----
pie_data_countyB = countyB_count.rename_axis('County').reset_index(name='Count')
fig_countyB = px.pie(
    pie_data_countyB,
    names='County',
    values='Count',
    title="User B County Distribution",
    color_discrete_sequence=county_colors,
)

# label the slices and pull out every county holding more than 5% of responses
fig_countyB.update_traces(
    textinfo='label+percent',
    textposition='inside',
    pull=[0.07 if n > 0.05 * countyB_count.sum() else 0 for n in countyB_count.values],
)

# vertical legend placed to the right of the plot
fig_countyB.update_layout(
    legend={
        "title": "Counties",
        "orientation": "v",
        "x": 1.1,
        "y": 1,
        "xanchor": "left",
        "yanchor": "top",
        "font": {"size": 10},
        "bgcolor": "rgba(255, 255, 255, 0.8)",
    }
)

# display county pie chart
fig_countyB.show()

# ---- pie chart 2: sector ----
pie_data_sectorB = sectorB_count.rename_axis('Sector').reset_index(name='Count')
fig_sectorB = px.pie(
    pie_data_sectorB,
    names='Sector',
    values='Count',
    title="User B Sector Distribution",
    color_discrete_sequence=sector_colors,
)

# label the slices and pull out every sector holding more than 10% of responses
fig_sectorB.update_traces(
    textinfo='label+percent',
    textposition='inside',
    pull=[0.07 if n > 0.10 * sectorB_count.sum() else 0 for n in sectorB_count.values],
)

# vertical legend placed to the right of the plot
fig_sectorB.update_layout(
    legend={
        "title": "Sectors",
        "orientation": "v",
        "x": 1.1,
        "y": 1,
        "xanchor": "left",
        "yanchor": "top",
        "font": {"size": 10},
        "bgcolor": "rgba(255, 255, 255, 0.8)",
    }
)

# display sector pie chart
fig_sectorB.show()

#### Technology usage
1. Which device is mostly used day-to-day?
2. How often do you use digital tools for navigation?

In [None]:
# User B columns that may hold several answers in one cell
userBcols_to_explode = [
    "device_work",
    "type_amenity_data_work",
    "type_tool_work",
    "satisfaction_tool_work",
    "why_impractical_demo_work",
    "other_amenity_work",
]

# adds a "<column>_exploded" column for each of the columns above
users_B_expl = explode_multiple_answers(data=userB_data, column_names=userBcols_to_explode)

In [None]:
# group every answer outside device_list (defined in the User A code) under "Other"
deviceB_answers = users_B_expl["device_work_exploded"]
users_B_expl["device_work_exploded"] = deviceB_answers.where(
    deviceB_answers.isin(device_list), "Other"
)

# sanity check: counts per device label
users_B_expl["device_work_exploded"].value_counts()

In [None]:
# plot device count
# Count distinct respondent ids rather than rows: users_B_expl was exploded on
# several multi-answer columns, so a respondent appears once per combination
# of answers and a plain row count would inflate the device totals.
deviceB_count = users_B_expl.groupby("device_work_exploded")['id'].nunique()

# create a df for plotting & sort values descending (most common device first)
bar_data_deviceB = pd.DataFrame({'Device': deviceB_count.index, 'Count': deviceB_count.values})
bar_data_deviceB = bar_data_deviceB.sort_values(by='Count', ascending=False)

# plot the bar chart
fig_deviceB = px.bar(bar_data_deviceB, x='Device', y='Count', title="User B Preferred Device")

# customizing layout: fixed figure size and legend styling
fig_deviceB.update_layout(
    width=800,
    height=500,
    legend=dict(
        title="Devices",
        orientation="h",
        x=1.1,
        y=1,
        xanchor="left",
        yanchor="top",
        font=dict(size=6),
        bgcolor="rgba(255, 255, 255, 0.8)"
    )
)

# show the plot
fig_deviceB.show()

In [None]:
# tool frequency plot - responses per frequency option
toolfrequencyB_count = userB_data["freq_tool_work"].value_counts()

# two-column frame (option, count) for plotly express
pie_data_toolfreqB = toolfrequencyB_count.rename_axis('Tool Frequency').reset_index(name='Count')
fig_toolfreqB = px.pie(
    pie_data_toolfreqB,
    names='Tool Frequency',
    values='Count',
    title="User B Tool Usage Frequency Distribution",
    color_discrete_sequence=toolfrequency_colors,
)

# pull out slices holding more than 20% of responses & label every slice
fig_toolfreqB.update_traces(
    pull=[0.05 if n > 0.2 * toolfrequencyB_count.sum() else 0 for n in toolfrequencyB_count.values],
    textinfo='label+percent',
    textposition='inside',
)

# vertical legend placed to the right of the plot
fig_toolfreqB.update_layout(
    legend={
        "title": "Frequency",
        "orientation": "v",
        "x": 1.1,
        "y": 1,
        "xanchor": "left",
        "yanchor": "top",
        "font": {"size": 10},
        "bgcolor": "rgba(255, 255, 255, 0.8)",
    }
)

# display pie chart
fig_toolfreqB.show()

#### How useful our product is
1. Usefulness distribution
2. Reasons why impractical

In [None]:
# pie chart - distribution of how useful our product would be
product_usefulB_count = userB_data["demo_useful_work"].value_counts()

# create df and plot pie chart
pie_data_produseB = pd.DataFrame({'Product Usefulness': product_usefulB_count.index, 'Count': product_usefulB_count.values})
fig_produseB = px.pie(pie_data_produseB, names='Product Usefulness', values='Count', title="User B Thoughts on Product Usefulness", color_discrete_sequence=product_useful_colors)

# pull out slices holding more than 20% of responses; show labels and adjust layout
fig_produseB.update_traces(
    pull=[0.05 if count > 0.2 * product_usefulB_count.sum() else 0 for count in product_usefulB_count.values]  
)
fig_produseB.update_traces(textinfo='label+percent', textposition='inside')
fig_produseB.update_layout(
    legend=dict(
        # the legend lists usefulness ratings, not frequencies
        title="Usefulness",
        orientation="v",
        x=1.1,
        y=1,
        xanchor="left",
        yanchor="top",
        font=dict(size=10),
        bgcolor="rgba(255, 255, 255, 0.8)"
    )
)

# display pie chart
fig_produseB.show()

In [None]:
# table why impractical

# ratings that count as "impractical"
impractical_ratings_B = ["Somewhat impractical", "Extremely impractical"]
usefulness_B = userB_data['demo_useful_work']

# parent = the usefulness rating; child = the stated reason when the rating is
# an impractical one, otherwise the rating itself
userB_data['parent'] = usefulness_B
userB_data['child'] = userB_data['why_impractical_demo_work'].where(
    usefulness_B.isin(impractical_ratings_B), usefulness_B
)

# unique ids per (parent, child) pair, keeping only the impractical ratings
userB_impractical = (
    userB_data.groupby(['parent', 'child'])['id']
    .nunique()
    .reset_index(name='count')
    .loc[lambda pairs: pairs['parent'].isin(impractical_ratings_B)]
)

# two-column table: rating and reason
fig_impracticalB = go.Figure(data=[go.Table(
    header={"values": ['Product Usefulness', 'Why']},
    cells={"values": [userB_impractical['parent'], userB_impractical['child']]},
)])

# centred title
fig_impracticalB.update_layout(
    title={"text": "Product Usefulness and Reasons for Impracticality", "x": 0.5}
)

# display table
fig_impracticalB.show()

#### User emails
For this weekend: retrieve the county they live in, the sector they work in, and the type of amenity data they access.

In [None]:
# retrieve emails from contact column
userB_contact = retrieve_emails(userB_data,"contact_work")
print(userB_contact)

# save to txt file, one address per line
# 'w' (not 'x') so re-running this cell overwrites the previous export instead
# of failing with FileExistsError; the with-block closes the file itself
with open('userB_emails.txt', 'w', encoding='utf-8') as g:
    g.writelines(f"{email}\n" for email in userB_contact)

# reported only after the file has been closed
print("File written successfully")

### Both users

#### Demographic data
1. Use Amenity or not
2. Sector distribution

In [None]:
# size of each group: User B uses amenity data ("Yes"), User A does not ("No")
userA_count, userB_count = len(userA_data), len(userB_data)
list_opt = ["Yes","No"]
list_count = [userB_count, userA_count]

# two-column frame (answer, count) & pie chart
pie_user_count = pd.DataFrame({'Use amenity': list_opt, 'Count': list_count})
fig_user_count = px.pie(
    pie_user_count,
    names='Use amenity',
    values='Count',
    title="Amenity data usage",
)

# label each slice with its answer and percentage
fig_user_count.update_traces(textinfo='label+percent', textposition='inside')

# display all amenity data usage counts
fig_user_count.show()


In [None]:
# sector all users
# stack the sector answers of both groups and count responses per sector
all_sectors = pd.concat([userA_data["sector"], userB_data["sector"]])
sector_counts = all_sectors.value_counts().rename_axis('sector').reset_index(name='count')

# plot the pie chart
fig_allsector = px.pie(
    sector_counts,
    names='sector',
    values='count',
    title="Sector Distribution for All Users",
    color_discrete_sequence=sector_colors,
)

# label the slices and pull out every sector with an above-average count
mean_sector_count = sector_counts['count'].mean()
fig_allsector.update_traces(
    textinfo='percent+label',
    pull=[0.05 if n > mean_sector_count else 0 for n in sector_counts['count']],
)

# display pie chart
fig_allsector.show()
