In [16]:
%%writefile Data_Quality.py
import streamlit as st
import pandas as pd
import os
from datetime import datetime
import seaborn as sns
from pathlib import Path
from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_zipped_file
from streamlit_pandas_profiling import st_profile_report


st.markdown("# Data Quality Check")
st.sidebar.markdown("# Data Quality Check")
tab1, tab2, tab3, tab4, tab5 = st.tabs(["Transaction", "New Customer", "Customer Demo", 'CustomerAddress', 'Takeaways'])

# progress_text = "Operation in progress. Please wait."
# my_bar = st.progress(0, text=progress_text)
# for percent_complete in range(100):
#     time.sleep(0.1)
#     my_bar.progress(percent_complete + 1, text=progress_text)

    
st.success('Your DataOverview Report is Completed', icon="✅")


@st.cache_resource 
def profiling_transaction(sheet):
    df = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name= sheet)
    if sheet != 'CustomerDemographic':
        df.columns = df.iloc[0,:]
        df  = df.iloc[1:,:]
        df = df.loc[:,~df.columns.isna()]



    profile = ProfileReport(
        df, title="Profile Report of the Transaction Sheet", explorative=True
    )
    
    return profile

with tab1:
    profile = profiling_transaction('Transactions')
    st_profile_report(profile)
    st.download_button(
      'Download  Report',
      data=profile.to_html(),
        file_name = 'Transactions.html',
      help='Click  to get you own insights!'
)

    
with tab2:
    profile = profiling_transaction('NewCustomerList')
    st_profile_report(profile)    
    st.download_button(
      'Download  Report',
      data=profile.to_html(),
        file_name = 'NewCustomerList.html',
      help='Click  to get you own insights!'
)
    
with tab3:
    profile = profiling_transaction('CustomerDemographic')
    st_profile_report(profile) 
    st.download_button(
      'Download  Report',
      data=profile.to_html(),
        file_name = 'CustomerDemographic.html',
      help='Click  to get you own insights!'
)    
with tab4:
    profile = profiling_transaction('CustomerAddress')
    st_profile_report(profile) 
    st.download_button(
      'Download  Report',
      data=profile.to_html(),
        file_name = 'CustomerAddress.html',
      help='Click  to get you own insights!'
)       
with tab5:
    '''
    ### General Data Issue
    1. **Data accuracy:**
    Inconsistencies and inaccuracies in the data. For data birth, a lot of record has date of birth Over 120 years old and the max one even have 174 years old.
    It seems that this table have long period historical data which is updated with time goes by, but without check the death situation.
    2. **Data completeness:** Some column in the dataset contain null values.
    3. **Data consistency:** Some tables have incorrect data types, for this demographic table, the DOB should be timestamp, but the checkresult shows that it contain some non-numeric value.(We need conduct some data cleaning and data type transformation before do visualization and ml)
    4. **Data timelines:** transaction dataset seems good, and it has no problems with data currency.
    5. **Data validity:** Some data points in the dataset are invalid, for example, one record in date of
    birth in the Customer Demographic sheet making them 174 years old, which is not be used, need go back to data source.
    6. **Data uniqueness:** By conduct quality checking, we could not find any duplicate data in the dataset.
        '''

    
    st.write('### Suggestions')
    st.write('- Regular Data Audits: Conducting periodic assessments to eliminate invalid data and ensure the uniqueness of the information.')
    st.write('- Clear Data Collection Guidelines: Establishing explicit protocols for collecting data.')
    st.write('- Cross-Checking Procedures: Implementing cross-referencing techniques between different data fields.')
    st.write('- Consistent Naming Conventions: Setting clear guidelines for naming data fields across the organization.')
    st.write('- Data Validation: Implementing checks to verify the accuracy of specific data types in designated columns.')
    




Overwriting Data_Quality.py


In [2]:
%%writefile pages/Insight.py
import streamlit as st
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
#st.set_page_config(layout="wide")
st.markdown("# Insight")
st.sidebar.markdown("Insight")

#Shit hole begin
transactions = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='Transactions',header=1)
NewCustomer = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='NewCustomerList',header=1)
Demographic  = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='CustomerDemographic')
CustomerAddress = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='CustomerAddress',header=1)


Transactions = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name= 'Transactions')
Transactions.columns = Transactions.iloc[0,:]
Transactions  = Transactions.iloc[1:,:]
Transactions.dropna(subset=['product_first_sold_date'], inplace=True)
Transactions.product_first_sold_date = Transactions.product_first_sold_date.apply(lambda x: datetime.fromtimestamp(x))
Transactions.transaction_date = Transactions.transaction_date.astype('string')
Transactions['transaction_month'] = Transactions.transaction_date.apply(lambda x: x[5:7])
Transactions.product_id = Transactions.product_id.astype('string')
product_summary = Transactions.groupby(['product_id'],as_index=False).aggregate(
            {'list_price':'mean',
             'brand':'first',
            'standard_cost':'mean',
            'transaction_id':'count'})
product_summary.sort_values(by=['list_price','standard_cost'],inplace=True,ascending=[False, True])
product_summary = product_summary.rename({'transaction_id': 'count'}, axis=1)
brands_name = Transactions.brand.unique()
chosen_brand = st.selectbox(
    'Choose brand here:',
    brands_name)

brand_fig, ax = plt.subplots(1,2,figsize=(16, 9))
ax[0].yaxis.tick_right()
sns.barplot(data=product_summary[product_summary.brand == chosen_brand], 
              x="list_price", 
              y="product_id", 
                  ax = ax[0],
                  color = 'green'
             )
sns.barplot(data=product_summary[product_summary.brand == chosen_brand], 
              x="standard_cost", 
              y="product_id", 
                  ax = ax[0],
                  color = 'orange'
                
             )
sns.barplot(data=product_summary[product_summary.brand == chosen_brand], 
              x="count", 
              y="product_id", 
                  ax = ax[1],
                  color = 'lightblue'
             )
ax[0].legend(['price','cost'],fontsize=14)
ax[0].invert_xaxis()
leg = ax[0].get_legend()
leg.legendHandles[0].set_color('green')
leg.legendHandles[1].set_color('orange')
ax[1].legend(['Count'], fontsize = 14)
leg1 = ax[1].get_legend()
leg1.legendHandles[0].set_color('lightblue')
st.pyplot(brand_fig)
# brand count
st.write('- Each brand has different marketing strategy and target populations, their sales statistics are hence different.')
st.write('- Understanding these difference is important when organizing any business activities.')

Tran_counts = Transactions.groupby(['brand'],as_index=False)['list_price'].count()
Tran_counts.columns = ['brand', 'count']
# brand revenue
Tran_brand = Transactions.groupby(['brand'],as_index=False)[['list_price','standard_cost']].sum()
Tran_brand['profit'] = Tran_brand['list_price'] - Tran_brand['standard_cost']
Tran_brand = Tran_brand.merge(Tran_counts, on = ['brand'])
Tran_brand['list_price_avg'] = Tran_brand['list_price'] / Tran_brand['count']
Tran_brand['profit_avg'] = Tran_brand['profit'] / Tran_brand['count']
Tran_brand['profit_rate'] = 1- Tran_brand['standard_cost']/Tran_brand['list_price']
st.dataframe(Tran_brand.style.highlight_max(axis=0, color = 'lightblue'))


#---------------Bing Start -----------------------------
#Brand Selling proportion
brand_sell = transactions.groupby('brand').count().reset_index().iloc[:,0:2]
brand_sell.columns = ['brand','sells_num']


labels = list(brand_sell['brand'])
data = list(brand_sell['sells_num'])

#Creating fig and ax
fig, ax = plt.subplots(figsize = (10,6))

#define Seaborn color palette to use
colors = sns.color_palette('pastel')[0:6]

#create pie chart
plt.pie(data, labels = labels, colors = colors, autopct='%.0f%%')
plt.tight_layout()

ax.set_title('')
st.write('## Brand Selling Proportion')
st.pyplot(fig)

#Insight
st.write('- Brands are cutting market evenly.')
st.write('- Solex is taking biggest cut with number of 21%, followed by Giant Bicycles and WeareA2B with 17%')

#---------------Bing End -----------------------------


#---------------Bing Online vs Brand Start -----------------------------

#Online order
st.write('## Brands Online Sells Analysis')
brand_online_sold = pd.DataFrame(transactions[transactions['online_order']==1.0]['brand'].value_counts()).reset_index()
brand_offline_sold = pd.DataFrame(transactions[transactions['online_order']==0]['brand'].value_counts()).reset_index()

cols = ['brand','online_sold','offline_sold']
brand_online_vs_offline = brand_online_sold.merge(brand_offline_sold,how='inner',on = 'index').set_axis(cols,axis=1)
st.dataframe(brand_online_vs_offline)

st.write('- Online shopping proportion showing similar patterns. All taking around 50% proportions.')
st.write('- Offline store is still taking a huge place under the impact of online shopping trend. Offline experience still cant fully replace by convinience.')

#---------------Bing Online vs Brand End -------------------------------



#---------------Bing Average Profit of brands in different class Start -------------------------------

st.write('## Average Profit of brands in different class')
transactions['profit'] = transactions['list_price'] - transactions['standard_cost']

profit = transactions.groupby(['brand','product_class']).mean()[['list_price','standard_cost','profit']]
#profit = transactions.groupby(['brand']).mean()[['profit']]

profit.reset_index(inplace=True)
my_order = ['low','medium','high']

profit['product_class'] = profit['product_class'].astype('category')
profit['product_class'].cat.reorder_categories(my_order, inplace= True)
profit.sort_values(['brand','product_class'])

fig, ax = plt.subplots(figsize = (10,6))
sns.barplot(data = profit, x = 'brand',y = 'profit',hue='product_class',)
plt.tight_layout()
st.pyplot(fig)
st.write('- OHM and Solex have amazing profit on low class products.')
st.write('- Trek and Weare A2B have advantages on middle class products.')
st.write('- Giant and Nocro are not competitive in profits compared to their opposite. With similar share of markets, company is making much less revenue.')



#---------------Bing Average Profit of brands in different class End -------------------------------




#---------------Bing Rich Customer Industry Start -------------------------------

st.write('## Valuable Customers Industry Source')
Rich_Ind = Demographic[(Demographic['wealth_segment'] == 'Affluent Customer')|(Demographic['wealth_segment'] == 'High Net Worth')].groupby('job_industry_category').count().reset_index().iloc[:,0:2]

labels = list(Rich_Ind['job_industry_category'])
data = list(Rich_Ind['customer_id'])

#define Seaborn color palette to use
colors = sns.color_palette('pastel')[0:10]

fig, ax = plt.subplots(figsize = (10,6))

#create pie chart
plt.pie(data, labels = labels, colors = colors, autopct='%.0f%%')
plt.tight_layout()

ax.set_title('')
st.title('')
st.pyplot(fig)

st.write('- Proportion of valuable customers is having similer pattern with all-customers industry pattern.')
st.write('- No strong evidence indicating that customers from certain areas having higher willingness to pay.')

#---------------Bing Rich Customer Industry End -------------------------------




# Customer payment count
CustomerAddress = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name= 'CustomerAddress')
CustomerAddress.columns = CustomerAddress.iloc[0,:]
CustomerAddress  = CustomerAddress.iloc[1:,:]
Customer_counts = Transactions.groupby(['customer_id'],as_index=False)['list_price'].count()
Customer_counts.columns = ['customer_id', 'bill_count']
# Customer payment revenue
Cutomer_brand = Transactions.groupby(['customer_id'],as_index=False)[['list_price','standard_cost']].sum()
Cutomer_brand['profit'] = Cutomer_brand['list_price'] - Cutomer_brand['standard_cost']
Cutomer_brand = Cutomer_brand.merge(Customer_counts, on = ['customer_id'])
Cutomer_brand['list_price_avg'] = Cutomer_brand['list_price'] / Cutomer_brand['bill_count']
Cutomer_brand['profit_avg'] = Cutomer_brand['profit'] / Cutomer_brand['bill_count']
Cutomer_brand['profit_rate'] = 1- Cutomer_brand['standard_cost']/Cutomer_brand['list_price']
Cutomer_brand = Cutomer_brand.merge(CustomerAddress[['customer_id', 'property_valuation']], on = 'customer_id')

#Verbal
st.write('')

Cutomer_brand
st.write('- Pay closely attention to customer purchase frequency as we want to keep our customers and expand their consumptions.')
fig, ax = plt.subplots(figsize = (10,6))
Cutomer_brand.hist(['bill_count'] , ax = ax)
ax.set_title('')
ax.set_xlabel('Frequency')
ax.set_ylabel('Customer counts')
st.write('## Purchase Frequency Counts')
st.pyplot(fig)
st.write('- Most customers bought 5 products for this year. Probably indicating high user stickness')
fig1, ax1 = plt.subplots(figsize = (10,6))
Cutomer_brand.plot.scatter(['list_price_avg'], ['profit_avg'], 
                           figsize = (16,9),
                           c = Cutomer_brand.property_valuation,
                           ax = ax1
                           )
ax1.set_title('')
ax1.set_xlabel('Average Purchase Price')
ax1.set_ylabel('Average Profit')
st.write('## Price versus Profit')
st.pyplot(fig1)
st.write('- The deeper the color, the richer the customer.')
st.write('- Part of luxury products having over 75% profit rate.')
st.write('- Most purchase lie in range between 750 and 1500, which should be comfort zone for most buyer.')
st.write('- No significant evidence showing wealthy customers prefer more expensive products.')


Overwriting pages/Insight.py


In [5]:
%%writefile pages/Introduction.py
import streamlit as st
import pandas as pd
import numpy as np

st.markdown("# KPMG Data Analysis Platform 🎉")
st.sidebar.markdown("Contact & Controller🎉")
import streamlit as st



# Add a selectbox to the sidebar:
add_selectbox = st.sidebar.selectbox(
    'How would you like to be contacted?',
    ('Email', 'Home phone', 'Mobile phone')
)

st.sidebar.text_area('Contact Infomation', 
                      'Please leave your contact information on here! You would get compelete report!!')

# add mugshot to sidebar
# mugshot = st.sidebar.camera_input(
#   '## Create your mugshot for your own Report'
# )
# Add a slider to the sidebar:
# add_slider = st.sidebar.slider(
#     'Select a range of values',
#     0.0, 100.0, (25.0, 75.0)
# )
#####################Main page###

tab1, tab2, tab3 = st.tabs(["Why us?" , "Solutions & Anticipations", "About this Platform"])

with tab1:
    col1, col2 = st.columns(2,gap = "medium")
    with col1:
       st.image('src/IMG_5301 2.JPG',width = 400)

    with col2:
       st.markdown('## Why Us?')
       '''
       1. User-Friendly Interface: The platform features a user-friendly interface that allows users to easily visualize, manipulate, and explore their data, without requiring specialized technical skills.

       2. Advanced Analytics: The platform includes advanced analytics capabilities, such as machine learning algorithms, predictive modeling, and statistical analysis, allowing users to uncover insights and make data-driven decisions.
       
       3. Scalability and Security: The platform is designed to be scalable and secure, ensuring that it can accommodate growing amounts of data and protect sensitive information.
       '''
    
with tab2:
    
    col1, col2 = st.columns(2)
    with col1:
       st.markdown('## Solutions & Anticipations')
       '''
        1. **Deep understanding your data profile!**
            KYC,KYB and Know your data!
            
        2. **Check your data Quality!**
            Ensure your data quality from 6 dimensions and not be deceived!
            
        3. **Powerful Analytics tools!**
           Analyze your data set in multiple dimensions and give you the most comprehensive advice！
           
        4. **Intelligent Suggestion! **
            Intelligently provide valuable insights for your preprocessing procedure.
            
        5. **Visualization and Dashboard!**
            Quick, colorful, informative dashboard to let you aim your target users.
        '''

    with col2:
       st.image('src/IMG_5301 2.JPG')
      


with tab3:
    
    st.image("https://static.streamlit.io/examples/cat.jpg")

    '''
    Your personal Report is here
    '''
#     st.image(mugshot, width=300)





# @st.experimental_memo
# def load_data(url):
#     df = pd.read_csv(url)
#     return df

# df = load_data("https://github.com/plotly/datasets/raw/master/uber-rides-data1.csv")
# st.dataframe(df)

# st.button("Rerun")

Overwriting pages/Introduction.py


In [12]:
%%writefile environment.yml
name: STEnv
dependencies:
  - pandas-profiling
  - numpy
  - pandas
  - matplotlib
  - seaborn
  - openpyxl
  - pip
  - pip:
    - streamlit-pandas-profiling


Overwriting environment.yml


In [3]:
!streamlit run Data_Quality.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.1.158:8501[0m
[0m
2023-02-11 11:19:30.759 Pandas backend loaded 1.4.2
2023-02-11 11:19:30.763 Numpy backend loaded 1.21.5
2023-02-11 11:19:30.763 Pyspark backend NOT loaded
2023-02-11 11:19:30.763 Python backend loaded
  NewCustomer = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='NewCustomerList',header=1)
  Demographic  = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='CustomerDemographic')
  NewCustomer = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='NewCustomerList',header=1)
  Demographic  = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='CustomerDemographic')
  profit['product_class'].cat.reorder_categories(my_order, inplace= True)
  NewCustomer = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='NewCustomerList',