In [20]:
%%writefile Data_Quality.py
import streamlit as st
import pandas as pd
import os
from datetime import datetime
import seaborn as sns
from pathlib import Path
from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_zipped_file
from streamlit_pandas_profiling import st_profile_report


# Page heading, shown in the main area and repeated in the sidebar.
page_heading = "# Data Quality Check"
st.markdown(page_heading)
st.sidebar.markdown(page_heading)

# Four tabs for the per-sheet profiling reports plus one for written findings.
tab_labels = ["Transaction", "New Customer", "Customer Demo", 'CustomerAddress', 'Takeaways']
tab1, tab2, tab3, tab4, tab5 = st.tabs(tab_labels)

# Disabled progress-bar prototype (re-enabling it would also need `import time`):
# progress_text = "Operation in progress. Please wait."
# my_bar = st.progress(0, text=progress_text)
# for percent_complete in range(100):
#     time.sleep(0.1)
#     my_bar.progress(percent_complete + 1, text=progress_text)

# NOTE(review): this banner is emitted unconditionally at this point of the
# script, i.e. before any of the profiling calls further down have run.
st.success('Your DataOverview Report is Completed', icon="✅")


@st.cache_resource
def profiling_transaction(sheet):
    """Build a pandas-profiling report for one sheet of the KPMG workbook.

    Parameters
    ----------
    sheet : str
        Name of the worksheet to read from
        ``KPMG_VI_New_raw_data_update_final.xlsx``.

    Returns
    -------
    ProfileReport
        Explorative report whose title names the profiled sheet. The result is
        cached by Streamlit (``st.cache_resource``) per ``sheet`` value.
    """
    df = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name=sheet)
    if sheet != 'CustomerDemographic':
        # For every other sheet the first data row is promoted to the header,
        # removed from the data, and columns whose promoted name is NaN are
        # discarded.
        df.columns = df.iloc[0, :]
        df = df.iloc[1:, :]
        df = df.loc[:, ~df.columns.isna()]

    # The title used to be hard-coded to the Transaction sheet, so all four
    # tabs showed the same (wrong) heading; derive it from the sheet instead.
    profile = ProfileReport(
        df, title=f"Profile Report of the {sheet} Sheet", explorative=True
    )
    return profile

# Render one profiling report inside each of the four data tabs.
tab_sheet_pairs = (
    (tab1, 'Transactions'),
    (tab2, 'NewCustomerList'),
    (tab3, 'CustomerDemographic'),
    (tab4, 'CustomerAddress'),
)
for sheet_tab, sheet_name in tab_sheet_pairs:
    with sheet_tab:
        profile = profiling_transaction(sheet_name)
        st_profile_report(profile)
    
with tab5:
    # Bare string expression: written to the page by Streamlit's "magic"
    # handling of stand-alone expressions.
    '''
    ### Takeaway & Insights
    1. **Data accuracy:**
    Inconsistencies and inaccuracies in the data. For data birth, a lot of record has date of birth Over 120 years old and the max one even have 174 years old.
    It seems that this table have long time historical data which is updated with time goes by, but without check the death situation.
    2. **Data completeness:** Some column in the dataset where contains null values.(It seems need data cleaning)
    
    3. **Data consistency:** Some tables have incorrect data types, for this demographic table, the DOB should be timestamp, but the checkresult shows that it contain some non-numeric value.(We need conduct some data cleaning and data type transformation before do visualization and ml)
    4. **Data timelines:** transaction dataset seems good, and it has no problems with data currency.
    5. **Data validity:** Some data points in the dataset are invalid, for example, one record in date of
    birth in the Customer Demographic sheet making them 174 years old, which is not be used, need go back to data source.
    6. **Data uniqueness:** By conduct quality checking, we could not find any duplicate data in the dataset.
        '''

    st.write('### Suggestions')
    # Each recommendation is written as its own element, in this order.
    suggestion_bullets = (
        '- Regular Data Audits: Conducting periodic assessments to eliminate invalid data and ensure the uniqueness of the information.',
        '- Clear Data Collection Guidelines: Establishing explicit protocols for collecting data.',
        '- Cross-Checking Procedures: Implementing cross-referencing techniques between different data fields.',
        '- Consistent Naming Conventions: Setting clear guidelines for naming data fields across the organization.',
        '- Data Validation: Implementing checks to verify the accuracy of specific data types in designated columns.',
    )
    for bullet in suggestion_bullets:
        st.write(bullet)
    


# NOTE(review): the downloaded payload is a fixed placeholder string, not one
# of the generated profile reports.
st.download_button(
    label='Download  Report',
    data='This is some text',
    help='Click  to get you own  insights!',
)

Overwriting main_page.py


In [12]:
%%writefile pages/page_2.py
import streamlit as st
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
st.markdown("# Page 2 ❄️")
st.sidebar.markdown("Page 2")

# ---------------- Data loading ----------------
WORKBOOK_PATH = 'KPMG_VI_New_raw_data_update_final.xlsx'

# Open the workbook once and parse every sheet from the same handle instead of
# re-opening and re-parsing the file for each individual read.
with pd.ExcelFile(WORKBOOK_PATH) as workbook:
    # header=1 takes the column names from the second spreadsheet row.
    transactions = workbook.parse('Transactions', header=1)
    NewCustomer = workbook.parse('NewCustomerList', header=1)
    Demographic = workbook.parse('CustomerDemographic')
    CustomerAddress = workbook.parse('CustomerAddress', header=1)
    # Same sheet parsed with the default header; its first data row is
    # promoted to column names below.
    transactions_raw = workbook.parse('Transactions')

# Promote the first data row to column names and drop it from the data.
# Working on an explicit copy and reassigning (rather than inplace/attribute
# mutation of a slice) avoids pandas chained-assignment warnings.
Transactions = transactions_raw.iloc[1:, :].copy()
Transactions.columns = transactions_raw.iloc[0, :]
Transactions = Transactions.dropna(subset=['product_first_sold_date'])

# NOTE(review): the values are interpreted as Unix timestamps (seconds, local
# time). If the sheet actually stores Excel serial day numbers this yields
# dates in early 1970 - confirm against the source data.
Transactions['product_first_sold_date'] = Transactions['product_first_sold_date'].apply(
    datetime.fromtimestamp
)

# Characters 5-6 of the stringified date are used as the month; the .str
# accessor keeps missing values as <NA> instead of raising.
Transactions['transaction_date'] = Transactions['transaction_date'].astype('string')
Transactions['transaction_month'] = Transactions['transaction_date'].str[5:7]




# ---------------- Per-brand sales summary ----------------
st.write('- Each brand has different marketing strategy and target populations, their sales statistics are hence different.')
st.write('- Understanding these difference is important when organizing any business activities.')

sales_by_brand = Transactions.groupby(['brand'], as_index=False)

# Number of transactions recorded for each brand.
brand_order_counts = (
    sales_by_brand['list_price']
    .count()
    .set_axis(['brand', 'count'], axis=1)
)

# Revenue and cost totals per brand, then profit, per-order averages and margin.
Tran_brand = (
    sales_by_brand[['list_price', 'standard_cost']]
    .sum()
    .assign(profit=lambda totals: totals['list_price'] - totals['standard_cost'])
    .merge(brand_order_counts, on=['brand'])
    .assign(
        list_price_avg=lambda totals: totals['list_price'] / totals['count'],
        profit_avg=lambda totals: totals['profit'] / totals['count'],
        profit_rate=lambda totals: 1 - totals['standard_cost'] / totals['list_price'],
    )
)

# Highlight the column-wise maximum of every column in the rendered table.
brand_table_style = Tran_brand.style.highlight_max(axis=0, color='lightblue')
st.dataframe(brand_table_style)


#--------------- Brand selling proportion: start ---------------
# Row count per brand: after reset_index the first two columns are the brand
# and the count taken from the first remaining column.
brand_sell = (
    transactions.groupby('brand')
    .count()
    .reset_index()
    .iloc[:, 0:2]
    .set_axis(['brand', 'sells_num'], axis=1)
)

labels = list(brand_sell['brand'])
data = list(brand_sell['sells_num'])

# Figure and axes for the pie chart.
fig, ax = plt.subplots(figsize=(10, 6))

# First six colours of seaborn's pastel palette.
colors = sns.color_palette('pastel')[:6]

# Draw the pie on the axes created above, with whole-number percentages.
ax.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
fig.tight_layout()

# The chart heading comes from Streamlit, so the axes title is left empty.
ax.set_title('')
st.title('Brand Selling Proportion')
st.pyplot(fig)

# Insight
st.write('- Brands are cutting market evenly. Solex is taking biggest cut with number of 21%, followed by Giant Bicycles and WeareA2B.')

#--------------- Brand selling proportion: end ---------------


#--------------- Online vs offline sales per brand: start ---------------

st.write('## Brands Online Sells Analysis')


def _brand_order_counts(online_flag, count_column):
    """Count transactions per brand for one ``online_order`` value.

    Returns a two-column frame ``['brand', count_column]`` ordered by
    descending count. The index is named explicitly and the count column is
    named through ``reset_index(name=...)`` so the column labels do not depend
    on the pandas version (``value_counts`` changed its result/index names in
    pandas 2.0, which made the previous ``merge(on='index')`` raise KeyError).
    """
    selected_brands = transactions.loc[transactions['online_order'] == online_flag, 'brand']
    return (
        selected_brands.value_counts()
        .rename_axis('brand')
        .reset_index(name=count_column)
    )


brand_online_sold = _brand_order_counts(1.0, 'online_sold')
brand_offline_sold = _brand_order_counts(0, 'offline_sold')

# Inner join keeps brands present in both channels, in online-count order.
brand_online_vs_offline = brand_online_sold.merge(brand_offline_sold, how='inner', on='brand')
st.dataframe(brand_online_vs_offline)

st.write('For all the brands, online proportion and offline are closed, which proves the importance of off line store in Bike market.')

#--------------- Online vs offline sales per brand: end ---------------

# ---------------- Customer purchase analysis ----------------
# Re-read the address sheet with the default header, then promote its first
# data row to column names and drop that row from the data.
address_sheet = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='CustomerAddress')
CustomerAddress = address_sheet.iloc[1:, :].set_axis(address_sheet.iloc[0, :], axis=1)

purchases_by_customer = Transactions.groupby(['customer_id'], as_index=False)

# Number of transactions per customer.
Customer_counts = (
    purchases_by_customer['list_price']
    .count()
    .set_axis(['customer_id', 'bill_count'], axis=1)
)

# Revenue/cost totals per customer, derived profit metrics, and the
# property valuation joined in from the address sheet.
customer_summary = (
    purchases_by_customer[['list_price', 'standard_cost']]
    .sum()
    .assign(profit=lambda totals: totals['list_price'] - totals['standard_cost'])
    .merge(Customer_counts, on=['customer_id'])
    .assign(
        list_price_avg=lambda totals: totals['list_price'] / totals['bill_count'],
        profit_avg=lambda totals: totals['profit'] / totals['bill_count'],
        profit_rate=lambda totals: 1 - totals['standard_cost'] / totals['list_price'],
    )
    .merge(CustomerAddress[['customer_id', 'property_valuation']], on='customer_id')
)

st.write('- Pay closely attention to customer purchase frequency as we want to keep our customers and expand their consumptions.')

# Histogram of how many purchases each customer made.
fig, ax = plt.subplots(figsize=(10, 6))
customer_summary.hist(['bill_count'], ax=ax)
ax.set_title('')
st.title('Purchase Frequency Counts')
st.pyplot(fig)

# Average list price against average profit, coloured by property valuation.
fig1, ax1 = plt.subplots(figsize=(10, 6))
customer_summary.plot.scatter(
    ['list_price_avg'],
    ['profit_avg'],
    figsize=(16, 9),
    c=customer_summary.property_valuation,
    ax=ax1,
)
ax1.set_title('')
st.title('Price versus Profit')
st.pyplot(fig1)

Overwriting pages/page_2.py


In [19]:
%%writefile pages/page_3.py
import streamlit as st
import pandas as pd
import numpy as np


# Page heading in the main area and the matching sidebar entry.
st.markdown("# Page 3 🎉")
st.sidebar.markdown("Page 3 🎉")

# Sidebar select box offering three contact options.
add_selectbox = st.sidebar.selectbox(
    label='How would you like to be contacted?',
    options=('Email', 'Home phone', 'Mobile phone'),
)

# Sidebar range slider over 0-100, starting at the 25-75 interval.
add_slider = st.sidebar.slider(
    label='Select a range of values',
    min_value=0.0,
    max_value=100.0,
    value=(25.0, 75.0),
)

# st.cache_data is the current name of the deprecated st.experimental_memo and
# belongs to the same Streamlit API generation as the st.cache_resource
# decorator used by the data-quality page.
@st.cache_data
def load_data(url):
    """Read a CSV file into a DataFrame, cached per ``url``.

    Parameters
    ----------
    url : str
        Location passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    df = pd.read_csv(url)
    return df

# Load the sample rides CSV through the cached loader and display it.
rides_csv_url = "https://github.com/plotly/datasets/raw/master/uber-rides-data1.csv"
df = load_data(rides_csv_url)
st.dataframe(df)

st.button("Rerun")

Overwriting pages/page_3.py


In [12]:
%%writefile environment.yml
name: STEnv
dependencies:
  - pandas-profiling
  - numpy
  - pandas
  - matplotlib
  - seaborn
  - openpyxl
  - pip
  - pip:
    # Every script imports streamlit directly, so declare it explicitly
    # instead of relying on it as a transitive dependency. 1.18 is the first
    # release that provides st.cache_resource, which Data_Quality.py uses.
    - streamlit>=1.18
    - streamlit-pandas-profiling


Overwriting environment.yml


In [2]:
!streamlit run Data_Quality.py

'streamlit' is not recognized as an internal or external command,
operable program or batch file.
