In [2]:
# We can use plotly, matplotlib and seaborn to do graphics in Python.
# In module_2 we started to do some plotting using seaborn.

# In the GitHub Terminal area, type "pip install statsmodels" <return>
# This installs something in GitHub that we refer to in this next line of code
import statsmodels as sm
import pandas as pd
import plotly.express as px
# the 'r' below means "read every character as the character it is"
# it is good practice to use the 'r' even when we don't need it
filepath = r'https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/1980%202023%20average%20house%20prices.csv'
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,Name,Period,House price index All property types,Average price All property types,Percentage change (monthly) All property types,Percentage change (yearly) All property types
0,United Kingdom,1980-01,10.11,19273,3.94,28.59
1,United Kingdom,1980-02,10.11,19273,3.94,28.59
2,United Kingdom,1980-03,10.11,19273,3.94,28.59
3,United Kingdom,1980-04,10.51,20044,4.0,24.15
4,United Kingdom,1980-05,10.51,20044,4.0,24.15


In [3]:
df['Period'] = pd.to_datetime(df['Period'], format="%Y-%m", errors='coerce')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 6 columns):
 #   Column                                          Non-Null Count  Dtype         
---  ------                                          --------------  -----         
 0   Name                                            523 non-null    object        
 1   Period                                          523 non-null    datetime64[ns]
 2   House price index All property types            523 non-null    float64       
 3   Average price All property types                523 non-null    int64         
 4   Percentage change (monthly) All property types  523 non-null    float64       
 5   Percentage change (yearly) All property types   523 non-null    float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 24.6+ KB


In [None]:
# with the px.scatter command you define the data source, x axis field, y axis field
scatter_1 = px.scatter(df,
                       x='Period',
                       y='Average price All property types')
scatter_1.show()

In [None]:
scatter_2 = px.scatter(df,
                       x='Period',
                       y='Percentage change (monthly) All property types',
                       title='Monthly blah blah',
                       labels={'Period':'Time period',
                                'Percentage change (monthly) All property types':'Monthly price change (percent)'},
                                trendline='ols')

scatter_2.show()

In [None]:
filepath = r'https://raw.githubusercontent.com/data-to-insight/D2I-Jupyter-Notebook-Tools/main/ml-data%20science%20tutorials/data/CINdetails.csv'

df = pd.read_csv(filepath)

df['CINreferralDate'] = pd.to_datetime(df['CINreferralDate'], format="%d/%m/%Y", errors='coerce')
df['CINclosureDate'] = pd.to_datetime(df['CINclosureDate'], format="%d/%m/%Y", errors='coerce')
df['DateOfInitialCPC'] = pd.to_datetime(df['DateOfInitialCPC'], format="%d/%m/%Y", errors='coerce')

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Unnamed: 0        101 non-null    int64         
 1   LAchildID         101 non-null    object        
 2   CINdetailsID      101 non-null    int64         
 3   CINreferralDate   101 non-null    datetime64[ns]
 4   ReferralSource    101 non-null    object        
 5   PrimaryNeedCode   101 non-null    object        
 6   CINclosureDate    63 non-null     datetime64[ns]
 7   ReasonForClosure  63 non-null     object        
 8   DateOfInitialCPC  85 non-null     datetime64[ns]
 9   ReferralNFA       101 non-null    bool          
dtypes: bool(1), datetime64[ns](3), int64(2), object(4)
memory usage: 7.3+ KB


In [None]:
hist_1 = px.histogram(df,
x='PrimaryNeedCode')
hist_1.show()

In [None]:
hist_2a = px.histogram(df,
                    x='PrimaryNeedCode',
                    color='ReferralSource')
hist_2a.show()

In [None]:
hist_2b = px.histogram(df,
                    x='PrimaryNeedCode',
                    color='ReferralSource',
                    color_discrete_sequence=px.colors.qualitative.Light24)
hist_2b.show()

In [None]:
hist_2c = px.histogram(df,
                    x='PrimaryNeedCode',
                    color='ReferralSource',
                    color_discrete_sequence=px.colors.qualitative.Light24,
                    barmode='group')
hist_2c.show()

In [None]:
import numpy as np
df['CPCTimeliness'] = df['DateOfInitialCPC'] - df['CINreferralDate']
# 'CPCTimeliness' needs to be reformatted, so that the field can be used in graphics work
df['CPCTimeliness'] = df['CPCTimeliness']/np.timedelta64(1, 'D')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Unnamed: 0        101 non-null    int64         
 1   LAchildID         101 non-null    object        
 2   CINdetailsID      101 non-null    int64         
 3   CINreferralDate   101 non-null    datetime64[ns]
 4   ReferralSource    101 non-null    object        
 5   PrimaryNeedCode   101 non-null    object        
 6   CINclosureDate    63 non-null     datetime64[ns]
 7   ReasonForClosure  63 non-null     object        
 8   DateOfInitialCPC  85 non-null     datetime64[ns]
 9   ReferralNFA       101 non-null    bool          
 10  CPCTimeliness     85 non-null     float64       
dtypes: bool(1), datetime64[ns](3), float64(1), int64(2), object(4)
memory usage: 8.1+ KB


In [None]:
time_hist = px.histogram(df,
                         x='CPCTimeliness',
                         nbins=20,
                         color='ReferralSource')
time_hist.show()     

In [None]:
df['ReferralYear'] = df['CINreferralDate'].dt.year

time_box = px.box(df,
                  x='ReferralYear',
                  y='CPCTimeliness')

time_box.show()

In [None]:
time_box_2 = px.box(df,
                    y='CPCTimeliness',
                    color='ReferralSource')

time_box_2.show()

In [None]:
marginal = px.scatter(df,
                        x='CINreferralDate',
                        y='CPCTimeliness',
                        marginal_x='violin',
                        marginal_y='box',
                        #trendline='ols',
                        color='PrimaryNeedCode')
marginal.show()

In [None]:
# in this example, Will demonstrates some axis labels that are not showing correctly
facet = px.histogram(df,
                    x='PrimaryNeedCode',
                    facet_row='ReferralYear')
facet.show()

In [None]:
# sankey diagram
df['ReasonForClosure'] = df['ReasonForClosure'].fillna('Still Open')

parallel_cats = px.parallel_categories(df[['ReferralSource', 'PrimaryNeedCode', 'ReasonForClosure']])

parallel_cats.show()

In [None]:
time_df = df[df['CPCTimeliness'].notna()]

time_df = time_df[time_df['ReferralYear'] > 2010]

In [None]:
# the size code line was wrecking this.
# The px.scatter command could not work with the format of df['CPCTimeliness'] which was timedelta64[ns]
# when creating the field 'CPCTimeliness' we have to change the format to float64 e.g.
# df['CPCTimeliness'] = df['CPCTimeliness']/np.timedelta64(1, 'D')
scatter_3a = px.scatter(time_df,
                        x='CINreferralDate',
                        y='PrimaryNeedCode',                       
                        color='ReferralSource',
                        size='CPCTimeliness')
scatter_3a.show()

In [None]:
time_df = df[df['CPCTimeliness'].notna()]

time_df = time_df[time_df['ReferralYear'] > 2010]

scatter_3b = px.scatter(time_df,
                       x='CINreferralDate',
                       y='PrimaryNeedCode',
                       size='CPCTimeliness',
                       color='ReferralSource',
                       range_x=[pd.to_datetime(2015, format='%Y'),pd.to_datetime(2020, format='%Y')],
                       )
scatter_3b.show()

In [5]:
df = px.data.gapminder()

fig = px.scatter(df,
                 x='gdpPercap',
                 y='lifeExp',
                 log_x=True,
                 hover_name='country',
                 color='continent',
                 size='pop',
                 facet_col='continent',
                 facet_col_wrap=2,
                 size_max=45,
                 animation_frame='year',
                 animation_group='country',
                 range_x=[100,120000],
                 range_y=[25,100])

fig.show()

NameError: name 'px' is not defined