<a href="https://colab.research.google.com/github/BragatteMAS/Python/blob/master/How_to/How_to_Altair_Viz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

How to use [**Altair** for simple **Viz**](https://towardsdatascience.com/3-examples-to-show-python-altair-is-more-than-a-data-visualization-library-700e37cbf95b) like Pandas

        created by @bragatte 202103151325

In [1]:
import altair as alt
import numpy as np
import pandas as pd
from altair import datum

In [None]:
# Load only the columns used by the charts below and keep a 1,000-row sample.
cols = ['Type', 'Price', 'Distance', 'Date', 'Landsize', 'Regionname']
melb = (
    pd.read_csv("/content/melb_data.csv", usecols=cols, parse_dates=['Date'])
    # A fixed random_state makes "Restart & Run All" draw the same rows each time.
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
)
melb.head()

Unnamed: 0,Type,Price,Date,Distance,Landsize,Regionname
0,u,505000.0,2016-06-27,7.5,0.0,Southern Metropolitan
1,u,630000.0,2017-05-20,4.5,0.0,Southern Metropolitan
2,h,3010000.0,2016-10-09,3.4,382.0,Northern Metropolitan
3,u,600000.0,2017-03-09,6.4,2283.0,Southern Metropolitan
4,h,677500.0,2017-08-04,14.9,245.0,Northern Metropolitan


In [None]:
# Bar chart of the mean Price per Regionname; the aggregation is done by
# Altair's transform_aggregate rather than by pandas.
(
    alt.Chart(melb, height=300, width=500)
    .mark_bar()
    .encode(x='Regionname', y='avg_price:Q')
    .transform_aggregate(avg_price='mean(Price)', groupby=['Regionname'])
)

In [None]:
# Same bar chart, but rows are filtered to Distance > 3 before the mean
# is computed (the filter transform runs ahead of the aggregate).
(
    alt.Chart(melb)
    .mark_bar()
    .encode(x='Regionname', y='avg_price:Q')
    .transform_filter(alt.FieldGTPredicate(field='Distance', gt=3))
    .transform_aggregate(avg_price='mean(Price)', groupby=['Regionname'])
    .properties(height=300, width=500)
)

In [None]:
# Tag every sampled row with a synthetic owner id (1..N). The id range is
# derived from the frame itself instead of a hard-coded 1000.
n_owners = len(melb)
melb['OwnerId'] = np.arange(1, n_owners + 1)

# Seeded generator so the synthetic owner table is identical on every run.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'OwnerId': melb['OwnerId'],
    # Upper bounds are exclusive, matching np.random.randint.
    'Age': rng.integers(20, 40, size=n_owners),
    'Salary': rng.integers(5000, 10000, size=n_owners),
})
df.head()

Unnamed: 0,OwnerId,Age,Salary
0,1,24,7795
1,2,31,7697
2,3,33,8259
3,4,30,6290
4,5,24,9403


In [None]:
# Mean Salary per house Type. `df` has no Type column, so it is pulled in
# from `melb` through a lookup on the shared OwnerId key.
(
    alt.Chart(df)
    .mark_bar()
    .encode(x='mean(Salary):Q', y='Type:O')
    .transform_lookup(
        lookup='OwnerId',
        from_=alt.LookupData(data=melb, key='OwnerId', fields=['Type']),
    )
    .properties(height=300, width=500)
)

## [Altair: Statistical Visualization Library for Python](https://towardsdatascience.com/altair-statistical-visualization-library-for-python-cfb63847c0c0)

In [2]:
# Read the insurance dataset and preview its first five rows.
insurance = pd.read_csv(filepath_or_buffer="/content/insurance.csv")
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Scatter plot

In [3]:
# Plain scatter plot: charges on x, bmi on y.
(
    alt.Chart(insurance)
    .mark_circle(size=40)
    .encode(x='charges', y='bmi')
    .properties(height=400, width=500)
)

### Interactive option

In [4]:
# Scatter plot coloured by smoker; .interactive() is applied last.
(
    alt.Chart(insurance)
    .mark_circle(size=50)
    .encode(x='charges', y='bmi', color='smoker')
    .properties(height=400, width=500)
    .interactive()
)

### Interactive with Hover option 

In [6]:
# Interactive scatter plot with the age and sex columns passed as tooltip fields.
(
    alt.Chart(insurance)
    .mark_circle(size=50)
    .encode(
        x='charges',
        y='bmi',
        color='smoker',
        tooltip=['age', 'sex'],
    )
    .properties(height=400, width=500)
    .interactive()
)

### Bar plot


In [7]:
# Bar chart of mean charges per region, using the aggregate shorthand on y.
(
    alt.Chart(insurance)
    .mark_bar()
    .encode(x='region', y='mean(charges):Q')
    .properties(height=300, width=400)
)

In [8]:
# Longhand form of the `y='mean(charges):Q'` shorthand used in the bar chart.
# It describes the y channel, so it must be built with alt.Y (not alt.X).
y = alt.Y(field='charges', aggregate='mean', type='quantitative')


In [9]:
# Cross-check of the bar chart: mean charges per region computed with pandas.
insurance.groupby('region')[['charges']].mean()


Unnamed: 0_level_0,charges
region,Unnamed: 1_level_1
northeast,13406.384516
northwest,12417.575374
southeast,14735.411438
southwest,12346.937377


### Histogram

In [10]:
# Histogram of bmi: x is binned and y counts the rows in each bin.
(
    alt.Chart(insurance)
    .mark_bar()
    .encode(alt.X('bmi:Q', bin=True), y='count()')
    .properties(height=300, width=500)
)

### Grid of plots


In [11]:
# p1: mean charges per region (bar chart).
p1 = (
    alt.Chart(insurance)
    .mark_bar()
    .encode(x='region', y='mean(charges):Q')
    .properties(height=200, width=300)
)

# p2: histogram of bmi.
p2 = (
    alt.Chart(insurance)
    .mark_bar()
    .encode(alt.X('bmi:Q', bin=True), y='count()')
    .properties(height=200, width=300)
)

In [12]:
# Horizontal concatenation (the function form of `p1 | p2`).
alt.hconcat(p1, p2)


In [13]:
# Vertical concatenation (the function form of `p1 & p2`).
alt.vconcat(p1, p2)


## [Altair: Statistical Visualization Library for Python (Part 2)](https://towardsdatascience.com/altair-statistical-visualization-library-for-python-part-2-4c8ce134e743)

In [14]:
# Read the direct-marketing dataset and preview its first five rows.
marketing = pd.read_csv(filepath_or_buffer="/content/DirectMarketing.csv")
marketing.head(n=5)

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent
0,Old,Female,Own,Single,Far,47500,0,High,6,755
1,Middle,Male,Rent,Single,Close,63600,0,High,6,1318
2,Young,Female,Rent,Single,Close,13500,0,Low,18,296
3,Middle,Male,Own,Married,Close,85600,1,High,18,2436
4,Middle,Female,Own,Single,Close,68400,0,High,12,1304


In [15]:
# Salary vs. AmountSpent, coloured by Age.
(
    alt.Chart(marketing)
    .mark_circle(size=50)
    .encode(
        x='Salary',
        y='AmountSpent',
        color='Age',
    )
)

Filter data

    It is possible to filter data while creating a visualization. For instance, we can only plot the data points for which the salary is less than 120000.

In [16]:
# Same scatter plot, keeping only rows with Salary < 120000 via a field predicate.
(
    alt.Chart(marketing)
    .mark_circle(size=50)
    .encode(x='Salary', y='AmountSpent', color='Age')
    .transform_filter(
        alt.FieldLTPredicate(field='Salary', lt=120000)
    )
    .properties(height=400, width=500)
)

    The same filtering operation can also be done by using the datum module of Altair. It is simpler in terms of the syntax. The following code will create the same plot as above.

In [17]:
# Same Salary < 120000 filter, written as a `datum` expression.
# `datum` comes from the notebook's import cell (`from altair import datum`)
# so that all imports live in one place.
(
    alt.Chart(marketing)
    .mark_circle(size=50)
    .encode(x='Salary', y='AmountSpent', color='Age')
    .transform_filter(datum.Salary < 120000)
    .properties(height=400, width=500)
)

    Specify a condition for filtering based on a categorical column. For instance, the data points that belong to a set of discrete values can be filtered using the FieldOneOfPredicate method.

In [18]:
# Keep only rows whose Children value is one of 0, 2 or 3.
(
    alt.Chart(marketing)
    .mark_circle(size=50)
    .encode(x='Salary', y='AmountSpent', color='Age')
    .transform_filter(
        alt.FieldOneOfPredicate(field='Children', oneOf=[0, 2, 3])
    )
    .properties(height=400, width=500)
)

    Two plots:
        One will be a scatter plot that consists of the salary and amount spent columns. The other one will be a bar plot that shows the average salary for the categories in the age column. The second plot will also be used as a filter for the first plot.

    The averages are calculated by applying the following transformation in the encode function:
`y='mean(Salary):Q'`



In [19]:
# Multi-selection on the Age field, shared by both charts below.
selection = alt.selection_multi(fields=['Age'])

# Scatter plot that shows only the rows matching the current selection.
first = (
    alt.Chart()
    .mark_circle(size=50)
    .encode(x='Salary', y='AmountSpent')
    .transform_filter(selection)
    .properties(height=300, width=500)
)

# Bar chart of mean Salary per Age; it owns the selection, and bars outside
# the selection are drawn in light gray.
second = (
    alt.Chart()
    .mark_bar()
    .encode(
        x='Age:O',
        y='mean(Salary):Q',
        color=alt.condition(
            selection,
            alt.value('steelblue'),
            alt.value('lightgray'),
        ),
    )
    .properties(height=300, width=300)
    .add_selection(selection)
)

# Neither chart was given data above, so it is supplied at the concat level.
alt.hconcat(first, second, data=marketing)

## [Altair: Statistical Visualization Library for Python (Part 3)](https://towardsdatascience.com/altair-statistical-visualization-library-for-python-part-3-c1e650a8411e)

In [21]:
# Load the columns used in this section and keep a 1,000-row sample.
cols = [
    'Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status',
    'Credit_Limit', 'Total_Trans_Amt', 'Total_Trans_Ct',
]
churn = (
    pd.read_csv("/content/BankChurners.csv", usecols=cols)
    # A fixed random_state makes "Restart & Run All" draw the same rows each time.
    .sample(n=1000, random_state=42)
)
churn.head()

Unnamed: 0,Attrition_Flag,Gender,Education_Level,Marital_Status,Credit_Limit,Total_Trans_Amt,Total_Trans_Ct
578,Existing Customer,F,High School,Married,2927.0,1474,26
6512,Existing Customer,M,Post-Graduate,Divorced,11596.0,4686,67
409,Existing Customer,F,Graduate,Single,6347.0,1161,44
4763,Existing Customer,F,High School,Married,1438.3,3887,79
7376,Attrited Customer,F,High School,Divorced,1662.0,2326,31


In [22]:
# Interval (brush) selection. `selection_interval` is the dedicated constructor
# and avoids the generic `alt.selection(type=...)` form, which newer Altair
# releases deprecate.
selection = alt.selection_interval()

In [23]:
# Scatter plot that owns the selection: points inside it are coloured by
# Gender, everything else is light gray.
plt1 = (
    alt.Chart(churn)
    .mark_circle(size=50)
    .encode(
        x='Credit_Limit',
        y='Total_Trans_Amt',
        color=alt.condition(selection, 'Gender', alt.value('lightgray')),
    )
    .add_selection(selection)
)

# Gender counts, restricted to the rows inside the selection.
plt2 = (
    alt.Chart(churn)
    .mark_bar()
    .encode(y='Gender', x='count(Gender):Q', color='Gender')
    .transform_filter(selection)
)

In [24]:
# Stack the two charts vertically (the function form of `plt1 & plt2`).
alt.vconcat(plt1, plt2)

`y='mean(Total_Trans_Amt):Q'`

In [26]:
# Brush over the bar chart's x axis only. By default an interval selection
# projects over both x and y; here y is an aggregate that does not exist as a
# column in the scatter plot's rows, so filtering on it would match nothing.
selection = alt.selection_interval(encodings=['x'])

# Scatter plot showing only the rows whose Marital_Status is inside the brush.
plt1 = (
    alt.Chart()
    .mark_circle(size=50)
    .encode(x='Credit_Limit', y='Total_Trans_Amt', color='Gender')
    .transform_filter(selection)
)

# Mean Total_Trans_Amt per Marital_Status; owns the selection and grays out
# bars outside it.
plt2 = (
    alt.Chart()
    .mark_bar()
    .encode(
        x='Marital_Status',
        y='mean(Total_Trans_Amt):Q',
        color=alt.condition(
            selection,
            alt.value("lightblue"),
            alt.value("lightgray"),
        ),
    )
    .properties(height=300, width=200)
    .add_selection(selection)
)

In [27]:
# plt1 and plt2 were built without data, so it is supplied at the concat level.
alt.hconcat(
    plt1,
    plt2,
    data=churn,
)

In [28]:
# Multi-selection on Education_Level, bound to the chart legend.
selection = alt.selection_multi(
    bind='legend',
    fields=['Education_Level'],
)

In [29]:
# Scatter plot coloured by Education_Level; points outside the selection are
# faded to opacity 0.1 instead of being removed.
(
    alt.Chart(churn)
    .mark_circle(size=50)
    .encode(
        x='Total_Trans_Ct',
        y='Total_Trans_Amt',
        color=alt.Color(
            'Education_Level:N',
            scale=alt.Scale(scheme='category20b'),
        ),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.1)),
    )
    .properties(height=400, width=500)
    .add_selection(selection)
)

## [Altair: Statistical Visualization Library for Python](https://towardsdatascience.com/altair-statistical-visualization-library-for-python-part-4-9ec970fb12e8) (Part 4)

In [2]:
# Read the insurance dataset for this section and preview its first five rows.
insurance = pd.read_csv(filepath_or_buffer="/content/insurance.csv")
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Baseline scatter plot with default axis scales.
(
    alt.Chart(insurance)
    .mark_circle()
    .encode(x='charges', y='bmi', color='smoker')
    .properties(height=400, width=500)
)

In [4]:
# Same plot with explicit channel objects; the y scale is built with zero=False.
(
    alt.Chart(insurance)
    .mark_circle()
    .encode(
        alt.X('charges'),
        alt.Y('bmi', scale=alt.Scale(zero=False)),
        alt.Color('smoker'),
    )
    .properties(height=400, width=500)
)

In [11]:
# Same plot with the y-scale domain set explicitly to (10, 60).
(
    alt.Chart(insurance)
    .mark_circle()
    .encode(
        alt.X('charges'),
        alt.Y('bmi', scale=alt.Scale(domain=(10, 60))),
        alt.Color('smoker'),
    )
    .properties(height=300, width=400)
)

In [12]:
# Size, colour and opacity are set as mark properties on mark_circle.
(
    alt.Chart(insurance)
    .mark_circle(
        size=50,
        color='darkblue',
        opacity=0.6,
    )
    .encode(
        alt.X('charges'),
        alt.Y('bmi', scale=alt.Scale(domain=(15, 55))),
    )
    .properties(height=400, width=500)
)

In [13]:
# Here size, colour and opacity are passed as constant-value encodings
# (alt.value) instead of mark properties.
(
    alt.Chart(insurance)
    .mark_circle()
    .encode(
        alt.X('charges'),
        alt.Y('bmi', scale=alt.Scale(domain=(15, 55))),
        size=alt.value(50),
        color=alt.value('darkblue'),
        opacity=alt.value(0.6),
    )
    .properties(height=400, width=500)
)

In [14]:
# Adds a chart title and a customised legend (title, left orientation,
# font sizes) for the smoker colour channel.
(
    alt.Chart(insurance)
    .mark_circle(size=40)
    .encode(
        alt.X('charges'),
        alt.Y('bmi', scale=alt.Scale(zero=False)),
        alt.Color(
            'smoker',
            legend=alt.Legend(
                title='Do they smoke?',
                orient='left',
                titleFontSize=13,
                labelFontSize=13,
            ),
        ),
    )
    .properties(title="Bmi vs Insurance Cost")
)

In [15]:
# Same chart with explicit axis titles and an explicit chart size.
(
    alt.Chart(insurance)
    .mark_circle(size=40)
    .encode(
        alt.X('charges', title="Insurance Cost"),
        alt.Y(
            'bmi',
            scale=alt.Scale(zero=False),
            title="Body Mass Index",
        ),
        alt.Color(
            'smoker',
            legend=alt.Legend(
                title='Do they smoke?',
                orient='left',
                titleFontSize=13,
                labelFontSize=13,
            ),
        ),
    )
    .properties(
        title="Bmi vs Insurance Cost",
        height=350,
        width=500,
    )
)