[Reference](https://medium.com/@gamiranda.ds/basic-python-altair-tutorial-on-pokemon-dataset-93f088c936df)



In [1]:
import pandas as pd
import altair as alt
# Load the Pokemon dataset from the tutorial's GitHub repository.
df = pd.read_csv("https://raw.githubusercontent.com/gamiranda/mediumPost/main/Pokemon.csv")

# Recode the Legendary flag as a readable label for use in chart legends.
# Done in one vectorised step instead of a per-row df.loc lookup; as before,
# every value that does not equal True is labelled "Not Legendary".
df['Legendary'] = df['Legendary'].eq(True).map({True: "Legendary", False: "Not Legendary"})

# Generation is stored as text: later cells compare it with the string "3"
# and use it as a bar / facet category.
df['Generation'] = df['Generation'].astype(str)

In [2]:
# Print the column names, non-null counts and dtypes of the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   Total       800 non-null    int64 
 5   HP          800 non-null    int64 
 6   Attack      800 non-null    int64 
 7   Defense     800 non-null    int64 
 8   Sp. Atk     800 non-null    int64 
 9   Sp. Def     800 non-null    int64 
 10  Speed       800 non-null    int64 
 11  Generation  800 non-null    object
 12  Legendary   800 non-null    object
dtypes: int64(8), object(5)
memory usage: 81.4+ KB


# Information about Pokemon type


In [3]:
# Horizontal bar chart of the number of rows per 'Type 1' value.
bars = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x='count()',
        y=alt.Y('Type 1', sort='-x'),  # order the bars by their x value
    )
)

# Text layer built from the bar chart so it shares the same x/y encodings;
# dx offsets each label horizontally by 20 pixels.
text = bars.mark_text(align='right', baseline='middle', dx=20).encode(text='count()')

# Layer the labels over the bars; height sets the figure height.
(bars + text).properties(height=500)

# Correlation between the numerical variables


In [4]:
# Drop the non-stat columns so only HP, Attack, Defense, Sp. Atk, Sp. Def and
# Speed remain. `columns=` replaces the positional axis argument
# (`drop(labels, 1)`), which pandas deprecated in 1.3 and rejects from 2.0 on.
df_num = df.drop(columns=["#", "Name", "Type 1", "Type 2", "Generation", "Total", "Legendary"])

# Pearson correlation matrix, reshaped to long form: one row per (X, Y) pair.
corrMatrix = df_num.corr(method='pearson').reset_index().melt('index')
corrMatrix.columns = ['X', 'Y', 'correlation']

# Shared x/y encoding; each heatmap cell is 100 x 100 pixels.
base = alt.Chart(corrMatrix).encode(
    x='X',
    y='Y',
).properties(
    width=alt.Step(100),
    height=alt.Step(100)
)

# Coloured rectangles: one per variable pair, shaded by correlation.
rects = base.mark_rect().encode(
    color='correlation'
)

# Correlation value printed on top of each rectangle with two decimals.
text = base.mark_text(
    size=30
).encode(
    text=alt.Text('correlation', format=".2f"),
    color=alt.condition(
        # If the correlation is higher than 0.5 the number in the chart is
        # drawn in white, otherwise in black.
        "datum.correlation > 0.5",
        alt.value('white'),
        alt.value('black')
    )
)

rects + text

  """Entry point for launching an IPython kernel.


# Comparing HP and Attack


In [5]:
# Scatter plot of HP against Attack; the Legendary label drives both the
# colour and the marker shape.
(
    alt.Chart(df)
    .mark_point()
    .encode(
        x='HP',
        y='Attack',
        color='Legendary',
        shape='Legendary',
    )
    .properties(
        title='Scatterplot Chart Comparing HP and Attack',  # chart title
        width=400,   # chart width
        height=400,  # chart height
    )
)

In [6]:
# Same HP vs Attack scatter plot, now with a tooltip listing Name, Attack and HP
# and with .interactive() applied to the chart.
(
    alt.Chart(df)
    .mark_point()
    .encode(
        x='HP',
        y='Attack',
        color='Legendary',
        shape='Legendary',
        tooltip=['Name', 'Attack', 'HP'],
    )
    .properties(
        title='Scatterplot Chart Comparing HP and Attack',
        width=400,
        height=400,
    )
    .interactive()
)

In [7]:
# HP vs Attack scatter plot split into one column panel per Generation value;
# each panel is 200 x 200.
(
    alt.Chart(df)
    .mark_point()
    .encode(
        x='HP',
        y='Attack',
        column='Generation',
        color='Legendary',
        shape='Legendary',
        tooltip=['Name', 'Attack', 'HP'],
    )
    .properties(
        title='Scatterplot Chart Comparing HP and Attack by Generation',
        width=200,
        height=200,
    )
    .interactive()
)

# Visualizing the Generation variable


In [8]:
# Bar chart of the number of rows per Generation value.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x='Generation',
        y='count()',
    )
    .properties(
        title='Barplot Chart Comparing Generation',
        width=400,
        height=400,
    )
)

In [9]:
# Bar chart of the row count per Generation with the bar for generation "3"
# highlighted.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x='Generation',
        y='count()',
        color=alt.condition(
            alt.datum.Generation == "3",  # test: is this the generation "3" bar?
            alt.value('orange'),          # colour when the test is true
            alt.value('steelblue'),       # colour for every other bar
        ),
    )
    .properties(
        title='Barplot Chart Comparing Generation',
        width=400,
        height=400,
    )
    .interactive()
)

# Information about HP and Attack


In [10]:
# Boxplot of HP grouped and coloured by the Legendary label.
# size=150 sets the size of each box.
g1 = (
    alt.Chart(df)
    .mark_boxplot(size=150)
    .encode(x='Legendary', y='HP', color='Legendary')
    .properties(
        title="Boxplot of the Pokemon's HP",
        width=400,
        height=400,
    )
    .interactive()
)

# Same layout for Attack.
g2 = (
    alt.Chart(df)
    .mark_boxplot(size=150)
    .encode(x='Legendary', y='Attack', color='Legendary')
    .properties(
        title="Boxplot of the Pokemon's Attack",
        width=400,
        height=400,
    )
    .interactive()
)

# Horizontal concatenation: show the two boxplots side by side.
g1 | g2

In [11]:
# HP of the rows labelled 'Legendary'. A boolean mask selects them in one
# vectorised step instead of a Python loop doing one df.loc lookup per row.
hp_leg = df.loc[df['Legendary'] == 'Legendary', 'HP'].tolist()

source = pd.DataFrame({
    'HP Legendary': hp_leg
})

base = alt.Chart(source)

# Histogram of the HP values, binned into at most 10 bins.
bar = base.mark_bar().encode(
    x=alt.X('HP Legendary', bin=alt.Bin(maxbins=10)),
    y='count()'
).properties(title = "Histogram of the Legendary Pokemon's HP",
             width=400,
             height=400).interactive()

# Red rule positioned at the mean HP; size sets the line thickness.
rule = base.mark_rule(color='red').encode(
    x='mean(HP Legendary)',
    size=alt.value(5)
)

g1 = bar + rule

# Same chart for the rows labelled 'Not Legendary'.
hp_nleg = df.loc[df['Legendary'] == 'Not Legendary', 'HP'].tolist()

source = pd.DataFrame({
    'HP Not Legendary': hp_nleg
})

base = alt.Chart(source)

bar = base.mark_bar().encode(
    x=alt.X('HP Not Legendary', bin=alt.Bin(maxbins=10)),
    y='count()'
).properties(title = "Histogram of the not Legendary Pokemon's HP",
             width=400,
             height=400).interactive()

rule = base.mark_rule(color='red').encode(
    x='mean(HP Not Legendary)',
    size=alt.value(5)
)

g2 = bar + rule

# Show both histograms side by side.
g1 | g2

In [12]:
# Attack of the rows labelled 'Legendary', selected with a boolean mask
# instead of a per-row df.loc loop.
at_leg = df.loc[df['Legendary'] == 'Legendary', 'Attack'].tolist()

# Bug fix: this column was previously filled from hp_leg, so the "Attack"
# histogram actually displayed HP values.
source = pd.DataFrame({
    'Attack Legendary': at_leg
})

base = alt.Chart(source)

# Histogram of the Attack values, binned into at most 10 bins.
bar = base.mark_bar().encode(
    x=alt.X('Attack Legendary', bin=alt.Bin(maxbins=10)),
    y='count()'
).properties(title = "Histogram of the Legendary Pokemon's Attack",
             width=400,
             height=400).interactive()

# Red rule positioned at the mean Attack; size sets the line thickness.
rule = base.mark_rule(color='red').encode(
    x='mean(Attack Legendary)',
    size=alt.value(5)
)

g1 = bar + rule

# Same chart for the rows labelled 'Not Legendary'. Stored under its own name
# so the HP list `hp_nleg` is no longer overwritten with Attack values.
at_nleg = df.loc[df['Legendary'] == 'Not Legendary', 'Attack'].tolist()

source = pd.DataFrame({
    'Attack Not Legendary': at_nleg
})

base = alt.Chart(source)

bar = base.mark_bar().encode(
    x=alt.X('Attack Not Legendary', bin=alt.Bin(maxbins=10)),
    y='count()'
).properties(title = "Histogram of the not Legendary Pokemon's Attack",
             width=400,
             height=400).interactive()

rule = base.mark_rule(color='red').encode(
    x='mean(Attack Not Legendary)',
    size=alt.value(5)
)

g2 = bar + rule

# Show both histograms side by side.
g1 | g2