In [34]:
import pandas as pd

df = pd.read_csv('https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv')

## Discussion 


1. What is a dataframe?


2. What information can we obtain about a dataframe?


3. How have we interacted with a dataframe?



In [35]:
# 1. What is a df?

# Tabular data object format specific to the pandas library in python

# What info can we obtain about a dataframe?
# its tabular data,
# which means it has properties like shape (rows, cols), data types, field names, cell values

# 3. How have we interacted with a dataframe?
# we have looked at dataframes largely sourced either through our ETL methodology of using mysql to call a 
# table of information from a schema, but have noted that we can also grab data from other sources
# once in a dataframe, we use this object and strategy in order to hold tabular data in memory 
# and manipulate it and filter it at our leisure for the sake of visualization, exploratory analysis,
# stats testing, and machine learning

---

### Practice Exercises


#### Information about a dataframe

1. Obtain the following information:
    
    - dimensions
    - dtypes
    - column names
    - summary statistics

In [36]:
# our dataframe: the variable df, which points to the dataframe object
# dimensions: code for shape df.shape()? df.shape?
df.shape

(800, 13)

In [38]:
# dtypes: datatypes: df.dtypes
df.dtypes

#              int64
Name          object
Type 1        object
Type 2        object
Total          int64
HP             int64
Attack         int64
Defense        int64
Sp. Atk        int64
Sp. Def        int64
Speed          int64
Generation     int64
Legendary       bool
dtype: object

In [39]:
# column names:
# 
df.columns

Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')

In [40]:
df.describe()

Unnamed: 0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,362.81375,435.1025,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375
std,208.343798,119.96304,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129
min,1.0,180.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0
25%,184.75,330.0,50.0,55.0,50.0,49.75,50.0,45.0,2.0
50%,364.5,450.0,65.0,75.0,70.0,65.0,70.0,65.0,3.0
75%,539.25,515.0,80.0,100.0,90.0,95.0,90.0,90.0,5.0
max,721.0,780.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0


In [45]:
# for the columns that are only objects (string types primarily)
df.loc[:,df.dtypes == 'O'].describe()

Unnamed: 0,Name,Type 1,Type 2
count,800,800,414
unique,800,18,18
top,Bulbasaur,Water,Flying
freq,1,112,97


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   Total       800 non-null    int64 
 5   HP          800 non-null    int64 
 6   Attack      800 non-null    int64 
 7   Defense     800 non-null    int64 
 8   Sp. Atk     800 non-null    int64 
 9   Sp. Def     800 non-null    int64 
 10  Speed       800 non-null    int64 
 11  Generation  800 non-null    int64 
 12  Legendary   800 non-null    bool  
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


In [47]:
df.isna().sum() > 0

#             False
Name          False
Type 1        False
Type 2         True
Total         False
HP            False
Attack        False
Defense       False
Sp. Atk       False
Sp. Def       False
Speed         False
Generation    False
Legendary     False
dtype: bool

---

#### Working with dataframes

1. What is the highest HP value present?

    - Create a function named highest_attack.
    - Use the loaded dataframe as an argument. 

In [49]:
# I want to look at this dataframe in a specific way:
# the HP is a field/column in df,
# i want to get the highest one
# once I do that, I want to put it in a function that returns that value

In [50]:
# to get the max hp, we can use the max method on the Series that HP lives in
df.HP.max()

255

In [51]:
def highest_attack(df, field='HP'):
    return df[field].max()
    

In [54]:
highest_attack(df, 'Attack')

190

2. Which Pokemon possess(es) the highest HP value?

In [57]:

df.sort_values(by='HP', ascending=False)['Name'].head(1)

261    Blissey
Name: Name, dtype: object

In [61]:
df[df.HP == highest_attack(df,'HP')]['Name']

261    Blissey
Name: Name, dtype: object

In [65]:
pd.Series(df.HP.values, index=df['Name']).idxmax()

'Blissey'

 
3. How many different types are represented in Type 1?

    - Create a function named num_types
    - Use the loaded dataframe as an argument
    

In [66]:
df['Type 1'].dtype

dtype('O')

In [67]:
df['Type 1'].nunique()

18

In [69]:
len(df['Type 1'].value_counts())

18

In [71]:
len(df.groupby('Type 1').count().Name)

18

In [72]:
len(df['Type 1'].unique())

18

In [73]:
def num_types(df, field_name='Type 1'):
    '''
    takes in a positional argument df, a pandas dataframe
    and an optional keyword argument or kwarg called field_name
    
    returns num_uniqs, the number of unique elements in said field
    '''
    num_uniqs = df[field_name].nunique()
    return num_uniqs

In [74]:
num_types(df)

18

4. Number of Pokemon whose Type 2 is Ghost

In [None]:
# type 2 has less entries than other fields
# we want to filter down to rows in the df
# specifically where the value of Type 2 is 'Ghost'
# aka df['Type 2'] == 'Ghost'

In [76]:
# sanity check:
(df['Type 2'] == 'Ghosts').sum()

0

In [77]:
df['Type 2'].unique()

array(['Poison', nan, 'Flying', 'Dragon', 'Ground', 'Fairy', 'Grass',
       'Fighting', 'Psychic', 'Steel', 'Ice', 'Rock', 'Dark', 'Water',
       'Electric', 'Fire', 'Ghost', 'Bug', 'Normal'], dtype=object)

In [78]:
# way 1: sum of a boolean series
(df['Type 2'] == 'Ghost').sum()

14

In [80]:
# way 2: filter the df and take the length
len(df[df['Type 2'] == 'Ghost'])

14

5. Percentage of Pokemon whose Type 2 is Ghost

    - Create a function named percent_ghost
     - Use the loaded dataframe as an argument

In [82]:
# percentage is just a ratio:
# hits of pokemanz that are type ghost over total pikimanz population * 100
# length/num rows in the entire df:
df.shape[0]

800

In [83]:
len(df)

800

In [85]:
round(
    (
        (df['Type 2'] == 'Ghost').sum() / df.shape[0]
    )  * 100, 2
)

1.75

In [86]:
def percent_ghost(df):
    return round(
    (
        (df['Type 2'] == 'Ghost').sum() / df.shape[0]
    )  * 100, 2
)

In [87]:
percent_ghost(df)

1.75

6. Number of Pokemon whose Attack is greater than Defense

In [89]:
# single comparison operator between two series
# value in field 1 greater than value in field 2:
(df.Attack > df.Defense).sum()

433

8. Lowest speed for Grass or Rock

In [95]:
# objective: lowest speed
# filter: grass or rock type
# df, where my type is in grass or rock
# get the min speed under those conditions
#  filter ==> min()
# mask or filter:
grass_rock_mask = (df['Type 1'] == 'Grass') |(df['Type 1'] == 'Rock')
df[grass_rock_mask][['Name','Speed']].min()

Name     Abomasnow
Speed           10
dtype: object

10. Change all the column names

      - lowercase letters
      - remove whitespace

In [96]:
# remove whitespace:
# leading and tailing whitespace on a string:
# we know we can remove that with .strip()
# otherwise, we may want to replace whitespace with an underscore .replace(' ', '_')
# and we want to lowercase the string: .lower()
'this Specific thing  '.strip().replace(' ', '_').lower()

'this_specific_thing'

In [98]:
# throw that in a list comprehension:
df.columns = [thing.strip().replace(' ', '_').lower() for thing in df.columns]

11. Rename Sp. Attack to special-attack

In [101]:
df = df.rename(columns={'sp._atk': 'special-attack', 'sp._def': 'special-defense'})

12. Rename Sp. Def to special-defense

In [102]:
# see above

    
13. Which Pokemon has/have the greatest difference in: 

    - Attack and Defense points
    - Special Attack and Special Defense
  

In [106]:
# a condition: (df.attack - df.defense).abs().max()

In [107]:
df['ad_diff'] = (df.attack - df.defense).abs()

In [108]:
df[df['ad_diff'] == df['ad_diff'].max()][['name']]

Unnamed: 0,name
230,Shuckle


14. How many Pokemon are of Type 1 Rock and Type 2 Fairy?

In [111]:
((df.type_1 == 'Rock') & (df.type_2 == 'Fairy')).sum()

3

15. Which Fire Pokemon appears last alphabetically?

    - Create a function named last_pokemon
    - Use the loaded dataframe as an argument

In [None]:
# filter df to just fire picklemans
# once just fire, make it sort (sort_values) based on name

In [112]:
def last_pokemon(df, type_condition='Fire'):
    return df[df.type_1 == type_condition].sort_values(by='name', ascending=False).head(1)

In [113]:
last_pokemon(df)

Unnamed: 0,#,name,type_1,type_2,total,hp,attack,defense,special-attack,special-defense,speed,generation,legendary,ad_diff
42,37,Vulpix,Fire,,299,38,41,40,50,65,65,1,False,1
