# Series methods

In [44]:
import pandas as pd
import numpy as np

# converting from dataframe to series

In [2]:
pd.read_csv('/home/diego/Documents/Data/pokemon.csv')  # This is a dataframe

Unnamed: 0,Pokemon,Type
0,Bulbasaur,Grass / Poison
1,Ivysaur,Grass / Poison
2,Venusaur,Grass / Poison
3,Charmander,Fire
4,Charmeleon,Fire
...,...,...
804,Stakataka,Rock / Steel
805,Blacephalon,Fire / Ghost
806,Zeraora,Electric
807,Meltan,Steel


In [3]:
# Load the CSV using the 'Pokemon' column as the index.
# The result is still a DataFrame, with a single column of values ('Type').
pokemon_df = pd.read_csv('/home/diego/Documents/Data/pokemon.csv', index_col='Pokemon')

# To turn the single-column DataFrame into a Series, we use the DataFrame's squeeze method
pokemon = pokemon_df.squeeze()  # Now a Series

# Only the last expression of a cell is displayed, so a bare `type(pokemon)` would be
# silently discarded; assert the type explicitly instead.
assert isinstance(pokemon, pd.Series)
pokemon

Pokemon
Bulbasaur      Grass / Poison
Ivysaur        Grass / Poison
Venusaur       Grass / Poison
Charmander               Fire
Charmeleon               Fire
                    ...      
Stakataka        Rock / Steel
Blacephalon      Fire / Ghost
Zeraora              Electric
Meltan                  Steel
Melmetal                Steel
Name: Type, Length: 809, dtype: object

## now with google.csv

In [4]:
# Open the google_stocks CSV, parse the 'Date' column from strings into datetimes, use it as the index, and finally squeeze the single-column DataFrame into a Series
google = pd.read_csv('/home/diego/Documents/Data/google_stocks.csv', parse_dates=['Date'], index_col='Date').squeeze()
google

Date
2004-08-19      49.98
2004-08-20      53.95
2004-08-23      54.50
2004-08-24      52.24
2004-08-25      52.80
               ...   
2019-10-21    1246.15
2019-10-22    1242.80
2019-10-23    1259.13
2019-10-24    1260.99
2019-10-25    1265.13
Name: Close, Length: 3824, dtype: float64

## Now revolutionary_war file

In [5]:
pd.read_csv('/home/diego/Documents/Data/revolutionary_war.csv', 
            index_col='Battle', 
            parse_dates=['Start Date']).tail()

Unnamed: 0_level_0,Start Date,State
Battle,Unnamed: 1_level_1,Unnamed: 2_level_1
Siege of Fort Henry,1782-09-11,Virginia
Grand Assault on Gibraltar,1782-09-13,
Action of 18 October 1782,1782-10-18,
Action of 6 December 1782,1782-12-06,
Action of 22 January 1783,1783-01-22,Virginia


In this case we can't convert the DataFrame to a Series directly: besides the index (`Battle`) it still has two columns of values (`Start Date` and `State`), and `squeeze()` only collapses a DataFrame that has a single column

So we need to limit the import to only two columns with the `usecols` parameter: one becomes the index and the other holds the values


In [6]:
battles = pd.read_csv('/home/diego/Documents/Data/revolutionary_war.csv', 
            usecols=['Start Date', 'State'],
            index_col='Start Date', 
            parse_dates=['Start Date']).squeeze()
battles

Start Date
1774-09-01    Massachusetts
1774-12-14    New Hampshire
1775-04-19    Massachusetts
1775-04-19    Massachusetts
1775-04-20         Virginia
                  ...      
1782-09-11         Virginia
1782-09-13              NaN
1782-10-18              NaN
1782-12-06              NaN
1783-01-22         Virginia
Name: State, Length: 232, dtype: object

## sorting values

In [7]:
google.sort_values()

Date
2004-09-03      49.82
2004-09-01      49.94
2004-08-19      49.98
2004-09-02      50.57
2004-09-07      50.60
               ...   
2019-04-23    1264.55
2019-10-25    1265.13
2018-07-26    1268.33
2019-04-26    1272.18
2019-04-29    1287.58
Name: Close, Length: 3824, dtype: float64

In [8]:
pokemon.sort_values()

Pokemon
Illumise                Bug
Silcoon                 Bug
Pinsir                  Bug
Burmy                   Bug
Wurmple                 Bug
                  ...      
Tirtouga       Water / Rock
Relicanth      Water / Rock
Corsola        Water / Rock
Carracosta     Water / Rock
Empoleon      Water / Steel
Name: Type, Length: 809, dtype: object

In [9]:
# When sorting, by default the nan values are at the end of the series
battles.sort_values().tail()

Start Date
1782-08-08    NaN
1782-08-25    NaN
1782-09-13    NaN
1782-10-18    NaN
1782-12-06    NaN
Name: State, dtype: object

In [10]:
# nan values at first
battles.sort_values(na_position='first').head()

Start Date
1775-09-17    NaN
1775-12-31    NaN
1776-03-03    NaN
1776-03-25    NaN
1776-05-18    NaN
Name: State, dtype: object

## Dropping nan

In [11]:
battles.dropna().sort_values()

Start Date
1781-09-06    Connecticut
1779-07-05    Connecticut
1777-04-27    Connecticut
1777-09-03       Delaware
1777-05-17        Florida
                 ...     
1781-07-06       Virginia
1781-07-01       Virginia
1781-06-26       Virginia
1781-04-25       Virginia
1783-01-22       Virginia
Name: State, Length: 162, dtype: object

## sorting index

In [12]:
pokemon.sort_index()

Pokemon
Abomasnow        Grass / Ice
Abra                 Psychic
Absol                   Dark
Accelgor                 Bug
Aegislash      Steel / Ghost
                  ...       
Zoroark                 Dark
Zorua                   Dark
Zubat        Poison / Flying
Zweilous       Dark / Dragon
Zygarde      Dragon / Ground
Name: Type, Length: 809, dtype: object

In [13]:
battles.sort_index(na_position='first')

Start Date
NaT              New Jersey
NaT                Virginia
NaT                     NaN
NaT                     NaN
1774-09-01    Massachusetts
                  ...      
1782-09-11         Virginia
1782-09-13              NaN
1782-10-18              NaN
1782-12-06              NaN
1783-01-22         Virginia
Name: State, Length: 232, dtype: object

## nlargest, nsmallest

In [15]:
google.nlargest()

Date
2019-04-29    1287.58
2019-04-26    1272.18
2018-07-26    1268.33
2019-10-25    1265.13
2019-04-23    1264.55
Name: Close, dtype: float64

In [16]:
google.nsmallest()

Date
2004-09-03    49.82
2004-09-01    49.94
2004-08-19    49.98
2004-09-02    50.57
2004-09-07    50.60
Name: Close, dtype: float64

In [19]:
# nsmallest/nlargest are not supported for object (string) dtype, so this call raises.
# The exception is the point of the demonstration: catch and show it so the notebook
# still runs top to bottom with Restart & Run All.
try:
    battles.nsmallest()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Cannot use method 'nsmallest' with dtype object

## Counting occurrences

In [29]:
len(pokemon)

809

In [28]:
pokemon.nunique()

159

In [26]:
# value_counts() counts how many times each unique value occurs, so the result has at most as many rows as the original data
# in this case 159 unique values out of 809 rows
pokemon.value_counts()

Type
Normal                65
Water                 61
Grass                 38
Psychic               35
Fire                  30
                      ..
Fire / Psychic         1
Normal / Ground        1
Psychic / Fighting     1
Dark / Ghost           1
Fire / Ghost           1
Name: count, Length: 159, dtype: int64

When the `normalize` parameter is `True`, `value_counts()` gives the relative frequency of each value: $N(a) / total$

In [33]:
pokemon.value_counts(normalize=True)

Type
Normal                0.080346
Water                 0.075402
Grass                 0.046972
Psychic               0.043263
Fire                  0.037083
                        ...   
Fire / Psychic        0.001236
Normal / Ground       0.001236
Psychic / Fighting    0.001236
Dark / Ghost          0.001236
Fire / Ghost          0.001236
Name: proportion, Length: 159, dtype: float64

We can round the values of a Series with the `round()` method

In [41]:
(pokemon.value_counts(normalize=True) * 100).round(2)

Type
Normal                8.03
Water                 7.54
Grass                 4.70
Psychic               4.33
Fire                  3.71
                      ... 
Fire / Psychic        0.12
Normal / Ground       0.12
Psychic / Fighting    0.12
Dark / Ghost          0.12
Fire / Ghost          0.12
Name: proportion, Length: 159, dtype: float64

## Grouping data
To identify trends in numeric data sets, it can be more beneficial to group values into
predefined intervals rather than count distinct values.

In [42]:
google.max()

1287.58

In [43]:
google.min()

49.82

The data lies between $49.82$ and $1287.58$, so the prices span a range of roughly $1240$.
Let's group the values into intervals of width 200

In [47]:
bucks = np.linspace(0, 1400, 8)
bucks

array([   0.,  200.,  400.,  600.,  800., 1000., 1200., 1400.])

The `bins` parameter of `value_counts()` groups the values into half-open intervals

In [55]:
google.value_counts(bins=bucks, sort=False)

(-0.001, 200.0]      595
(200.0, 400.0]      1568
(400.0, 600.0]       575
(600.0, 800.0]       380
(800.0, 1000.0]      207
(1000.0, 1200.0]     406
(1200.0, 1400.0]      93
Name: count, dtype: int64

In [51]:
# Here the price was between 200 and 400 in 1568 values of the dataset
google.value_counts(bins=bucks).sort_index(ascending=True)

(-0.001, 200.0]      595
(200.0, 400.0]      1568
(400.0, 600.0]       575
(600.0, 800.0]       380
(800.0, 1000.0]      207
(1000.0, 1200.0]     406
(1200.0, 1400.0]      93
Name: count, dtype: int64

In [59]:
# bins can also be an integer: the number of equal-width intervals to create
google.value_counts(bins=7, sort=False)

(48.581, 226.643]       824
(226.643, 403.466]     1346
(403.466, 580.289]      514
(580.289, 757.111]      313
(757.111, 933.934]      256
(933.934, 1110.757]     293
(1110.757, 1287.58]     278
Name: count, dtype: int64

### Working with battles

In [61]:
battles.value_counts(dropna=False)

State
NaN               70
South Carolina    31
New York          28
New Jersey        24
Virginia          21
Massachusetts     11
Pennsylvania      10
North Carolina     9
Florida            8
Georgia            6
Rhode Island       3
Connecticut        3
Vermont            3
New Hampshire      1
Delaware           1
Indiana            1
Louisiana          1
Ohio               1
Name: count, dtype: int64

All of the previous methods operate on the values of the Series. They can also be applied to the index, through the `Series.index` attribute

In [65]:
battles.index.value_counts(dropna=False)

Start Date
NaT           4
1777-08-22    2
1781-05-22    2
1782-01-11    2
1780-08-18    2
             ..
1778-06-30    1
1778-07-03    1
1778-07-27    1
1778-08-21    1
1783-01-22    1
Name: count, Length: 218, dtype: int64

# apply method

This method takes as argument a function **(the name)** and applies the function to each Series' value

It can also be a lambda function

In [67]:
google

Date
2004-08-19      49.98
2004-08-20      53.95
2004-08-23      54.50
2004-08-24      52.24
2004-08-25      52.80
               ...   
2019-10-21    1246.15
2019-10-22    1242.80
2019-10-23    1259.13
2019-10-24    1260.99
2019-10-25    1265.13
Name: Close, Length: 3824, dtype: float64

In [66]:
def square(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

In [68]:
google.apply(square)

Date
2004-08-19    2.498000e+03
2004-08-20    2.910603e+03
2004-08-23    2.970250e+03
2004-08-24    2.729018e+03
2004-08-25    2.787840e+03
                  ...     
2019-10-21    1.552890e+06
2019-10-22    1.544552e+06
2019-10-23    1.585408e+06
2019-10-24    1.590096e+06
2019-10-25    1.600554e+06
Name: Close, Length: 3824, dtype: float64

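So the following two cells do the same thing (rounding every value), once with the Series method and once with `apply`. Note that the resulting dtypes differ: `float64` versus `int64`.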

In [69]:
# Here we are using the method that is defined in pandas class Series
google.round()

Date
2004-08-19      50.0
2004-08-20      54.0
2004-08-23      54.0
2004-08-24      52.0
2004-08-25      53.0
               ...  
2019-10-21    1246.0
2019-10-22    1243.0
2019-10-23    1259.0
2019-10-24    1261.0
2019-10-25    1265.0
Name: Close, Length: 3824, dtype: float64

Using the `apply` method:

In [70]:
# Here we are using the built-in function, and the apply method
google.apply(round)

Date
2004-08-19      50
2004-08-20      54
2004-08-23      54
2004-08-24      52
2004-08-25      53
              ... 
2019-10-21    1246
2019-10-22    1243
2019-10-23    1259
2019-10-24    1261
2019-10-25    1265
Name: Close, Length: 3824, dtype: int64

I want to know whether a pokemon has one type or two. According to the format, a `/` tells us the pokemon has more than one type, so let's write a function that reports whether a value is a single or a multi type

In [71]:
def single_or_multi(pokemon_type: str):
    """Classify a type string as 'multi' or 'single'.

    A '/' anywhere in ``pokemon_type`` (e.g. 'Grass / Poison') marks it as
    'multi'; any string without a '/' is 'single'.
    """
    return 'multi' if '/' in pokemon_type else 'single'

In [73]:
pokemon

Pokemon
Bulbasaur      Grass / Poison
Ivysaur        Grass / Poison
Venusaur       Grass / Poison
Charmander               Fire
Charmeleon               Fire
                    ...      
Stakataka        Rock / Steel
Blacephalon      Fire / Ghost
Zeraora              Electric
Meltan                  Steel
Melmetal                Steel
Name: Type, Length: 809, dtype: object

In [74]:
what_type = pokemon.apply(single_or_multi)

How many are multi and how many are single?

In [75]:
what_type.value_counts(dropna=False)

Type
multi     405
single    404
Name: count, dtype: int64