# Intro to Pandas

In [1]:
# imports for this notebook
import numpy as np
import pandas as pd
from numpy.random import randn

## Pandas Series

In [2]:
# We can create a numpy array from a list like this:
series_1 = ['Delaware', 'Georgia', 'New Hampshire', 'Tennessee', 'Arkansas']
np.array(series_1)


array(['Delaware', 'Georgia', 'New Hampshire', 'Tennessee', 'Arkansas'],
      dtype='<U13')

In [3]:
# We can also create a pandas Series from a list like this:
pd.Series(data = series_1)


0         Delaware
1          Georgia
2    New Hampshire
3        Tennessee
4         Arkansas
dtype: str

In [4]:
series_2 = [1, 4, 9, 16, 25] 

pd.Series(data = series_2)

0     1
1     4
2     9
3    16
4    25
dtype: int64

In [5]:
pd.Series(data = series_1, index = series_2) # axis labels set to series_2 values

1          Delaware
4           Georgia
9     New Hampshire
16        Tennessee
25         Arkansas
dtype: str

### Interactive Learning Moment:
1. Hover your cursor over "Series" below. You will see information pop up (Documentation).
2. The python syntax informs you of the parameters (the stuff inside of the parentheses) and their order.
3. When using a method (.Series) or function, you can specify what information is assigned to each parameter (e.g., pd.Series(data = series_1, index = series_2)).
4. If a parameter is not specified by name, python assumes the order given in the method or function definition.
5. Parameters not specified are set to default values.

In [6]:
pd.Series(series_1, series_2) 


1          Delaware
4           Georgia
9     New Hampshire
16        Tennessee
25         Arkansas
dtype: str

In [10]:
test = pd.Series(series_1, series_2) # verify type of object created (should be pandas Series)
type(test)


pandas.Series

In [11]:
# We can also pass in a dictionary to create a Series (keys become the index)
# Remember dictionaries are a python data type made up of key:value pairs (like an address book)
# Dictionaries are created with curly braces {}, not square brackets [] or parentheses ()
# Here is an example dictionary:
d = {'1': 'Delaware',
        '4': 'Georgia',
        '9': 'New Hampshire',
        '16': 'Tennessee', 
        '25': 'Arkansas'}
type(d) # should be 'dict'

dict

In [12]:
pd.Series(d) # you can pass in a dictionary to create a pandas Series (keys become the index)

1          Delaware
4           Georgia
9     New Hampshire
16        Tennessee
25         Arkansas
dtype: str

In [13]:
# What would you expect the data type to be when using pd.Series on a dictionary?
type(pd.Series(d))

pandas.Series

## Pandas DataFrame

In [14]:
# We will be using numpy's random number generator
# Set the random seed for reproducibility (so we all get the same random numbers)
np.random.seed(0)

In [15]:
# Hover over `randn` to see what the parameters are (you will have to scroll a bit -- look for "Parameters")
# Here we create a DataFrame with 5 rows and 4 columns of random numbers
# IMPORTANT: In python, rows come first, then columns (unlike MATLAB)
df = pd.DataFrame(randn(5,4), index = ['A','B','C','D','E'], columns = ['W','X','Y','Z'])
df

Unnamed: 0,W,X,Y,Z
A,1.764052,0.400157,0.978738,2.240893
B,1.867558,-0.977278,0.950088,-0.151357
C,-0.103219,0.410599,0.144044,1.454274
D,0.761038,0.121675,0.443863,0.333674
E,1.494079,-0.205158,0.313068,-0.854096


### Indexing

In [16]:
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='str')

In [17]:
df.rows # this will give an error because DataFrames do not have a 'rows' attribute

AttributeError: 'DataFrame' object has no attribute 'rows'

In [18]:
df.index # shows the row index labels

Index(['A', 'B', 'C', 'D', 'E'], dtype='str')

In [19]:
# Index columns

df['W']  # get a single column

A    1.764052
B    1.867558
C   -0.103219
D    0.761038
E    1.494079
Name: W, dtype: float64

In [20]:
# Select mulitple columns

df[['W','Z']] # double brackets for multiple columns

Unnamed: 0,W,Z
A,1.764052,2.240893
B,1.867558,-0.151357
C,-0.103219,1.454274
D,0.761038,0.333674
E,1.494079,-0.854096


In [21]:
# What is the data type of a single column?

type(df['W'])

pandas.Series

In [22]:
# Select a single row

df.loc['A'] # loc --> location (label based)

W    1.764052
X    0.400157
Y    0.978738
Z    2.240893
Name: A, dtype: float64

In [23]:
# Select a single row
df.iloc[0] # iloc --> integer location (position based)


W    1.764052
X    0.400157
Y    0.978738
Z    2.240893
Name: A, dtype: float64

#### Learning Moment:
- Python indexing is not intuitive.
- Indexing means to pull out or parse elements from a larger group of elements (List, Dict, DataFrame, etc.)
- Python indexing starts at value = 0.
- In a list [1,2,3,4,5], the index position of integer 1 is actually 0.

In [24]:
# What is the data type of a single row?
type(df.loc['A'])

pandas.Series

Dataframes are just a collection of indexable pandas Series!!!

In [25]:
# You can select rows and columns together

df.loc['B','Y']  # row B, column Y

np.float64(0.9500884175255894)

In [26]:
df.loc[['A','B'],['W','Y']]  # rows A and B, columns W and Y

Unnamed: 0,W,Y
A,1.764052,0.978738
B,1.867558,0.950088


### Manipulating DataFrames

In [27]:
# refresh original dataframe view

df

Unnamed: 0,W,X,Y,Z
A,1.764052,0.400157,0.978738,2.240893
B,1.867558,-0.977278,0.950088,-0.151357
C,-0.103219,0.410599,0.144044,1.454274
D,0.761038,0.121675,0.443863,0.333674
E,1.494079,-0.205158,0.313068,-0.854096


In [28]:
# Add a new column

df['new_column'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new_column
A,1.764052,0.400157,0.978738,2.240893,2.74279
B,1.867558,-0.977278,0.950088,-0.151357,2.817646
C,-0.103219,0.410599,0.144044,1.454274,0.040825
D,0.761038,0.121675,0.443863,0.333674,1.204901
E,1.494079,-0.205158,0.313068,-0.854096,1.807147


In [29]:
df.drop('new_column', axis=1)  # axis=1 means drop a column, while axis=0 would mean drop a row

Unnamed: 0,W,X,Y,Z
A,1.764052,0.400157,0.978738,2.240893
B,1.867558,-0.977278,0.950088,-0.151357
C,-0.103219,0.410599,0.144044,1.454274
D,0.761038,0.121675,0.443863,0.333674
E,1.494079,-0.205158,0.313068,-0.854096


In [30]:
# dataframe manipulations do not happen inplace (permenantly) unless specified (inplace=True)

df 

Unnamed: 0,W,X,Y,Z,new_column
A,1.764052,0.400157,0.978738,2.240893,2.74279
B,1.867558,-0.977278,0.950088,-0.151357,2.817646
C,-0.103219,0.410599,0.144044,1.454274,0.040825
D,0.761038,0.121675,0.443863,0.333674,1.204901
E,1.494079,-0.205158,0.313068,-0.854096,1.807147


In [31]:
df.drop('new_column', axis = 1, inplace = True) # inplace = False is a safeguard and prevents accidental data loss


In [32]:
df # verify the column has been dropped

Unnamed: 0,W,X,Y,Z
A,1.764052,0.400157,0.978738,2.240893
B,1.867558,-0.977278,0.950088,-0.151357
C,-0.103219,0.410599,0.144044,1.454274
D,0.761038,0.121675,0.443863,0.333674
E,1.494079,-0.205158,0.313068,-0.854096


## Importing Data as DataFrame

In [33]:
my_data = pd.read_csv('data.csv')

In [34]:
my_data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


### Practice Exercises

Each cell below will ask you to perform specific actions. Please save the notebook after you have completed all the exercises and push to GitHub under your branch.

In [35]:
# Confirm that `my_data` is a DataFrame
type(my_data)


pandas.DataFrame

In [42]:
# Index the 'Pulse' column
df = my_data
df['Pulse']



0     110
1     117
2     103
3     109
4     117
5     102
6     110
7     104
8     109
9      98
10    103
11    100
12    100
13    106
14    104
15     98
16     98
17    100
18     90
19    103
20     97
21    108
22    100
23    130
24    105
25    102
26    100
27     92
28    103
29    100
30    102
31     92
Name: Pulse, dtype: int64

In [43]:
# Index the 'Pulse' and 'Maxpulse' columns
my_data[['Pulse','Maxpulse']]

Unnamed: 0,Pulse,Maxpulse
0,110,130
1,117,145
2,103,135
3,109,175
4,117,148
5,102,127
6,110,136
7,104,134
8,109,133
9,98,124


In [44]:
# Index row 2 data
df.iloc[1]

Duration              60
Date        '2020/12/02'
Pulse                117
Maxpulse             145
Calories           479.0
Name: 1, dtype: object

In [55]:
# Add column '% Intensity' which is (Pulse/Max Pulse)*100
df['E'] = (df['Pulse'] / df['Maxpulse'] * 100)
df = df.rename(columns={'E': 'Intensity'})
print(df)

    Duration          Date  Pulse  Maxpulse  Calories   Intensity
0         60  '2020/12/01'    110       130     409.1   84.615385
1         60  '2020/12/02'    117       145     479.0   80.689655
2         60  '2020/12/03'    103       135     340.0   76.296296
3         45  '2020/12/04'    109       175     282.4   62.285714
4         45  '2020/12/05'    117       148     406.0   79.054054
5         60  '2020/12/06'    102       127     300.0   80.314961
6         60  '2020/12/07'    110       136     374.0   80.882353
7        450  '2020/12/08'    104       134     253.3   77.611940
8         30  '2020/12/09'    109       133     195.1   81.954887
9         60  '2020/12/10'     98       124     269.0   79.032258
10        60  '2020/12/11'    103       147     329.3   70.068027
11        60  '2020/12/12'    100       120     250.7   83.333333
12        60  '2020/12/12'    100       120     250.7   83.333333
13        60  '2020/12/13'    106       128     345.3   82.812500
14        

In [None]:
# Drop the 'Calories' column permanently
df.drop('Calories', axis=1, inplace=True)
print(df)



    Duration          Date  Pulse  Maxpulse   Intensity
0         60  '2020/12/01'    110       130   84.615385
1         60  '2020/12/02'    117       145   80.689655
2         60  '2020/12/03'    103       135   76.296296
3         45  '2020/12/04'    109       175   62.285714
4         45  '2020/12/05'    117       148   79.054054
5         60  '2020/12/06'    102       127   80.314961
6         60  '2020/12/07'    110       136   80.882353
7        450  '2020/12/08'    104       134   77.611940
8         30  '2020/12/09'    109       133   81.954887
9         60  '2020/12/10'     98       124   79.032258
10        60  '2020/12/11'    103       147   70.068027
11        60  '2020/12/12'    100       120   83.333333
12        60  '2020/12/12'    100       120   83.333333
13        60  '2020/12/13'    106       128   82.812500
14        60  '2020/12/14'    104       132   78.787879
15        60  '2020/12/15'     98       123   79.674797
16        60  '2020/12/16'     98       120   81

In [73]:
# Challenge Exercise (Hint: my_data[my_data[...]])

# Index rows where Pulse is greater than 100 and display the raw values

print(df.loc[df['Pulse'] > 100, 'Pulse'])



0     110
1     117
2     103
3     109
4     117
5     102
6     110
7     104
8     109
10    103
13    106
14    104
19    103
21    108
23    130
24    105
25    102
28    103
30    102
Name: Pulse, dtype: int64
