# Pandas Walkthrough

In [1]:
import pandas as pd

## Pandas Series

In [66]:
lst = [1,3,4,6,7,5,44,33,2,5,6,7,8,2]
type(lst)

list

In [67]:
lst[4]

7

In [68]:
#to convert a slit to Pandas Series
mySeries = pd.Series(lst)
mySeries

0      1
1      3
2      4
3      6
4      7
5      5
6     44
7     33
8      2
9      5
10     6
11     7
12     8
13     2
dtype: int64

In [69]:
mySeries[8]

2

In [70]:
mySeries[7:12]

7     33
8      2
9      5
10     6
11     7
dtype: int64

In [71]:
mySeries2 = mySeries[7:12]
mySeries2

7     33
8      2
9      5
10     6
11     7
dtype: int64

In [72]:
mySeries2 = mySeries2.reset_index()
mySeries2

Unnamed: 0,index,0
0,7,33
1,8,2
2,9,5
3,10,6
4,11,7


In [73]:
lst = [11,12,13]
mySeries2 = pd.Series(lst, index=['x', 'y', 'z'])
mySeries2

x    11
y    12
z    13
dtype: int64

In [74]:
string1 = 'abcdefghijkl'
Ser1 = pd.Series(list(string1))
Ser1

0     a
1     b
2     c
3     d
4     e
5     f
6     g
7     h
8     i
9     j
10    k
11    l
dtype: object

In [75]:
# convert pandas series into a numpy arrray
Ser1.values

array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l'],
      dtype=object)

In [76]:
#create series using range function
Seq = pd.Series([x for x in range (1,25)])
Seq

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
20    21
21    22
22    23
23    24
dtype: int64

In [77]:
#negative indices
Seq[-4:-1]

20    21
21    22
22    23
dtype: int64

In [78]:
#apply logical condition using NumPy
#I want to mark the numbers that are divisible by 2 (even numbers)
Seq2 = Seq.where(Seq%2 == 0)
Seq2

0      NaN
1      2.0
2      NaN
3      4.0
4      NaN
5      6.0
6      NaN
7      8.0
8      NaN
9     10.0
10     NaN
11    12.0
12     NaN
13    14.0
14     NaN
15    16.0
16     NaN
17    18.0
18     NaN
19    20.0
20     NaN
21    22.0
22     NaN
23    24.0
dtype: float64

In [79]:
#to get a clean list (drop the nulls)
Seq2.dropna()

1      2.0
3      4.0
5      6.0
7      8.0
9     10.0
11    12.0
13    14.0
15    16.0
17    18.0
19    20.0
21    22.0
23    24.0
dtype: float64

In [80]:
Seq2.dropna().reset_index(drop=True)

0      2.0
1      4.0
2      6.0
3      8.0
4     10.0
5     12.0
6     14.0
7     16.0
8     18.0
9     20.0
10    22.0
11    24.0
dtype: float64

In [81]:
#replace nulls with an arbitrary value
Seq.where(Seq%2 == 0, 'Odd Number')

0     Odd Number
1              2
2     Odd Number
3              4
4     Odd Number
5              6
6     Odd Number
7              8
8     Odd Number
9             10
10    Odd Number
11            12
12    Odd Number
13            14
14    Odd Number
15            16
16    Odd Number
17            18
18    Odd Number
19            20
20    Odd Number
21            22
22    Odd Number
23            24
dtype: object

In [82]:
# split function
mySer = pd.Series(['this is an example', 'this is the second sentence'])

mySer.str.split()

0              [this, is, an, example]
1    [this, is, the, second, sentence]
dtype: object

In [83]:
mySer.str.upper()

0             THIS IS AN EXAMPLE
1    THIS IS THE SECOND SENTENCE
dtype: object

## Pandas Dataframe

**Anatomy of a DataFrame**
 
![df](https://static.packt-cdn.com/products/9781839213106/graphics/Images/B15597_01_01.png)

In [84]:
# converting a data dictionary into a datframe
# key : value(s)
data = {'name': ['Matt', 'John', 'Tammy'],
        'age': [33,27,19]
        }
data

{'name': ['Matt', 'John', 'Tammy'], 'age': [33, 27, 19]}

In [85]:
data['name']

['Matt', 'John', 'Tammy']

In [86]:
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,name,age
0,Matt,33
1,John,27
2,Tammy,19


In [87]:
#grab the first row of the dataframe
df.loc[0]

name    Matt
age       33
Name: 0, dtype: object

In [88]:
#first and 3rd row
df.loc[[0,2]]

Unnamed: 0,name,age
0,Matt,33
2,Tammy,19


In [89]:
#get the dimension
df.ndim

2

In [90]:
#get the size of the data
df.shape

(3, 2)

3 rows and 2 columns

In [91]:
#get the data type 
df.dtypes

name    object
age      int64
dtype: object

In [92]:
df.T

Unnamed: 0,0,1,2
name,Matt,John,Tammy
age,33,27,19


### Iteration Methods

We have 3 different methods. Let's start with an example

In [1]:
import pandas as pd
data = {'Name': ['Matt', 'John', 'Tammy', 'Bob', 'Nancy'],
        'Age': [33,27,19, 29, 18],
        'City': ['New York', 'Nashville', 'San Diego', 'Asheville','Atlanta']
        }
data

{'Name': ['Matt', 'John', 'Tammy', 'Bob', 'Nancy'],
 'Age': [33, 27, 19, 29, 18],
 'City': ['New York', 'Nashville', 'San Diego', 'Asheville', 'Atlanta']}

In [2]:
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Name,Age,City
0,Matt,33,New York
1,John,27,Nashville
2,Tammy,19,San Diego
3,Bob,29,Asheville
4,Nancy,18,Atlanta


#### Method 1 - using `df.index`

In [3]:
for i in df.index:
    print(df['Name'][i], 'lives in', df['City'][i])

Matt lives in New York
John lives in Nashville
Tammy lives in San Diego
Bob lives in Asheville
Nancy lives in Atlanta


In [4]:
len(df)

5

#### Method 2 - using df length `len(df)`

In [5]:
#method 2
for i in range(len(df)):
    print(df['Name'][i], 'is', df['Age'][i], 'years old')

Matt is 33 years old
John is 27 years old
Tammy is 19 years old
Bob is 29 years old
Nancy is 18 years old


#### Method 3 - using `iterrows()`

In [6]:
#method 3
for i, row in df.iterrows():
    print('Person', i+1, ':',row['Name'])

Person 1 : Matt
Person 2 : John
Person 3 : Tammy
Person 4 : Bob
Person 5 : Nancy


In [7]:
# derive a new column
df['YearsToRetirement'] = 65 - df['Age'] 
df

Unnamed: 0,Name,Age,City,YearsToRetirement
0,Matt,33,New York,32
1,John,27,Nashville,38
2,Tammy,19,San Diego,46
3,Bob,29,Asheville,36
4,Nancy,18,Atlanta,47


In [8]:
df.to_json('my_data.json', orient='records') # use orient to skip the index

### `apply` and `map` Functions

In [9]:
#using apply method
#add 2 years for each age
df.apply(lambda row: row['Age'] + 2, axis=1)

0    35
1    29
2    21
3    31
4    20
dtype: int64

In [None]:
# add more elements and automate for any value
yrs = 6
df.apply(lambda row: row['name']  + ' will be ' + str(row['age'] + yrs) + ' after ' + str(yrs)+' years', axis=1)

0     Matt will be 39 after 6 years
1     John will be 33 after 6 years
2    Tammy will be 25 after 6 years
3      Bob will be 35 after 6 years
4    Nancy will be 24 after 6 years
dtype: object

In [None]:
df['teenager'] = df['age'].map(lambda x: True if x < 20 else False) #
df

Unnamed: 0,name,age,city,teenager
0,Matt,33,New York,False
1,John,27,Nashville,False
2,Tammy,19,San Diego,True
3,Bob,29,Ashville,False
4,Nancy,18,Atlanta,True


- apply could be used too, but map is faster if it's only one column
- So in summary:
    - Use map() for element-wise transformations with a mapping dict/function
    - Use apply() when you need access to the entire row/column to transform data
    - map() is faster and supports chaining, apply() allows custom functions

In [None]:
df['region'] = df['city'].map({'Nashville':'South', 'Atlanta':'South'})
df

Unnamed: 0,name,age,city,teenager,region
0,Matt,33,New York,False,
1,John,27,Nashville,False,South
2,Tammy,19,San Diego,True,
3,Bob,29,Ashville,False,
4,Nancy,18,Atlanta,True,South


In [None]:
# sorting in descending order
df.sort_values(by='age', ascending=False)

Unnamed: 0,name,age,city,teenager,region
0,Matt,33,New York,False,
3,Bob,29,Ashville,False,
1,John,27,Nashville,False,South
2,Tammy,19,San Diego,True,
4,Nancy,18,Atlanta,True,South


Looking for nulls

In [None]:
# using isna function
import math
data = {'name': ['Matt', 'John', 'Tammy', 'Bob'],
        'age': [33,27,19, 29],
        'city': ['New York', 'Nashville', 'San Diego', math.nan]
        }
data

{'name': ['Matt', 'John', 'Tammy', 'Bob'],
 'age': [33, 27, 19, 29],
 'city': ['New York', 'Nashville', 'San Diego', nan]}

In [None]:
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,name,age,city
0,Matt,33,New York
1,John,27,Nashville
2,Tammy,19,San Diego
3,Bob,29,


In [None]:
df.isna()

Unnamed: 0,name,age,city
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,True


In [None]:
df.isna().sum()

name    0
age     0
city    1
dtype: int64

Next, we'll go a full DA exercise in pandas.