# Pandas Library (Chapter 4)

In [1]:
import numpy as np
import pandas as pd

In [None]:
# from pandas import *
# if we import pandas with (import *) we don't have to reference functions with pd.functionname()
# or np.objectormethodname(); this is not considered good practice.

# Series

designed to represent 1-d data structures, similarly to an array but with additional features. Internal structure is simple. It contains 2 arrays associated with each other. One holds the data and another labels it, called Index. 

### Declaring Series

In [3]:
s = pd.Series([12, -4, 7, 9])
print(s)

0    12
1    -4
2     7
3     9
dtype: int64


In [9]:
# To individually see two arrays that make up this data structure;
# we can call the two attributes as: index and values
print(s.values)
print(s.index)

[12 -4  7  9]
Index(['a', 'b', 'c', 'd'], dtype='object')


### Selecting Internal Elements

In [10]:
# select elements as ordinary numpy array specifying the key
s[2]

7

In [11]:
# or specify the label corresponding to the position of index
s['b']

-4

In [12]:
# select multiple terms like numpy array
s[0:2]

a    12
b    -4
dtype: int64

In [13]:
# select multiple with labels:
s[['b', 'c']]

b   -4
c    7
dtype: int64

### Assigning Values to Elements

In [14]:
s[1] = 0
print(s)

a    12
b     0
c     7
d     9
dtype: int64


In [15]:
# assign value with label
s['b'] = 1
print(s)

a    12
b     1
c     7
d     9
dtype: int64


### Defining Series from NumPy Array and Other Series

In [16]:
arr = np.array([1,2,3,4])
s3 = pd.Series(arr) # values are not copied, but are passed by reference.
print(s3)

0    1
1    2
2    3
3    4
dtype: int32


In [19]:
s4 = pd.Series(s)
print(s4)

a    12
b     1
c     7
d     9
dtype: int64


In [20]:
arr[2] = -2 # changes to array also changes the Series. 
print(s3)

0    1
1    2
2   -2
3    4
dtype: int32


### Filtering Values

In [22]:
s[s>8]

a    12
d     9
dtype: int64

### Operations and Mathematical Functions

In [24]:
# We can simply use Arithmetic expression like NumPy array
s/2

a    6.0
b    0.5
c    3.5
d    4.5
dtype: float64

In [26]:
# For NumPy functions we must specify the function referenced with np and 
# the instance of Series passed as argument
np.log(s)

a    2.484907
b    0.000000
c    1.945910
d    2.197225
dtype: float64

### Evaluating Values

In [27]:
serd = pd.Series([1,0, 2, 1, 2, 3], index=['white', 'white', 'blue', 'green', 'green', 'yellow'])
serd

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64

In [28]:
# See values in Series excluding duplicates:
serd.unique()

array([1, 0, 2, 3], dtype=int64)

In [30]:
# value_coun ts() returns the unique values and calculates number of their occurrences
serd.value_counts()

1    2
2    2
0    1
3    1
dtype: int64

In [31]:
# To check if any specific value is present in a Series
serd.isin([0, 3])

white     False
white      True
blue      False
green     False
green     False
yellow     True
dtype: bool

In [32]:
serd[serd.isin([0,3])]

white     0
yellow    3
dtype: int64

In [33]:
print(serd)

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64


### NaN Values

In [37]:
s2 = pd.Series([5, -3, np.NaN, 14])
print(s2)

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64


In [38]:
# To identify the indexs without values: use isnull() and notnull()
s2.isnull() # returns boolean

0    False
1    False
2     True
3    False
dtype: bool

In [40]:
s2.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [41]:
# we can use those boolean Series place inside a filtering to make a condition
s2[s2.notnull()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [42]:
s2[s2.isnull()]

2   NaN
dtype: float64

### Series as Dictionaries

In [45]:
# Series can also be viewed as dictionary. We can create a Series from Dictionary
mydict = {'red':2000,
          'blue': 1000,
          'yellow': 500,
          'orange': 1000}

myseries = pd.Series(mydict)
print(myseries)

red       2000
blue      1000
yellow     500
orange    1000
dtype: int64


In [48]:
# array index filled with value keys.
# we can also define the array indexes separately. In this case if there's mismatch
# pandas will add value NaN
colors = ['red', 'yellow', 'white','orange', 'blue', 'green']
myseries = pd.Series(mydict, index=colors)
print(myseries)

red       2000.0
yellow     500.0
white        NaN
orange    1000.0
blue      1000.0
green        NaN
dtype: float64


### Operations between Series

In [49]:
mydict2 = {'red':400, 'yellow':1000, 'black':700}
myseries2 = pd.Series(mydict2)
myseries + myseries2

black        NaN
blue         NaN
green        NaN
orange       NaN
red       2400.0
white        NaN
yellow    1500.0
dtype: float64

We get new object Series in which only the items with the same labels are added. Other label present in one of hte two series are still added but have a NaN value. 

# DataFrame

### Defining DataFrame

In [50]:
# Pass a dictionary object to DataFrame()
data = {'color':['blue', 'green', 'yellow', 'red', 'white'],
        'object': ['ball', 'pen', 'pencil', 'paper', 'mug'],
        'price': [1.2, 1.0, 0.6, 0.9, 1.7]}

In [52]:
frame = pd.DataFrame(data)
print(frame)

    color  object  price
0    blue    ball    1.2
1   green     pen    1.0
2  yellow  pencil    0.6
3     red   paper    0.9
4   white     mug    1.7


In [55]:
# we can select just the required columns in specifying dataframe
frame2 = pd.DataFrame(data=data, columns=['object', 'price'])
frame2

Unnamed: 0,object,price
0,ball,1.2
1,pen,1.0
2,pencil,0.6
3,paper,0.9
4,mug,1.7


In [56]:
# We can choose our own index while defining dataframe instead of default numeric one
frame2 = pd.DataFrame(data, index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


In [57]:
frame3 = pd.DataFrame(np.arange(16).reshape((4,4)),
                      index=['red', 'blue', 'yellow', 'white'],
                      columns=['ball', 'pen', 'pencil', 'paper'])
frame3

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


### Selecting Elements

In [60]:
# Check name of columns
frame.columns

Index(['color', 'object', 'price'], dtype='object')

In [62]:
# check list of indices
frame3.index

Index(['red', 'blue', 'yellow', 'white'], dtype='object')

In [63]:
# get the entire set of data using values attribute
frame.values

array([['blue', 'ball', 1.2],
       ['green', 'pen', 1.0],
       ['yellow', 'pencil', 0.6],
       ['red', 'paper', 0.9],
       ['white', 'mug', 1.7]], dtype=object)

In [68]:
# select specific column/ columns
frame3[['pencil', 'pen']] 
# frame3['pen'] # select only 1 col

Unnamed: 0,pencil,pen
red,2,1
blue,6,5
yellow,10,9
white,14,13


In [73]:
frame.iloc[2] # use iloc to select rows within dataframe

color     yellow
object    pencil
price        0.6
Name: 2, dtype: object

In [74]:
# to select multiple rows:
frame.iloc[[2, 4]]

Unnamed: 0,color,object,price
2,yellow,pencil,0.6
4,white,mug,1.7


In [77]:
# select multiple rows range:
frame.iloc[2:5] # first to include, first to exclude

Unnamed: 0,color,object,price
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [79]:
# extract portion of dataframe
frame.iloc[0:1]

Unnamed: 0,color,object,price
0,blue,ball,1.2


In [80]:
# To find single value within a Dataframe: choose col first and then row
frame['object'][3]

'paper'

### Assigning Values

In [85]:
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [88]:
frame.index.name = 'id'; frame.columns.name = 'item'
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [89]:
# Add new column
frame['newcol1'] = 12
frame

item,color,object,price,newcol1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,12
1,green,pen,1.0,12
2,yellow,pencil,0.6,12
3,red,paper,0.9,12
4,white,mug,1.7,12


In [90]:
frame['newcol2'] = [12, 23, 21, 24, 15]
frame

item,color,object,price,newcol1,newcol2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,blue,ball,1.2,12,12
1,green,pen,1.0,12,23
2,yellow,pencil,0.6,12,21
3,red,paper,0.9,12,24
4,white,mug,1.7,12,15


In [92]:
# update the entire column
ser = pd.Series(np.arange(5))
ser

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [100]:
frame['new'] = ser
frame

item,color,object,price,newcol1,newcol2,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,blue,ball,1.2,12,12,0
1,green,pen,1.0,12,23,1
2,yellow,pencil,3.3,12,21,2
3,red,paper,0.9,12,24,3
4,white,mug,1.7,12,15,4


In [95]:
# To change single value
frame['price'][2] = 3.3
frame

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame['price'][2] = 3.3


item,color,object,price,newcol1,newcol2,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,blue,ball,1.2,12,12,0
1,green,pen,1.0,12,23,1
2,yellow,pencil,3.3,12,21,2
3,red,paper,0.9,12,24,3
4,white,mug,1.7,12,15,4


### Membership of Value

In [96]:
frame.isin([1.0, 'pen'])

item,color,object,price,newcol1,newcol2,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,False,False,False,False,False,False
1,False,True,True,False,False,True
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [97]:
# pass the boolean value returned as condition
frame[frame.isin([1.0, 'pen'])] # all other values will be printed NaN

item,color,object,price,newcol1,newcol2,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,,,,,,
1,,pen,1.0,,,1.0
2,,,,,,
3,,,,,,
4,,,,,,


### Deleting a Column

In [102]:
del frame['new']
frame

item,color,object,price,newcol1,newcol2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,blue,ball,1.2,12,12
1,green,pen,1.0,12,23
2,yellow,pencil,3.3,12,21
3,red,paper,0.9,12,24
4,white,mug,1.7,12,15


### Filtering

### DataFrame from Nested dict

### Transposition of a DataFrame