# Pandas
Pandas is a Python library built on top of NumPy, used for data manipulation and analysis.
## Pandas series vs pandas dataframes
A pandas Series is a 1D array-like object that can hold many data types.
Pandas Series are mutable.

In [1]:
import pandas as pd

In [2]:
# Build a label-indexed Series directly from a label -> value mapping
groceries = pd.Series({'eggs': 30, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [3]:
# Report the Series' shape tuple, its number of dimensions and its element count.
print("Groceries has shape:", groceries.shape)
print("Groceries has dimension:", groceries.ndim)
print("Groceries has a total of:", groceries.size, "elements")

Groceries has shape: (4,)
Groceries has dimension: 1
Groceries has a total of: 4 elements


In [4]:
# `in` on a Series tests the index labels, not the stored values
'bananas' in groceries.index

False

In [5]:
# 'bread' is one of the index labels, so the membership test succeeds
'bread' in groceries.index

True

## Accessing and deleting elements in Pandas Series

In [6]:
# look up a single element by its index label
groceries.loc['eggs']

30

In [7]:
# a list of labels selects several elements and returns a Series
groceries.loc[['eggs', 'milk']]

eggs     30
milk    Yes
dtype: object

In [8]:
# Positional access goes through .iloc: plain [] with an integer on a
# label-indexed Series relies on a deprecated positional fallback.
groceries.iloc[0]

30

In [9]:
# Positional access goes through .iloc: plain [] with an integer on a
# label-indexed Series relies on a deprecated positional fallback.
groceries.iloc[3]

'No'

In [10]:
# Series are mutable: overwrite the value stored under the 'eggs' label
groceries.loc['eggs'] = 12
groceries

eggs       12
apples      6
milk      Yes
bread      No
dtype: object

In [11]:
# drop returns a new Series without 'apples'; groceries itself is left unchanged
groceries.drop(labels = 'apples')

eggs      12
milk     Yes
bread     No
dtype: object

In [12]:
# show groceries again: 'apples' is still listed after the non-inplace drop
groceries

eggs       12
apples      6
milk      Yes
bread      No
dtype: object

In [13]:
# inplace = True removes the element from groceries itself instead of returning a copy
groceries.drop(labels = 'apples', inplace = True)
groceries

eggs      12
milk     Yes
bread     No
dtype: object

# Arithmetic operations on Pandas Series

In [14]:
# quantities per fruit, labelled by fruit name
fruits = pd.Series(data = [10, 6, 3], index = ['apples', 'oranges', 'bananas'])
fruits

apples     10
oranges     6
bananas     3
dtype: int64

In [15]:
# a scalar operand is applied to every element; the index labels are kept
print(fruits + 2)


apples     12
oranges     8
bananas     5
dtype: int64


In [16]:
# element-wise subtraction of a scalar
print(fruits - 2)

apples     8
oranges    4
bananas    1
dtype: int64


In [17]:
# element-wise multiplication by a scalar
print(fruits * 2)

apples     20
oranges    12
bananas     6
dtype: int64


In [18]:
# NOTE(review): this repeats the subtraction from In [16] verbatim; a division
# example (fruits / 2) was presumably intended here - confirm and re-run the cell.
print(fruits - 2)

apples     8
oranges    4
bananas    1
dtype: int64


In [19]:
# NumPy functions apply element-wise to a Series and keep its index labels.
# NOTE(review): this import would conventionally sit next to the pandas import in the first cell.
import numpy as np
np.sqrt(fruits)

apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64

In [20]:
# square every element with NumPy
np.power(fruits, 2)

apples     100
oranges     36
bananas      9
dtype: int64

In [21]:
# e raised to each element; the result is a float Series
np.exp(fruits)

apples     22026.465795
oranges      403.428793
bananas       20.085537
dtype: float64

In [22]:
# arithmetic can be applied to a selected subset of labels only
fruits.loc[['bananas']] + 2

bananas    5
dtype: int64

# Pandas dataframes
A pandas DataFrame is a 2D object with labelled rows and columns, which can also hold multiple data types.

In [23]:
# One Series per shopper, keyed by shopper name; each Series index holds the item labels
items = dict(
    Bob = pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    Alice = pd.Series({'book': 40, 'glasses': 10, 'bike': 500, 'pants': 45}),
)
type(items)

dict

In [24]:
# dict keys become the columns; the union of the Series labels becomes the row index
shopping_carts = pd.DataFrame(data = items)
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,10.0
pants,25.0,45.0
watch,55.0,


In [25]:
# Series built without labels get the default integer index 0, 1, 2, ...
data = dict(
    Bob = pd.Series([245, 25, 55]),
    Alice = pd.Series([40, 110, 500, 45]),
)
df = pd.DataFrame(data = data)
df

Unnamed: 0,Bob,Alice
0,245.0,40
1,25.0,110
2,55.0,500
3,,45


In [26]:
# keep only the 'Bob' column while constructing the DataFrame
bob_shopping_cart = pd.DataFrame(data = items, columns = ['Bob'])
bob_shopping_cart

Unnamed: 0,Bob
bike,245
pants,25
watch,55


In [27]:
# keep only the rows labelled 'pants' and 'book', in that order
sel_shopping_cart = pd.DataFrame(data = items, index = ['pants', 'book'])
sel_shopping_cart

Unnamed: 0,Bob,Alice
pants,25.0,45
book,,40


In [28]:
# (number of rows, number of columns)
shopping_carts.shape

(5, 2)

In [29]:
# Restrict both the rows (index) and the columns while building the DataFrame.
# NOTE(review): the variable is named alice_shopping_cart but columns = ['Bob']
# selects Bob's items - confirm whether ['Alice'] was intended.
alice_shopping_cart = pd.DataFrame(items, index = ['pants', 'book'], columns = ['Bob'])
alice_shopping_cart

Unnamed: 0,Bob
pants,25.0
book,


In [30]:
# Build a DataFrame from plain lists and attach explicit row labels
data = dict(Integers = [1, 2, 3], Floats = [2.4, 9.0, 5.6])
df = pd.DataFrame(data = data, index = ['Label 1', 'Label 2', 'Label 3'])
df

Unnamed: 0,Integers,Floats
Label 1,1,2.4
Label 2,2,9.0
Label 3,3,5.6


# Accessing elements in Pandas dataframes

In [31]:
# One dict per store; keys missing from a dict show up as NaN in that store's row
items = [
    {'bikes': 20, 'pants': 30, 'watches': 35},
    {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants': 5},
]
store_items = pd.DataFrame(data = items, index = ['store 1', 'store 2'])
store_items

Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,
store 2,15,5,10,50.0


In [32]:
# select one column (all rows) as a Series
store_items.loc[:, 'bikes']

store 1    20
store 2    15
Name: bikes, dtype: int64

In [33]:
# single cell lookup by row label and column label in one .loc call
store_items.loc['store 2', 'bikes']

15

The main difference between `loc` and `iloc` is that `loc` selects rows (and/or columns) by label, while `iloc` selects rows (and/or columns) by integer position.

In [34]:
# a list of row labels returns the selection as a DataFrame
store_items.loc[['store 1'], :]

Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,


In [35]:
# a single row label returns that row as a Series indexed by the column names
store_items.loc['store 1', :]

bikes      20.0
pants      30.0
watches    35.0
glasses     NaN
Name: store 1, dtype: float64

In [36]:
# assigning a list to a new column label adds that column (one value per existing row)
store_items['shirts'] = [15, 8]
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts
store 1,20,30,35,,15
store 2,15,5,10,50.0,8


In [37]:
# a new column can be derived from existing ones with row-wise arithmetic
store_items['suits'] = store_items['shirts'].add(store_items['pants'])
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15,45
store 2,15,5,10,50.0,8,13


In [38]:
# a one-row DataFrame describing the store that will be added
new_items = [
    {'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4},
]
new_store = pd.DataFrame(data = new_items, index = ['store 3'])
new_store

Unnamed: 0,bikes,pants,watches,glasses
store 3,20,30,35,4


In [39]:
# Add the rows of new_store below store_items. DataFrame.append was deprecated
# and later removed from pandas, so pd.concat is used to stack the two frames;
# columns missing from new_store are filled with NaN.
store_items = pd.concat([store_items, new_store])
store_items

  store_items = store_items.append(new_store)


Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15.0,45.0
store 2,15,5,10,50.0,8.0,13.0
store 3,20,30,35,4.0,,


In [40]:
# store_items already holds 'store 3' (added in the previous cell), so concatenating
# new_store again lists that row twice; the result is only printed, not assigned.
print(pd.concat([store_items, new_store]))

         bikes  pants  watches  glasses  shirts  suits
store 1     20     30       35      NaN    15.0   45.0
store 2     15      5       10     50.0     8.0   13.0
store 3     20     30       35      4.0     NaN    NaN
store 3     20     30       35      4.0     NaN    NaN


In [41]:
# take every row but the first by position; assignment aligns on the row labels,
# so the skipped row ends up as NaN in the new column
store_items['new_watches'] = store_items['watches'].iloc[1:]
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits,new_watches
store 1,20,30,35,,15.0,45.0,
store 2,15,5,10,50.0,8.0,13.0,10.0
store 3,20,30,35,4.0,,,35.0


In [42]:
# insert places the new column at a chosen position instead of at the end
store_items.insert(loc = 5, column = 'shoes', value = [2, 5, 0])
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,shoes,suits,new_watches
store 1,20,30,35,,15.0,2,45.0,
store 2,15,5,10,50.0,8.0,5,13.0,10.0
store 3,20,30,35,4.0,,0,,35.0


'pop' deletes a single column, while 'drop' can delete either rows or columns, selected with the 'axis' keyword (0 for rows, 1 for columns).

In [43]:
# add a 'socks' column at position 4; it is removed again with drop further down
store_items.insert(loc = 4, column = 'socks', value = [13, 9, 1])
store_items

Unnamed: 0,bikes,pants,watches,glasses,socks,shirts,shoes,suits,new_watches
store 1,20,30,35,,13,15.0,2,45.0,
store 2,15,5,10,50.0,9,8.0,5,13.0,10.0
store 3,20,30,35,4.0,1,,0,,35.0


In [44]:
# pop removes the named column from store_items in place
store_items.pop(item = 'new_watches')
store_items

Unnamed: 0,bikes,pants,watches,glasses,socks,shirts,shoes,suits
store 1,20,30,35,,13,15.0,2,45.0
store 2,15,5,10,50.0,9,8.0,5,13.0
store 3,20,30,35,4.0,1,,0,


In [45]:
# drop along the column axis; the result is assigned back to keep the change
store_items = store_items.drop(labels = ['socks'], axis = 'columns')
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,shoes,suits
store 1,20,30,35,,15.0,2,45.0
store 2,15,5,10,50.0,8.0,5,13.0
store 3,20,30,35,4.0,,0,


In [46]:
# drop along the row axis; the result is assigned back to keep the change
store_items = store_items.drop(labels = ['store 3'], axis = 'index')
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,shoes,suits
store 1,20,30,35,,15.0,2,45.0
store 2,15,5,10,50.0,8.0,5,13.0


In [47]:
# to rename a DataFrame row/column, pass a dict mapping old name -> new name
store_items = store_items.rename({'shoes': 'hats'}, axis = 'columns')
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,hats,suits
store 1,20,30,35,,15.0,2,45.0
store 2,15,5,10,50.0,8.0,5,13.0


In [48]:
# the same mapping approach renames row labels when applied along the index axis
store_items = store_items.rename({'store 2': 'last store'}, axis = 'index')
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,hats,suits
store 1,20,30,35,,15.0,2,45.0
last store,15,5,10,50.0,8.0,5,13.0


# Dealing with NaN
NaN stands for "not a number" and refers to missing values in our dataset.

In [95]:
# Three stores with partially overlapping stock; an item absent from a store becomes NaN.
items = [
    {'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
    {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
    {'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10},
]
# build the frame and order the columns alphabetically in one chain
store_items = pd.DataFrame(
    data = items,
    index = ['store 1', 'store 2', 'store 3'],
).sort_index(axis = 'columns')
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20,,30,15.0,8,45.0,35
store 2,15,50.0,5,2.0,5,7.0,10
store 3,20,4.0,30,,10,,35


In [96]:
# boolean mask: True wherever a value is missing
x = store_items.isna()
print(x)

         bikes  glasses  pants  shirts  shoes  suits  watches
store 1  False     True  False   False  False  False    False
store 2  False    False  False   False  False  False    False
store 3  False    False  False    True  False   True    False


In [97]:
# summing the mask counts the missing values in each column
x = store_items.isna().sum()
print(x)

bikes      0
glasses    1
pants      0
shirts     1
shoes      0
suits      1
watches    0
dtype: int64


In [98]:
# summing the per-column counts gives the total number of missing values
x = store_items.isna().sum().sum()
print(x)

3


Use dropna to drop rows/columns with NaN values by specifying the axis (0 for rows, 1 for columns)

In [99]:
# drop every row that contains at least one NaN
store_items.dropna(axis = 'index')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 2,15,50.0,5,2.0,5,7.0,10


In [100]:
# drop every column that contains at least one NaN
store_items.dropna(axis = 'columns')

Unnamed: 0,bikes,pants,shoes,watches
store 1,20,30,8,35
store 2,15,5,5,10
store 3,20,30,10,35


This however does not modify the original dataframe. To do so, make use of the keyword "inplace" (inplace = True)

In [101]:
# replace every NaN with zero; a new DataFrame is returned
store_items.fillna(value = 0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20,0.0,30,15.0,8,45.0,35
store 2,15,50.0,5,2.0,5,7.0,10
store 3,20,4.0,30,0.0,10,0.0,35


In [102]:
# store_items still contains its NaN values: fillna above returned a copy
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20,,30,15.0,8,45.0,35
store 2,15,50.0,5,2.0,5,7.0,10
store 3,20,4.0,30,,10,,35


We can also use 'ffill' or 'backfill' to replace NaN values along either rows or columns.

ffill stands for forward fill, which replaces each NaN with the last valid value before it.

backfill stands for backward fill, which replaces each NaN with the next valid value after it.

We can also use linear interpolation, by making use of the interpolate method, to replace the NaN values.

All the methods replace the NaN value without modifying the original dataframe.

In [104]:
# Forward fill down the rows: each NaN takes the last valid value above it in
# the same column; a NaN in the first row has nothing above it and stays NaN.
# DataFrame.ffill is used because fillna(method = 'ffill') is deprecated.
store_items.ffill(axis = 0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20,,30,15.0,8,45.0,35
store 2,15,50.0,5,2.0,5,7.0,10
store 3,20,4.0,30,2.0,10,7.0,35


In [105]:
# Forward fill across the columns: each NaN takes the last valid value to its
# left in the same row.
# DataFrame.ffill is used because fillna(method = 'ffill') is deprecated.
store_items.ffill(axis = 1)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20.0,20.0,30.0,15.0,8.0,45.0,35.0
store 2,15.0,50.0,5.0,2.0,5.0,7.0,10.0
store 3,20.0,4.0,30.0,30.0,10.0,10.0,35.0


In [110]:
# Backward fill up the rows: each NaN takes the next valid value below it in
# the same column; a NaN in the last row has nothing below it and stays NaN.
# DataFrame.bfill is used because fillna(method = 'backfill') is deprecated.
store_items.bfill(axis = 0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20,50.0,30,15.0,8,45.0,35
store 2,15,50.0,5,2.0,5,7.0,10
store 3,20,4.0,30,,10,,35


In [111]:
# Backward fill across the columns: each NaN takes the next valid value to its
# right in the same row.
# DataFrame.bfill is used because fillna(method = 'backfill') is deprecated.
store_items.bfill(axis = 1)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20.0,30.0,30.0,15.0,8.0,45.0,35.0
store 2,15.0,50.0,5.0,2.0,5.0,7.0,10.0
store 3,20.0,4.0,30.0,10.0,10.0,35.0,35.0


In [112]:
# the fill calls above returned copies; store_items still has its NaN values
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20,,30,15.0,8,45.0,35
store 2,15,50.0,5,2.0,5,7.0,10
store 3,20,4.0,30,,10,,35


In [113]:
# linear interpolation down each column, using the values in neighbouring rows
store_items.interpolate(method = "linear", axis = 'index')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20,,30,15.0,8,45.0,35
store 2,15,50.0,5,2.0,5,7.0,10
store 3,20,4.0,30,2.0,10,7.0,35


In [114]:
# interpolate also returned a copy; store_items is unchanged
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store 1,20,,30,15.0,8,45.0,35
store 2,15,50.0,5,2.0,5,7.0,10
store 3,20,4.0,30,,10,,35
