# Pandas
* Pandas is used to manipulate data
* Pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive.
* It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python.

In [1]:
import pandas as pd

#### '.read_csv()' reads a CSV file and returns a dataframe, which we store in a variable

In [2]:
# NOTE(review): the output below shows column labels that look like a data
# record (5.1, 3.5, 1.4, 0.2, Iris-setosa), so iris.csv apparently has no header
# line and read_csv used its first record as the header. If so, passing
# header=None (plus names=[...]) would keep that record as data -- confirm.
iris = pd.read_csv('iris.csv')
iris

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
print(type(iris))

<class 'pandas.core.frame.DataFrame'>


# Accessing Data

In [48]:
df = iris.copy()

In [49]:
# .head() returns the first 5 rows by default; pass a number to change that
# (here we ask for the first 10 rows)
df.head(10)

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa
7,4.4,2.9,1.4,0.2,Iris-setosa
8,4.9,3.1,1.5,0.1,Iris-setosa
9,5.4,3.7,1.5,0.2,Iris-setosa


In [50]:
print(df.shape)

(149, 5)


In [51]:
# for each column it'll show the data type
print(df.dtypes)

5.1            float64
3.5            float64
1.4            float64
0.2            float64
Iris-setosa     object
dtype: object


### Changing columns headers

In [52]:
df.columns

Index(['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], dtype='object')

In [53]:
# sl: sepal length
# sw: sepal width
# pl: petal length
# pw: petal width
df.columns = ['sl','sw','pl','pw','flower_type']

In [54]:
df

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [55]:
df.dtypes

sl             float64
sw             float64
pl             float64
pw             float64
flower_type     object
dtype: object

In [56]:
df.describe()

Unnamed: 0,sl,sw,pl,pw
count,149.0,149.0,149.0,149.0
mean,5.848322,3.051007,3.774497,1.205369
std,0.828594,0.433499,1.759651,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [57]:
# Select a single column by its label; this returns a Series
df['sl']

0      4.9
1      4.7
2      4.6
3      5.0
4      5.4
      ... 
144    6.7
145    6.3
146    6.5
147    6.2
148    5.9
Name: sl, Length: 149, dtype: float64

In [58]:
df.isnull()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
144,False,False,False,False,False
145,False,False,False,False,False
146,False,False,False,False,False
147,False,False,False,False,False


In [59]:
df.isnull().sum()

sl             0
sw             0
pl             0
pw             0
flower_type    0
dtype: int64

In [60]:
# .isna() flags missing (NaN) values in the data, one boolean per cell;
# it is an alias of .isnull() used above
df.isna()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
144,False,False,False,False,False
145,False,False,False,False,False
146,False,False,False,False,False
147,False,False,False,False,False


## .iloc[] 
* It'll access the data based on the integer position (0-based) of the row and column, not their labels

In [61]:
df.iloc[:5,2:]

Unnamed: 0,pl,pw,flower_type
0,1.4,0.2,Iris-setosa
1,1.3,0.2,Iris-setosa
2,1.5,0.2,Iris-setosa
3,1.4,0.2,Iris-setosa
4,1.7,0.4,Iris-setosa


In [62]:
df.iloc[5,2]

1.4

## .loc[] 
* It'll access the data based on the row and column labels, not their positions

In [63]:
df.loc[5]

sl                     4.6
sw                     3.4
pl                     1.4
pw                     0.3
flower_type    Iris-setosa
Name: 5, dtype: object

# Manipulate Data

In [64]:
df

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


### Drop a row
* '.drop()' returns a copy of the original dataframe without the specified row; the original dataframe is left unchanged
* '.drop(0, inplace=True)': the 'inplace' parameter changes the original dataframe when set to True; by default it is False. The 0 here is the **row label** that needs to be dropped
* **row label** and **row index position** are different things

In [65]:
a = df.drop(0)
a.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa


In [66]:
df.drop(0, inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa


In [67]:
# This raises a KeyError because the row with label 0 was already removed
# by the previous in-place drop
df.drop(0, inplace=True)
df.head()

KeyError: '[0] not found in axis'

In [68]:
df.drop(3, inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa


In [69]:
df.columns

Index(['sl', 'sw', 'pl', 'pw', 'flower_type'], dtype='object')

In [70]:
# Labels 0 and 3 are no longer in the index because those rows were dropped
df.index

Int64Index([  1,   2,   4,   5,   6,   7,   8,   9,  10,  11,
            ...
            139, 140, 141, 142, 143, 144, 145, 146, 147, 148],
           dtype='int64', length=147)

* Indexing 'df.index' returns the **label** stored at a particular **index position**

In [71]:
df.index[0]

1

In [72]:
df.index[3]

5

In [73]:
# df.index[0] looks up the label stored at position 0, so this drops the row
# at that index position rather than the row whose label is 0
df.drop(df.index[0],inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
2,4.6,3.1,1.5,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa
7,4.4,2.9,1.4,0.2,Iris-setosa


In [74]:
# Intended to remove more than one row at once.
# NOTE: df.index[0,2] is a 2-D style lookup on a 1-D index, which is why this
# cell raises the IndexError shown below. To pick positions 0 and 2, index with
# a list instead: df.drop(df.index[[0, 2]], inplace=True)
df.drop(df.index[0,2],inplace=True)
df.head()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [75]:
bool_series1 = df['sl'] > 5
bool_series1

2      False
4       True
5      False
6      False
7      False
       ...  
144     True
145     True
146     True
147     True
148     True
Name: sl, Length: 146, dtype: bool

In [76]:
df[bool_series1]

Unnamed: 0,sl,sw,pl,pw,flower_type
4,5.4,3.9,1.7,0.4,Iris-setosa
9,5.4,3.7,1.5,0.2,Iris-setosa
13,5.8,4.0,1.2,0.2,Iris-setosa
14,5.7,4.4,1.5,0.4,Iris-setosa
15,5.4,3.9,1.3,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [77]:
bool_series2 = df['flower_type'] == 'Iris-setosa'
bool_series2

2       True
4       True
5       True
6       True
7       True
       ...  
144    False
145    False
146    False
147    False
148    False
Name: flower_type, Length: 146, dtype: bool

In [78]:
df[bool_series2]

Unnamed: 0,sl,sw,pl,pw,flower_type
2,4.6,3.1,1.5,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa
7,4.4,2.9,1.4,0.2,Iris-setosa
8,4.9,3.1,1.5,0.1,Iris-setosa
9,5.4,3.7,1.5,0.2,Iris-setosa
10,4.8,3.4,1.6,0.2,Iris-setosa
11,4.8,3.0,1.4,0.1,Iris-setosa
12,4.3,3.0,1.1,0.1,Iris-setosa


In [79]:
c1 = df[df['flower_type'] == 'Iris-setosa'].describe()
c1

Unnamed: 0,sl,sw,pl,pw
count,46.0,46.0,46.0,46.0
mean,5.013043,3.426087,1.471739,0.247826
std,0.364301,0.39012,0.178466,0.111034
min,4.3,2.3,1.0,0.1
25%,4.8,3.125,1.4,0.2
50%,5.0,3.4,1.5,0.2
75%,5.2,3.7,1.6,0.3
max,5.8,4.4,1.9,0.6


### Add a row

In [80]:
print(df.head())
print(df.iloc[0])
print(df.loc[2])

    sl   sw   pl   pw  flower_type
2  4.6  3.1  1.5  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
5  4.6  3.4  1.4  0.3  Iris-setosa
6  5.0  3.4  1.5  0.2  Iris-setosa
7  4.4  2.9  1.4  0.2  Iris-setosa
sl                     4.6
sw                     3.1
pl                     1.5
pw                     0.2
flower_type    Iris-setosa
Name: 2, dtype: object
sl                     4.6
sw                     3.1
pl                     1.5
pw                     0.2
flower_type    Iris-setosa
Name: 2, dtype: object


In [81]:
# Assigning to a label that is not in the index (149) appends a new row with
# that label; the values are given in column order (sl, sw, pl, pw, flower_type).
# The new row is added at the end, so it shows up in .tail() rather than .head()
df.loc[149] = [1,2,3,4,'Iris-setosa']
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
2,4.6,3.1,1.5,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa
7,4.4,2.9,1.4,0.2,Iris-setosa


In [82]:
df.tail()

Unnamed: 0,sl,sw,pl,pw,flower_type
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica
148,5.9,3.0,5.1,1.8,Iris-virginica
149,1.0,2.0,3.0,4.0,Iris-setosa


In [83]:
iris = pd.read_csv('iris.csv')
print(iris.iloc[-3:])
print(iris.tail(3))

     5.1  3.5  1.4  0.2     Iris-setosa
146  6.5  3.0  5.2  2.0  Iris-virginica
147  6.2  3.4  5.4  2.3  Iris-virginica
148  5.9  3.0  5.1  1.8  Iris-virginica
     5.1  3.5  1.4  0.2     Iris-setosa
146  6.5  3.0  5.2  2.0  Iris-virginica
147  6.2  3.4  5.4  2.3  Iris-virginica
148  5.9  3.0  5.1  1.8  Iris-virginica


#### reset_index()
* It resets the row labels to run from 0 up to the last row again; this is useful because deleting rows in between leaves gaps in the labels
* By default 'reset_index()' adds the previous index as a new column named 'index'; we don't want that here, so we pass 'drop=True'

In [84]:
df.reset_index()

Unnamed: 0,index,sl,sw,pl,pw,flower_type
0,2,4.6,3.1,1.5,0.2,Iris-setosa
1,4,5.4,3.9,1.7,0.4,Iris-setosa
2,5,4.6,3.4,1.4,0.3,Iris-setosa
3,6,5.0,3.4,1.5,0.2,Iris-setosa
4,7,4.4,2.9,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
142,145,6.3,2.5,5.0,1.9,Iris-virginica
143,146,6.5,3.0,5.2,2.0,Iris-virginica
144,147,6.2,3.4,5.4,2.3,Iris-virginica
145,148,5.9,3.0,5.1,1.8,Iris-virginica


In [85]:
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.6,3.1,1.5,0.2,Iris-setosa
1,5.4,3.9,1.7,0.4,Iris-setosa
2,4.6,3.4,1.4,0.3,Iris-setosa
3,5.0,3.4,1.5,0.2,Iris-setosa
4,4.4,2.9,1.4,0.2,Iris-setosa
...,...,...,...,...,...
142,6.3,2.5,5.0,1.9,Iris-virginica
143,6.5,3.0,5.2,2.0,Iris-virginica
144,6.2,3.4,5.4,2.3,Iris-virginica
145,5.9,3.0,5.1,1.8,Iris-virginica


### Drop a column

In [86]:
df.drop('sl',axis=1,inplace=True)

In [87]:
df

Unnamed: 0,sw,pl,pw,flower_type
0,3.1,1.5,0.2,Iris-setosa
1,3.9,1.7,0.4,Iris-setosa
2,3.4,1.4,0.3,Iris-setosa
3,3.4,1.5,0.2,Iris-setosa
4,2.9,1.4,0.2,Iris-setosa
...,...,...,...,...
142,2.5,5.0,1.9,Iris-virginica
143,3.0,5.2,2.0,Iris-virginica
144,3.4,5.4,2.3,Iris-virginica
145,3.0,5.1,1.8,Iris-virginica


In [88]:
del df['sw'] # removes the column from df itself, in place (no copy is made)
df

Unnamed: 0,pl,pw,flower_type
0,1.5,0.2,Iris-setosa
1,1.7,0.4,Iris-setosa
2,1.4,0.3,Iris-setosa
3,1.5,0.2,Iris-setosa
4,1.4,0.2,Iris-setosa
...,...,...,...
142,5.0,1.9,Iris-virginica
143,5.2,2.0,Iris-virginica
144,5.4,2.3,Iris-virginica
145,5.1,1.8,Iris-virginica


In [89]:
df =iris.copy()
df.columns = ['sl','sw','pl','pw','flower_type']
df

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


### Add a column

In [90]:
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [93]:
df['diff_pl_pw'] = df['pl']-df['pw']
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1
2,4.6,3.1,1.5,0.2,Iris-setosa,1.3
3,5.0,3.6,1.4,0.2,Iris-setosa,1.2
4,5.4,3.9,1.7,0.4,Iris-setosa,1.3
