In [1]:
import pandas as pd

# Pandas - DataStrucutures
- __Series__ - 1 DImension homogeneous array, sizeimmutable.
- __Data Frame__ - 2 Dimension size-mutable tabular structure with potentially heterogeneously typed columns.
- __Panel__ - 3 Dimension size-mutable array.
- DataFrame is a container of Series, Panel is a container of DataFrame.
- All Pandas data structures are value mutable (can be changed) and except Series all are size mutable. Series is size immutable.

# Pandas - Series

### pandas.Series( data, index, dtype, copy)
- __data__ - takes various forms like ndarray, list, dictionary,constants
- __index__ - Index values must be unique and hashable, same length as data. Default np.arrange(n) if no index is passed.
- __dtype__ - is for data type. If None, data type will be inferred
- __copy__ - Copy data. Default False

In [3]:
#Creating Empty Series
s = pd.Series()
print(s)

Series([], dtype: float64)


### Create a Series from ndarray
- If data is an ndarray, then index passed must be of the same length. If no index is passed, then by default index will be __range(n)__ where __n__ is array length, i.e., __[0,1,2,3…. range(len(array))-1]__.

In [4]:
import numpy as np
array = np.arange(9)

s = pd.Series(array)
print(s)

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
dtype: int32


In [7]:
# Lets Specify Index
import numpy as np
array = np.arange(9)

s = pd.Series(array,np.arange(10,19))
print(s)

# We passed the index values here. Now we can see the customized indexed values in the output.

10    0
11    1
12    2
13    3
14    4
15    5
16    6
17    7
18    8
dtype: int32


### Create a Series from dict
- A __dict__ can be passed as input and if no index is specified, then the dictionary keys are taken in a sorted order to construct index. If __index__ is passed, the values in data corresponding to the labels in the index will be pulled out.

In [9]:
data = {'a' : 0., 'b' : 1., 'c' : 2.}

s = pd.Series(data)

print(s)
#Observe − Dictionary keys are used to construct index.

a    0.0
b    1.0
c    2.0
dtype: float64


In [11]:
data = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(data,index=['b','c','d','a'])
print(s)

#Observe − Index order is persisted and the missing element is filled with NaN (Not a Number).

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64


### Create a Series from Scalar
- If data is a scalar value, an index must be provided. The value will be repeated to match the length of __index__

In [12]:
s = pd.Series(5, index=[0, 1, 2, 3])
print(s)

0    5
1    5
2    5
3    5
dtype: int64


### Accessing Data from Series with Position, Slicing, Index(Label)

In [17]:
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])
print(s)
print('\n')

print(s[3]) # Accessinge Data with Position
print('\n')

print(s['b']) # Accessing Data with Label
print('\n')

print(s[3:]) # Access from the 3rd index
print('\n')

print(s[:3]) # Access up to last 3 indexes
print('\n')

print(s[-3:])# Access last 3 indexes

a    1
b    2
c    3
d    4
e    5
dtype: int64


4


2


d    4
e    5
dtype: int64


a    1
b    2
c    3
dtype: int64


c    3
d    4
e    5
dtype: int64


# Pandas - DataFrame

### pandas.DataFrame( data, index, columns, dtype, copy)
- __data__ - takes various forms like __ndarray__, __series__, __map__, __lists__, __dict__, __constants__ and also another __DataFrame__
- __index__ - For the row labels, the Index to be used for the resulting frame is Optional Default _np.arrange(n)_ if no index is passed.
- __columns__ - For column labels, the optional default syntax is - _np.arrange(n)_. This is only true if no index is passed.
- __dtype__ - Data type of each column.
- __copy__ - This command (or whatever it is) is used for copying of data, if the default is False.

In [18]:
# Creating Empty DataFrame

df = pd.DataFrame()

print(df)

Empty DataFrame
Columns: []
Index: []


### Creating a DataFrame from __Lists__
- The DataFrame can be created using a single list or a list of lists.

In [19]:
data = [10,11,12,13,14]

df = pd.DataFrame(data)

print(df)

    0
0  10
1  11
2  12
3  13
4  14


In [24]:
data = [['Alex',10],['Bob',12],['Clarke',13]]

df = pd.DataFrame(data)
print(df)
print('\n')

df_new = pd.DataFrame(data, columns=['Name','Age'], dtype=float) #Creating column names
print(df_new)

        0   1
0    Alex  10
1     Bob  12
2  Clarke  13


     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0


### Create a DataFrame from Dict of ndarrays / Lists
- All the ndarrays must be of same length. If index is passed, then the length of the index should equal to the length of the arrays.<br>If no index is passed, then by default, index will be range(n), where n is the array length.

In [27]:
Name_List = ['Tom', 'Jack', 'Steve', 'Ricky']
Age_List = [28,34,29,42]
data_dict = {'Name': Name_List, 'Age':Age_List}

df = pd.DataFrame(data_dict)
print(df)

# The dictionary keys are by default taken as column names.

    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42


In [26]:
Name_List = ['Tom', 'Jack', 'Steve', 'Ricky']
Age_List = [28,34,29,42]
data_dict = {'Name': Name_List, 'Age':Age_List}

df = pd.DataFrame(data_dict, index=['rank1','rank2','rank3','rank4'])
print(df)

        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29
rank4  Ricky   42


### Create a DataFrame from List of Dicts
- List of Dictionaries can be passed as input data to create a DataFrame. The __dictionary keys__ are by default taken as __column names__.

In [28]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)

print(df)

   a   b     c
0  1   2   NaN
1  5  10  20.0


In [30]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]

#With two column indices, values same as dictionary keys
df1 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b'])

#With two column indices with one index with other name
df2 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b1'])

print(df1)
print('\n')
print(df2)

        a   b
first   1   2
second  5  10


        a  b1
first   1 NaN
second  5 NaN


### Create a DataFrame from Dict of Series
- Dictionary of Series can be passed to form a DataFrame. The resultant index is the union of all the series indexes passed.

In [33]:
data = {'Name': pd.Series(['Shri','Harsha'],index=['a', 'b']), 'Age': pd.Series([10,11,12],index=['a', 'b', 'c'])}

df = pd.DataFrame(data)
print(df)

     Name  Age
a    Shri   10
b  Harsha   11
c     NaN   12


### Column Selection

In [35]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)

print(df)
print('\n')

print(df['two'])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


a    1
b    2
c    3
d    4
Name: two, dtype: int64


### Column Addition

In [38]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print(df)
print('\n')

df['three'] = pd.Series([10,20,30], index=['a', 'b', 'c'])

print(df)
print('\n')

df['four'] = df['one']+df['three']
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


   one  two  three  four
a  1.0    1   10.0  11.0
b  2.0    2   20.0  22.0
c  3.0    3   30.0  33.0
d  NaN    4    NaN   NaN


### Column Deletion

In [41]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']), 
   'three' : pd.Series([10,20,30], index=['a','b','c'])}

df = pd.DataFrame(d)
print(df)
print('\n')

#Using del function
del(df['two'])
print(df)
print('\n')

# Using Pandas pop dataframe function

df.pop('three')
print(df)
print('\n')

   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


   one  three
a  1.0   10.0
b  2.0   20.0
c  3.0   30.0
d  NaN    NaN


   one
a  1.0
b  2.0
c  3.0
d  NaN




### Row Selection

In [6]:
# Selection by Label
data_dict = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(data_dict)
print(df)
print('\n')

print(df.loc['c'])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


one    3.0
two    3.0
Name: c, dtype: float64


In [7]:
data_dict = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(data_dict)
print(df)
print('\n')

print(df.iloc[3])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


one    NaN
two    4.0
Name: d, dtype: float64


### Slice Rows
- Slicing is possible only for rows, not for coulmns

In [8]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print(df)
print('\n')

print(df[2:3]) 

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


   one  two
c  3.0    3


### Addition of Rows
- Add new rows to a DataFrame using the __append__ function. This function will __append__ the rows at the end.

In [11]:
df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])
print(df)
print('\n')
df = df.append(df2) # Append the new dataframe
print(df)

   a  b
0  1  2
1  3  4


   a  b
0  1  2
1  3  4
0  5  6
1  7  8


### Deletion of Rows
- Use index label to delete or drop rows from a DataFrame. If label is duplicated, then multiple rows will be dropped.

In [2]:
df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])

df = df.append(df2)
print(df)
print('\n')

df =  df.drop(0)
print(df)

   a  b
0  1  2
1  3  4
0  5  6
1  7  8


   a  b
1  3  4
1  7  8


# Panel
The names for the 3 axes are intended to give some semantic meaning to describing operations involving panel data. They are:
- __items__ − axis 0, each item corresponds to a DataFrame contained inside.
- __major_axis__ − axis 1, it is the index (rows) of each of the DataFrames.
- __minor_axis__ − axis 2, it is the columns of each of the DataFrames.

### Creating the Panel
#### pandas.Panel(data, items, major_axis, minor_axis, dtype, copy)
- __data__ ----> Data takes various forms like ndarray, series, map, lists, dict, constants and also another DataFrame
- __items__ ----> axis=0
- __major_axis__ ----> axis=1
- __minor_axis__ ----> axis=2
- __dtype__ ---->	Data type of each column
- __copy__ ----> Copy data. Default, false

# Basic Functionality

### Series Basic Functionality

In [5]:
import numpy as np
s = pd.Series(np.random.randn(4))

In [7]:
print(s.axes) #Returns a list of the row axis labels

[RangeIndex(start=0, stop=4, step=1)]


In [9]:
print(s.empty) #Returns the Boolean value saying whether the Object is empty or not

False


In [11]:
print(s.ndim) #Returns the number of dimensions of the object. By definition, a Series is a 1D data structure, so it returns 1

1


In [15]:
print(s.size) #Returns the size(length) of the series
print('\n')
print(len(s))

4


4


In [18]:
print(s.values) #Returns the actual data in the series as an array

[-0.31488133  0.10406499  1.28126161  0.16232898]


In [20]:
print(s.head(2)) #head() returns the first n rows(observe the index values). The default number of elements to display is five, but you may pass a custom number.

0   -0.314881
1    0.104065
dtype: float64


In [22]:
print(s.tail(2)) #tail() returns the last n rows(observe the index values). The default number of elements to display is five, but you may pass a custom number.

2    1.281262
3    0.162329
dtype: float64


### DataFrame Basic Functionality

In [28]:
data = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Smith','Jack']),
   'Age':pd.Series([25,26,25,23,30,29,23]),
   'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8])}
df = pd.DataFrame(data)
print(df)

    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20
5  Smith   29    4.60
6   Jack   23    3.80


In [29]:
print(df.T) #Returns the transpose of the DataFrame. The rows and columns will interchange.

           0      1      2     3      4      5     6
Name     Tom  James  Ricky   Vin  Steve  Smith  Jack
Age       25     26     25    23     30     29    23
Rating  4.23   3.24   3.98  2.56    3.2    4.6   3.8


In [31]:
print(df.axes) #Returns the list of row axis labels and column axis labels.

[RangeIndex(start=0, stop=7, step=1), Index(['Name', 'Age', 'Rating'], dtype='object')]


In [38]:
print(df.dtypes) #Returns the data type of each column.

Name       object
Age         int64
Rating    float64
dtype: object


In [40]:
print(df.empty) #Returns the Boolean value saying whether the Object is empty or not; True indicates that the object is empty.

False


In [42]:
print(df.ndim) # Returns the number of dimensions of the object. By definition, DataFrame is a 2D object.

2


In [44]:
print(df.shape) #Returns a tuple representing the dimensionality of the DataFrame. Tuple (a,b), where a represents the number of rows and b represents the number of columns.

(7, 3)


In [46]:
print(df.size)#Returns the number of elements in the DataFrame.

21


In [48]:
print(df.values) #Returns the actual data in the DataFrame as an NDarray

[['Tom' 25 4.23]
 ['James' 26 3.24]
 ['Ricky' 25 3.98]
 ['Vin' 23 2.56]
 ['Steve' 30 3.2]
 ['Smith' 29 4.6]
 ['Jack' 23 3.8]]


In [49]:
print(df.head()) #To view a small sample of a DataFrame object, use the head() and tail() methods. head() returns the first n rows (observe the index values). The default number of elements to display is 5, but you may pass a custom number.

    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20


In [51]:
print(df.tail()) #tail() returns the last n rows (observe the index values). The default number of elements to display is 5, but you may pass a custom number.

    Name  Age  Rating
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20
5  Smith   29    4.60
6   Jack   23    3.80


# Descriptive Statistics

In [53]:
data = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Smith','Jack']),
   'Age':pd.Series([25,26,25,23,30,29,23]),
   'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8])}
df = pd.DataFrame(data)
print(df)

    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20
5  Smith   29    4.60
6   Jack   23    3.80


In [56]:
print(df.sum()) #Returns the sum of the values for the requested axis. By default, axis is index (axis=0).
#Each individual column is added individually (Strings are appended)
print('\n')
print(df.sum(axis=1))

Name      TomJamesRickyVinSteveSmithJack
Age                                  181
Rating                             25.61
dtype: object


0    29.23
1    29.24
2    28.98
3    25.56
4    33.20
5    33.60
6    26.80
dtype: float64


In [58]:
print(df.mean()) #Returns the average value

Age       25.857143
Rating     3.658571
dtype: float64


In [59]:
print(df.std())

Age       2.734262
Rating    0.698628
dtype: float64


In [60]:
print(df.count())

Name      7
Age       7
Rating    7
dtype: int64


In [61]:
print(df.median())

Age       25.0
Rating     3.8
dtype: float64


In [63]:
print(df.mode())

    Name   Age  Rating
0   Jack  23.0    2.56
1  James  25.0    3.20
2  Ricky   NaN    3.24
3  Smith   NaN    3.80
4  Steve   NaN    3.98
5    Tom   NaN    4.23
6    Vin   NaN    4.60


In [64]:
print(df.min())

Name      Jack
Age         23
Rating    2.56
dtype: object


In [65]:
print(df.max())

Name      Vin
Age        30
Rating    4.6
dtype: object


In [66]:
print(df.abs())

TypeError: bad operand type for abs(): 'str'

In [67]:
print(df.prod())

Age       7.478738e+09
Rating    7.810877e+03
dtype: float64


In [68]:
print(df.cumsum())#cumulative sum

                             Name  Age Rating
0                             Tom   25   4.23
1                        TomJames   51   7.47
2                   TomJamesRicky   76  11.45
3                TomJamesRickyVin   99  14.01
4           TomJamesRickyVinSteve  129  17.21
5      TomJamesRickyVinSteveSmith  158  21.81
6  TomJamesRickyVinSteveSmithJack  181  25.61


In [69]:
print(df.cumprod())

TypeError: can't multiply sequence by non-int of type 'str'

#### Note − Since DataFrame is a Heterogeneous data structure. Generic operations don’t work with all functions.

- Functions like __sum()__, __cumsum()__ work with both numeric and character (or) string data elements without any error. Though n practice, character aggregations are never used generally, these functions do not throw any exception.
- Functions like __abs()__, __cumprod()__ throw exception when the DataFrame contains character or string data because such operations cannot be performed.

### Summarizing Data

In [74]:
print(df.describe()) #Returns Dataframe

             Age    Rating
count   7.000000  7.000000
mean   25.857143  3.658571
std     2.734262  0.698628
min    23.000000  2.560000
25%    24.000000  3.220000
50%    25.000000  3.800000
75%    27.500000  4.105000
max    30.000000  4.600000


This function gives the mean, std and IQR values. And, function excludes the character columns and given summary about numeric columns. __'include'__ is the argument which is used to pass necessary information regarding what columns need to be considered for summarizing. Takes the list of values; by default, 'number'.

- __object__ − Summarizes String columns
- __number__ − Summarizes Numeric columns
- __all__ − Summarizes all columns together (Should not pass it as a list value)

In [75]:
print(df.describe(include='object'))

        Name
count      7
unique     7
top     Jack
freq       1


In [76]:
print(df.describe(include='number'))

             Age    Rating
count   7.000000  7.000000
mean   25.857143  3.658571
std     2.734262  0.698628
min    23.000000  2.560000
25%    24.000000  3.220000
50%    25.000000  3.800000
75%    27.500000  4.105000
max    30.000000  4.600000


In [77]:
print(df.describe(include='all'))

        Name        Age    Rating
count      7   7.000000  7.000000
unique     7        NaN       NaN
top     Jack        NaN       NaN
freq       1        NaN       NaN
mean     NaN  25.857143  3.658571
std      NaN   2.734262  0.698628
min      NaN  23.000000  2.560000
25%      NaN  24.000000  3.220000
50%      NaN  25.000000  3.800000
75%      NaN  27.500000  4.105000
max      NaN  30.000000  4.600000


# Function Application 

To apply your own or another library’s functions to Pandas objects, you should be aware of the three important methods. The methods have been discussed below. The appropriate method to use depends on whether your function expects to operate on an entire DataFrame, row- or column-wise, or element wise.

- Table wise Function Application: pipe()
- Row or Column Wise Function Application: apply()
- Element wise Function Application: applymap()

In [9]:
import numpy as np

def adder(ele1,ele2):
   return ele1+ele2

df = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
print(df)
df = df.pipe(adder,2) #Dataframe Wise
print(df)
print('\n')
print(df.apply(np.mean))
print('\n')
print(df.apply(np.mean,axis = 1)) #Row wise
print('\n')
print(df.apply(np.mean,axis = 0)) #Column wise
print('\n')
print(df.applymap(lambda x:x*100)) #Element wise

       col1      col2      col3
0 -0.298356  1.819500  0.232934
1 -2.141863 -0.315050  0.485152
2 -0.909651  1.366816  0.266045
3  0.051716 -1.373822  1.017764
4  1.106950  0.542309 -1.611644
       col1      col2      col3
0  1.701644  3.819500  2.232934
1 -0.141863  1.684950  2.485152
2  1.090349  3.366816  2.266045
3  2.051716  0.626178  3.017764
4  3.106950  2.542309  0.388356


col1    1.561759
col2    2.407951
col3    2.078050
dtype: float64


0    2.584693
1    1.342746
2    2.241070
3    1.898553
4    2.012538
dtype: float64


col1    1.561759
col2    2.407951
col3    2.078050
dtype: float64


         col1        col2        col3
0  170.164382  381.950004  223.293428
1  -14.186264  168.495010  248.515184
2  109.034920  336.681607  226.604501
3  205.171647   62.617836  301.776407
4  310.694969  254.230854   38.835630


# Reindexing
Reindexing changes the row labels and column labels of a DataFrame. To reindex means to conform the data to match a given set of labels along a particular axis.

Multiple operations can be accomplished through indexing like −

- Reorder the existing data to match a new set of labels.

- Insert missing value (NA) markers in label locations where no data for the label existed.

In [10]:
import numpy as np

N=20

df = pd.DataFrame({
   'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
   'x': np.linspace(0,stop=N-1,num=N),
   'y': np.random.rand(N),
   'C': np.random.choice(['Low','Medium','High'],N).tolist(),
   'D': np.random.normal(100, 10, size=(N)).tolist()
})
print(df)

            A     x         y       C           D
0  2016-01-01   0.0  0.757669  Medium   84.281649
1  2016-01-02   1.0  0.821316  Medium  106.457603
2  2016-01-03   2.0  0.664407  Medium   93.259536
3  2016-01-04   3.0  0.320490  Medium  104.235476
4  2016-01-05   4.0  0.736990  Medium  104.092506
5  2016-01-06   5.0  0.466542    High  107.355016
6  2016-01-07   6.0  0.591070  Medium   98.390755
7  2016-01-08   7.0  0.441446    High  105.282142
8  2016-01-09   8.0  0.804507  Medium  100.591906
9  2016-01-10   9.0  0.065743  Medium  118.679485
10 2016-01-11  10.0  0.154795    High   97.677474
11 2016-01-12  11.0  0.923391  Medium  100.083590
12 2016-01-13  12.0  0.322632    High  100.424788
13 2016-01-14  13.0  0.174743    High  115.027743
14 2016-01-15  14.0  0.895583     Low   99.802819
15 2016-01-16  15.0  0.110773  Medium   99.405887
16 2016-01-17  16.0  0.244039    High  116.981017
17 2016-01-18  17.0  0.840198     Low   89.472450
18 2016-01-19  18.0  0.831173    High   87.504912


In [11]:
df_reindexed = df.reindex(index=[0,2,5], columns=['A', 'C', 'B'])
print(df_reindexed)

           A       C   B
0 2016-01-01  Medium NaN
2 2016-01-03  Medium NaN
5 2016-01-06    High NaN


#### Reindex to Align with Other Objects
- You may wish to take an object and reindex its axes to be labeled the same as another object.
- __Note__ − Here, the df1 DataFrame is altered and reindexed like df2. The column names should be matched or else NAN will be added for the entire column label.

In [12]:
import numpy as np

df1 = pd.DataFrame(np.random.randn(10,3),columns=['col1','col2','col3'])
print(df1)
print('\n')
df2 = pd.DataFrame(np.random.randn(7,3),columns=['col1','col2','col3'])
print(df2)
print('\n')
df1 = df1.reindex_like(df2)
print(df1)

       col1      col2      col3
0 -1.745057  1.065676  0.494931
1  0.115237 -0.583812 -0.791352
2 -1.100612  0.082696  1.089111
3 -0.337314  0.600008 -0.621481
4 -1.353144 -1.478831  0.033871
5  0.795768  0.136982 -0.927107
6  0.584257 -0.995148 -0.576838
7  0.498210  0.953523 -0.268647
8 -1.022263  1.128703  1.388902
9 -0.539812 -0.189643  0.284068


       col1      col2      col3
0  0.012889 -0.036709  1.281354
1  0.570393  0.310735 -0.177701
2 -0.772351 -0.131164  0.155368
3  1.180512  2.942939  0.116303
4  0.280488  0.274593 -1.146620
5  0.354457 -0.815524 -1.897207
6 -0.021909  0.889226  0.663559


       col1      col2      col3
0 -1.745057  1.065676  0.494931
1  0.115237 -0.583812 -0.791352
2 -1.100612  0.082696  1.089111
3 -0.337314  0.600008 -0.621481
4 -1.353144 -1.478831  0.033871
5  0.795768  0.136982 -0.927107
6  0.584257 -0.995148 -0.576838


#### Filling while ReIndexing
reindex() takes an optional parameter method which is a filling method with values as follows −

- __pad/ffill__ − Fill values forward
- __bfill/backfill__ − Fill values backward
- __nearest__ − Fill from the nearest index values

In [13]:
import numpy as np

df1 = pd.DataFrame(np.random.randn(6,3),columns=['col1','col2','col3'])
print(df1)
print('\n')
df2 = pd.DataFrame(np.random.randn(2,3),columns=['col1','col2','col3'])
print(df2)
print('\n')
print(df2.reindex_like(df1))
print('\n')
print(df2.reindex_like(df1, method= 'ffill'))

       col1      col2      col3
0  1.123808  0.682570 -1.160439
1  0.712572 -0.743828 -1.157215
2 -0.879017  0.431221 -0.010857
3  0.969457 -0.930087  1.177066
4  0.507489  1.089422  0.938518
5  1.007367 -1.614119 -0.637264


       col1      col2      col3
0 -0.520303 -0.668133 -0.336906
1  2.564716  0.868215 -0.503586


       col1      col2      col3
0 -0.520303 -0.668133 -0.336906
1  2.564716  0.868215 -0.503586
2       NaN       NaN       NaN
3       NaN       NaN       NaN
4       NaN       NaN       NaN
5       NaN       NaN       NaN


       col1      col2      col3
0 -0.520303 -0.668133 -0.336906
1  2.564716  0.868215 -0.503586
2  2.564716  0.868215 -0.503586
3  2.564716  0.868215 -0.503586
4  2.564716  0.868215 -0.503586
5  2.564716  0.868215 -0.503586


In [15]:
# Limits on Filling while Reindexing
import numpy as np

df1 = pd.DataFrame(np.random.randn(6,3),columns=['col1','col2','col3'])
print(df1)
print('\n')
df2 = pd.DataFrame(np.random.randn(2,3),columns=['col1','col2','col3'])
print(df2)
print('\n')
print(df2.reindex_like(df1))
print('\n')
print(df2.reindex_like(df1, method= 'ffill',limit=2))

       col1      col2      col3
0  1.140259  1.047299 -0.200318
1  1.276610  1.840199  0.618895
2 -0.052835 -0.227479  0.381090
3  2.353892  0.299842 -1.190629
4 -0.214463  0.473533 -0.963519
5 -0.215341 -0.709957 -1.079844


       col1      col2      col3
0 -1.623279  1.037954  0.981717
1 -1.639740  0.794005  0.411001


       col1      col2      col3
0 -1.623279  1.037954  0.981717
1 -1.639740  0.794005  0.411001
2       NaN       NaN       NaN
3       NaN       NaN       NaN
4       NaN       NaN       NaN
5       NaN       NaN       NaN


       col1      col2      col3
0 -1.623279  1.037954  0.981717
1 -1.639740  0.794005  0.411001
2 -1.639740  0.794005  0.411001
3 -1.639740  0.794005  0.411001
4       NaN       NaN       NaN
5       NaN       NaN       NaN


### Renaming
##### Important
- The __rename()__ method allows you to relabel an axis based on some mapping (a dict or Series) or an arbitrary function.
- The rename() method provides an __inplace__ named parameter, which by __default is False__ and copies the underlying data. Pass __inplace=True to rename the data in place__.

In [20]:
import numpy as np

df1 = pd.DataFrame(np.random.randn(6,3),columns=['col1','col2','col3'])
print (df1)
print('\n')
print ("After renaming the rows and columns:")
print (df1.rename(columns={'col1' : 'c1', 'col2' : 'c2'},
index = {0 : 'apple', 1 : 'banana', 2 : 'durian'}))
print('\n')
print(df1)
print('\n')
df1.rename(columns={'col1' : 'c1', 'col2' : 'c2'},
index = {0 : 'apple', 1 : 'banana', 2 : 'durian'}, inplace= True)
print(df1)

       col1      col2      col3
0  1.768486 -0.856761 -0.244558
1 -0.503099 -0.314776  0.566201
2  1.489893 -0.677620 -0.599316
3  0.377884 -0.454941 -0.399157
4 -0.460150 -0.020811 -0.013219
5 -1.435699 -1.087923 -0.143332


After renaming the rows and columns:
              c1        c2      col3
apple   1.768486 -0.856761 -0.244558
banana -0.503099 -0.314776  0.566201
durian  1.489893 -0.677620 -0.599316
3       0.377884 -0.454941 -0.399157
4      -0.460150 -0.020811 -0.013219
5      -1.435699 -1.087923 -0.143332


       col1      col2      col3
0  1.768486 -0.856761 -0.244558
1 -0.503099 -0.314776  0.566201
2  1.489893 -0.677620 -0.599316
3  0.377884 -0.454941 -0.399157
4 -0.460150 -0.020811 -0.013219
5 -1.435699 -1.087923 -0.143332


              c1        c2      col3
apple   1.768486 -0.856761 -0.244558
banana -0.503099 -0.314776  0.566201
durian  1.489893 -0.677620 -0.599316
3       0.377884 -0.454941 -0.399157
4      -0.460150 -0.020811 -0.013219
5      -1.435699 -1.087923 -

# Iteration
The behavior of basic iteration over Pandas objects depends on the type. When iterating over a __Series, it is regarded as array-like__, and basic iteration produces the values. Other data structures, like __DataFrame and Panel, follow the dict-like__ convention of iterating over the keys of the objects.

In short, basic iteration (for i in object) produces −

- __Series − values__

- __DataFrame − column labels__

- __Panel − item labels__

### Iterating a DataFrame
Iterating a DataFrame gives __column names__

In [3]:
import numpy as np
 
N=20
df = pd.DataFrame({
   'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
   'x': np.linspace(0,stop=N-1,num=N),
   'y': np.random.rand(N),
   'C': np.random.choice(['Low','Medium','High'],N).tolist(),
   'D': np.random.normal(100, 10, size=(N)).tolist()
   })
print(df)
print('\n')

for col in df:
    print(col)

            A     x         y       C           D
0  2016-01-01   0.0  0.512597    High  114.118758
1  2016-01-02   1.0  0.064195  Medium  110.599907
2  2016-01-03   2.0  0.377606     Low  105.759164
3  2016-01-04   3.0  0.651644    High   85.668568
4  2016-01-05   4.0  0.729831     Low  108.315751
5  2016-01-06   5.0  0.048005  Medium   88.470734
6  2016-01-07   6.0  0.501928     Low   95.444062
7  2016-01-08   7.0  0.169318  Medium   84.872690
8  2016-01-09   8.0  0.717826     Low  104.258671
9  2016-01-10   9.0  0.643761    High   98.738904
10 2016-01-11  10.0  0.520811    High  104.831157
11 2016-01-12  11.0  0.249241     Low  114.957157
12 2016-01-13  12.0  0.334354     Low   96.953203
13 2016-01-14  13.0  0.961292  Medium  113.631693
14 2016-01-15  14.0  0.549690     Low   94.649102
15 2016-01-16  15.0  0.931154  Medium   94.887582
16 2016-01-17  16.0  0.938741    High   82.686003
17 2016-01-18  17.0  0.667328  Medium   99.876688
18 2016-01-19  18.0  0.098514     Low   94.118795


To iterate over the rows of the DataFrame, we can use the following functions −

- __iteritems() − to iterate over the (key,value) pairs__

- __iterrows() − iterate over the rows as (index,series) pairs__

- __itertuples() − iterate over the rows as namedtuples__

In [12]:
import numpy as np
 
df = pd.DataFrame(np.random.randn(4,3),columns=['col1','col2','col3'])
print(df)
print('\n')

print(df.iteritems)
print('\n')

for key,value in df.iteritems():
    print(key,value)
    print(type(key),type(value))
    print('\n')

#Observe, each column is iterated separately as a key-value pair in a Series.

       col1      col2      col3
0  0.074681 -0.426226 -1.069612
1 -0.774250 -0.443205  2.671276
2  0.277391 -0.425407 -1.397438
3 -1.600444  1.407851  0.068613


<bound method DataFrame.iteritems of        col1      col2      col3
0  0.074681 -0.426226 -1.069612
1 -0.774250 -0.443205  2.671276
2  0.277391 -0.425407 -1.397438
3 -1.600444  1.407851  0.068613>


col1 0    0.074681
1   -0.774250
2    0.277391
3   -1.600444
Name: col1, dtype: float64
<class 'str'> <class 'pandas.core.series.Series'>


col2 0   -0.426226
1   -0.443205
2   -0.425407
3    1.407851
Name: col2, dtype: float64
<class 'str'> <class 'pandas.core.series.Series'>


col3 0   -1.069612
1    2.671276
2   -1.397438
3    0.068613
Name: col3, dtype: float64
<class 'str'> <class 'pandas.core.series.Series'>




In [14]:
import numpy as np

df = pd.DataFrame(np.random.randn(4,3),columns = ['col1','col2','col3'])
print(df)
print('\n')
for row_index,row in df.iterrows():
    print(row_index, row)
    print(type(row_index), type(row))
    print('\n')

# Iterate over the rows

       col1      col2      col3
0  0.603627  0.570331 -0.758244
1  0.505285 -0.423692  0.059467
2  2.441589  0.319526 -1.120101
3 -0.646951 -0.829488 -0.897110


0 col1    0.603627
col2    0.570331
col3   -0.758244
Name: 0, dtype: float64
<class 'int'> <class 'pandas.core.series.Series'>


1 col1    0.505285
col2   -0.423692
col3    0.059467
Name: 1, dtype: float64
<class 'int'> <class 'pandas.core.series.Series'>


2 col1    2.441589
col2    0.319526
col3   -1.120101
Name: 2, dtype: float64
<class 'int'> <class 'pandas.core.series.Series'>


3 col1   -0.646951
col2   -0.829488
col3   -0.897110
Name: 3, dtype: float64
<class 'int'> <class 'pandas.core.series.Series'>




__Note__ − Because iterrows() iterate over the rows, it doesn't preserve the data type across the row. 0,1,2 are the row indices and col1,col2,col3 are column indices.

__itertuples()__ method will return an iterator yielding a __named tuple for each row__ in the DataFrame. The first element of the tuple will be the row’s corresponding index value, while the remaining values are the row values.

In [17]:
import numpy as np

df = pd.DataFrame(np.random.randn(4,3),columns = ['col1','col2','col3'])
print(df)
print('\n')
for row in df.itertuples():
    print(row)
    print('\n')

       col1      col2      col3
0 -2.636707  0.217648 -0.929925
1 -0.081309 -1.703289  0.291978
2 -1.238731 -0.799297  0.740163
3  0.396726  0.126497  1.000572


Pandas(Index=0, col1=-2.6367074165907383, col2=0.21764815319362685, col3=-0.9299245635769471)


Pandas(Index=1, col1=-0.08130912055972776, col2=-1.7032894549052962, col3=0.2919776628617955)


Pandas(Index=2, col1=-1.2387305324045026, col2=-0.7992972864060112, col3=0.7401625269872649)


Pandas(Index=3, col1=0.39672622868518087, col2=0.12649675399385701, col3=1.0005719759724914)




### Note − Do not try to modify any object while iterating. __Iterating is meant for reading and the iterator returns a copy of the original object (a view)__, thus the changes will not reflect on the original object.

In [19]:
import numpy as np

df = pd.DataFrame(np.random.randn(4,3),columns = ['col1','col2','col3'])
print(df)
print('\n')

for index, row in df.iterrows():
   row['a'] = 10
print(df)
# See No change reflected

       col1      col2      col3
0  0.035317 -1.077756  1.166138
1  0.714156  1.527332  0.545106
2  0.489707 -0.120217 -0.704183
3 -1.415261 -0.235189 -0.918121


       col1      col2      col3
0  0.035317 -1.077756  1.166138
1  0.714156  1.527332  0.545106
2  0.489707 -0.120217 -0.704183
3 -1.415261 -0.235189 -0.918121


# Sorting
There are two kinds of sorting available in Pandas. They are −

- By label
- By Actual Value

In [21]:
import numpy as np

unsorted_df=pd.DataFrame(np.random.randn(10,2),index=[1,4,6,2,3,5,9,8,0,7],columns=['col2','col1'])
print(unsorted_df)

       col2      col1
1 -0.492533  0.255125
4 -0.281147  0.457609
6 -1.083358  0.236330
2 -0.956279  0.628415
3 -0.185901 -1.822447
5  0.526994  0.276978
9  1.469652  1.434616
8 -0.237081 -0.177304
0  0.138270 -1.856043
7  0.580792 -3.034269


### By Label
Using the __sort_index()__ method, by passing the axis arguments and the order of sorting, DataFrame can be sorted. By default, sorting is done on row labels in ascending order.

In [27]:
sorted_df = unsorted_df.sort_index()
print(sorted_df)
print('\n')
sorted_df1 = unsorted_df.sort_index(axis = 1,ascending=False)
print(sorted_df1)

       col2      col1
0  0.138270 -1.856043
1 -0.492533  0.255125
2 -0.956279  0.628415
3 -0.185901 -1.822447
4 -0.281147  0.457609
5  0.526994  0.276978
6 -1.083358  0.236330
7  0.580792 -3.034269
8 -0.237081 -0.177304
9  1.469652  1.434616


       col2      col1
1 -0.492533  0.255125
4 -0.281147  0.457609
6 -1.083358  0.236330
2 -0.956279  0.628415
3 -0.185901 -1.822447
5  0.526994  0.276978
9  1.469652  1.434616
8 -0.237081 -0.177304
0  0.138270 -1.856043
7  0.580792 -3.034269


### By Value
Like index sorting, __sort_values()__ is the method for sorting by values. It accepts a __'by'__ argument which will use the column name of the DataFrame with which the values are to be sorted.

In [33]:
sorted_df2 = unsorted_df.sort_values(by = 'col2', ascending=False)
print(sorted_df2)

       col2      col1
9  1.469652  1.434616
7  0.580792 -3.034269
5  0.526994  0.276978
0  0.138270 -1.856043
3 -0.185901 -1.822447
8 -0.237081 -0.177304
4 -0.281147  0.457609
1 -0.492533  0.255125
2 -0.956279  0.628415
6 -1.083358  0.236330


### Sorting Algorithm
__sort_values()__ provides a provision to choose the algorithm from _mergesort, heapsort and quicksort_. __Mergesort is the only stable algorithm__.

In [34]:
sorted_df3 = unsorted_df.sort_values(by = 'col2', ascending=False, kind = 'mergesort')
print(sorted_df3)

       col2      col1
9  1.469652  1.434616
7  0.580792 -3.034269
5  0.526994  0.276978
0  0.138270 -1.856043
3 -0.185901 -1.822447
8 -0.237081 -0.177304
4 -0.281147  0.457609
1 -0.492533  0.255125
2 -0.956279  0.628415
6 -1.083358  0.236330


# Working with Text Data
- Pandas provides a set of string functions which make it easy to operate on string data. Most importantly, these functions __ignore (or exclude) missing/NaN values__.
- Applied on pandas __Series__.
- Almost, all of these methods work with Python string functions . So, __convert the Series Object to String Object__ and then perform the operation.

In [35]:
import numpy as np

df = pd.Series(['Tom', 'William Rick', 'John', 'Alber@t', np.nan, '1234','SteveSmith'])
print(df)

0             Tom
1    William Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object


In [36]:
print(df.str.lower())

0             tom
1    william rick
2            john
3         alber@t
4             NaN
5            1234
6      stevesmith
dtype: object


In [37]:
print(df.str.upper())

0             TOM
1    WILLIAM RICK
2            JOHN
3         ALBER@T
4             NaN
5            1234
6      STEVESMITH
dtype: object


In [38]:
print(df.str.len())

0     3.0
1    12.0
2     4.0
3     7.0
4     NaN
5     4.0
6    10.0
dtype: float64


In [40]:
print(df.str.strip())
#  strip whitespace(including newline) from each string in the Series/index from both the sides.

0             Tom
1    William Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object


In [46]:
print(df.str.split(''))
#Splits each string with the given pattern.

0                               [, T, o, m, ]
1    [, W, i, l, l, i, a, m,  , R, i, c, k, ]
2                            [, J, o, h, n, ]
3                   [, A, l, b, e, r, @, t, ]
4                                         NaN
5                            [, 1, 2, 3, 4, ]
6          [, S, t, e, v, e, S, m, i, t, h, ]
dtype: object


In [49]:
print(df.str.cat(sep='*******'))
#Concatenates the series/index elements with given separator.

Tom*******William Rick*******John*******Alber@t*******1234*******SteveSmith


In [51]:
print(df.str.get_dummies())
#Returns the DataFrame with One-Hot Encoded values.

   1234  Alber@t  John  SteveSmith  Tom  William Rick
0     0        0     0           0    1             0
1     0        0     0           0    0             1
2     0        0     1           0    0             0
3     0        1     0           0    0             0
4     0        0     0           0    0             0
5     1        0     0           0    0             0
6     0        0     0           1    0             0


In [52]:
print(df.str.contains('@'))

0    False
1    False
2    False
3     True
4      NaN
5    False
6    False
dtype: object


In [53]:
print(df.str.replace('@','#####'))

0             Tom
1    William Rick
2            John
3     Alber#####t
4             NaN
5            1234
6      SteveSmith
dtype: object


In [54]:
print(df.str.repeat(3))

0                               TomTomTom
1    William RickWilliam RickWilliam Rick
2                            JohnJohnJohn
3                   Alber@tAlber@tAlber@t
4                                     NaN
5                            123412341234
6          SteveSmithSteveSmithSteveSmith
dtype: object


In [56]:
print(df.str.count('Jo'))
#Returns count of appearance of pattern in each element

0    0.0
1    0.0
2    1.0
3    0.0
4    NaN
5    0.0
6    0.0
dtype: float64


In [57]:
print(df.str.startswith('T'))

0     True
1    False
2    False
3    False
4      NaN
5    False
6    False
dtype: object


In [59]:
print(df.str.endswith('t'))

0    False
1    False
2    False
3     True
4      NaN
5    False
6    False
dtype: object


In [60]:
print(df.str.find('o'))
#Returns the first position of the first occurrence of the pattern.

0    1.0
1   -1.0
2    1.0
3   -1.0
4    NaN
5   -1.0
6   -1.0
dtype: float64


In [61]:
print(df.str.findall('t'))
#Returns a list of all occurrence of the pattern.

0        []
1        []
2        []
3       [t]
4       NaN
5        []
6    [t, t]
dtype: object


In [62]:
print(df.str.swapcase())

0             tOM
1    wILLIAM rICK
2            jOHN
3         aLBER@T
4             NaN
5            1234
6      sTEVEsMITH
dtype: object


In [63]:
print(df.str.islower())

0    False
1    False
2    False
3    False
4      NaN
5    False
6    False
dtype: object


In [64]:
print(df.str.isupper())

0    False
1    False
2    False
3    False
4      NaN
5    False
6    False
dtype: object


In [65]:
print(df.str.isnumeric())

0    False
1    False
2    False
3    False
4      NaN
5     True
6    False
dtype: object


# Options and Customization

- To See and change the settings of Pandas options
[Read Here](https://www.tutorialspoint.com/python_pandas/python_pandas_options_and_customization.htm)

# Indexing and Selecting Data