In [1]:
import pandas as pd

# Pandas - Data Structures
- __Series__ - 1 Dimension homogeneous array, size-immutable.
- __Data Frame__ - 2 Dimension size-mutable tabular structure with potentially heterogeneously typed columns.
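- __Panel__ - 3 Dimension size-mutable array. (Note: Panel was deprecated and has been removed from recent pandas releases; it is described here for historical context.)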
- DataFrame is a container of Series, Panel is a container of DataFrame.
- All Pandas data structures are value mutable (can be changed) and except Series all are size mutable. Series is size immutable.

# Pandas - Series

### pandas.Series( data, index, dtype, copy)
- __data__ - takes various forms like ndarray, list, dictionary,constants
- __index__ - Index values must be hashable and the same length as data (duplicate labels are allowed). Default np.arange(n) if no index is passed.
- __dtype__ - is for data type. If None, data type will be inferred
- __copy__ - Copy data. Default False

In [3]:
# Creating Empty Series
# The dtype is given explicitly: a bare pd.Series() emits a warning on
# pandas 1.x and defaults to dtype=object on pandas 2.x. float64 matches the
# output shown below.
s = pd.Series(dtype='float64')
print(s)

Series([], dtype: float64)


### Create a Series from ndarray
- If data is an ndarray, then index passed must be of the same length. If no index is passed, then by default index will be __range(n)__ where __n__ is array length, i.e., __[0, 1, 2, ..., len(array)-1]__.

In [4]:
import numpy as np

# Nine consecutive integers; with no index argument the Series is labelled 0..8.
array = np.arange(9)
s = pd.Series(data=array)
print(s)

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
dtype: int32


In [7]:
# Lets Specify Index
import numpy as np

array = np.arange(9)
# The index keyword replaces the default 0..8 labels with 10..18.
s = pd.Series(data=array, index=np.arange(10, 19))
print(s)

# We passed the index values here. Now we can see the customized indexed values in the output.

10    0
11    1
12    2
13    3
14    4
15    5
16    6
17    7
18    8
dtype: int32


### Create a Series from dict
- A __dict__ can be passed as input and if no index is specified, then the dictionary keys are used to construct the index (in the dict's insertion order on current pandas/Python versions; very old versions sorted the keys). If __index__ is passed, the values in data corresponding to the labels in the index will be pulled out.

In [9]:
# Observe: the dictionary keys become the index labels and the values the data.
data = dict(a=0., b=1., c=2.)
s = pd.Series(data=data)
print(s)

a    0.0
b    1.0
c    2.0
dtype: float64


In [11]:
data = dict(a=0., b=1., c=2.)

# Observe: the index argument fixes the label order, and 'd' (which has no
# entry in data) is filled with NaN (Not a Number).
s = pd.Series(data=data, index=list('bcda'))
print(s)

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64


### Create a Series from Scalar
- If data is a scalar value, an index must be provided. The value will be repeated to match the length of __index__

In [12]:
# The scalar 5 is repeated once for every label in the index.
s = pd.Series(data=5, index=list(range(4)))
print(s)

0    5
1    5
2    5
3    5
dtype: int64


### Accessing Data from Series with Position, Slicing, Index(Label)

In [17]:
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])
print(s)
print('\n')

# Positional access goes through .iloc. Plain s[3] on a label-indexed Series
# is deprecated in recent pandas and is slated to be treated as a label lookup.
print(s.iloc[3]) # Element at position 3 (the 4th element)
print('\n')

print(s['b']) # Element with label 'b'
print('\n')

print(s.iloc[3:]) # From position 3 to the end
print('\n')

print(s.iloc[:3]) # The first 3 elements
print('\n')

print(s.iloc[-3:]) # The last 3 elements

a    1
b    2
c    3
d    4
e    5
dtype: int64


4


2


d    4
e    5
dtype: int64


a    1
b    2
c    3
dtype: int64


c    3
d    4
e    5
dtype: int64


# Pandas - DataFrame

### pandas.DataFrame( data, index, columns, dtype, copy)
- __data__ - takes various forms like __ndarray__, __series__, __map__, __lists__, __dict__, __constants__ and also another __DataFrame__
- __index__ - Row labels for the resulting frame. Optional; defaults to _np.arange(n)_ if no index is passed.
- __columns__ - Column labels for the resulting frame. Optional; defaults to _np.arange(n)_ if no column labels are passed.
- __dtype__ - Data type of each column.
- __copy__ - Whether to copy the input data. Default False.

In [18]:
# Creating Empty DataFrame
# With no arguments the frame has no columns and no index, as the output shows.

df = pd.DataFrame()

print(df)

Empty DataFrame
Columns: []
Index: []


### Creating a DataFrame from __Lists__
- The DataFrame can be created using a single list or a list of lists.

In [19]:
# A flat list becomes a single column labelled 0, one row per element.
data = list(range(10, 15))
df = pd.DataFrame(data=data)
print(df)

    0
0  10
1  11
2  12
3  13
4  14


In [24]:
data = [['Alex',10],['Bob',12],['Clarke',13]]

# Without column names the columns are labelled 0 and 1.
df = pd.DataFrame(data)
print(df)
print('\n')

# Name the columns, then cast only the numeric column to float.
# Passing dtype=float to the constructor applies the cast to every column and
# raises on the string column 'Name' in pandas 2.x (older versions silently
# skipped columns that could not be converted).
df_new = pd.DataFrame(data, columns=['Name','Age']).astype({'Age': float})
print(df_new)

        0   1
0    Alex  10
1     Bob  12
2  Clarke  13


     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0


### Create a DataFrame from Dict of ndarrays / Lists
- All the ndarrays must be of same length. If index is passed, then the length of the index should equal to the length of the arrays.<br>If no index is passed, then by default, index will be range(n), where n is the array length.

In [27]:
Name_List = ['Tom', 'Jack', 'Steve', 'Ricky']
Age_List = [28, 34, 29, 42]

# The dictionary keys are by default taken as column names.
data_dict = dict(Name=Name_List, Age=Age_List)
df = pd.DataFrame(data=data_dict)
print(df)

    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42


In [26]:
Name_List = ['Tom', 'Jack', 'Steve', 'Ricky']
Age_List = [28, 34, 29, 42]
data_dict = dict(Name=Name_List, Age=Age_List)

# One row label per list element: rank1 .. rank4.
df = pd.DataFrame(data=data_dict, index=[f'rank{position}' for position in range(1, 5)])
print(df)

        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29
rank4  Ricky   42


### Create a DataFrame from List of Dicts
- List of Dictionaries can be passed as input data to create a DataFrame. The __dictionary keys__ are by default taken as __column names__.

In [28]:
# One dict per row; the first dict has no 'c' key, so that cell becomes NaN.
data = [dict(a=1, b=2), dict(a=5, b=10, c=20)]
df = pd.DataFrame(data=data)
print(df)

   a   b     c
0  1   2   NaN
1  5  10  20.0


In [30]:
data = [dict(a=1, b=2), dict(a=5, b=10, c=20)]

# Both requested column names exist as dictionary keys, so df1 is fully populated.
df1 = pd.DataFrame(data=data, columns=['a', 'b'], index=['first', 'second'])

# 'b1' is not a key in any of the dicts, so that column is filled with NaN.
df2 = pd.DataFrame(data=data, columns=['a', 'b1'], index=['first', 'second'])

print(df1)
print('\n')
print(df2)

        a   b
first   1   2
second  5  10


        a  b1
first   1 NaN
second  5 NaN


### Create a DataFrame from Dict of Series
- Dictionary of Series can be passed to form a DataFrame. The resultant index is the union of all the series indexes passed.

In [33]:
# The two Series have different indexes; the frame's index is their union,
# and 'Name' has no entry for label 'c', so that cell is NaN.
data = dict(
    Name=pd.Series(['Shri', 'Harsha'], index=['a', 'b']),
    Age=pd.Series([10, 11, 12], index=['a', 'b', 'c']),
)
df = pd.DataFrame(data=data)
print(df)

     Name  Age
a    Shri   10
b  Harsha   11
c     NaN   12


### Column Selection

In [35]:
d = dict(
    one=pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    two=pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
)
df = pd.DataFrame(data=d)

print(df)
print('\n')

# Indexing with a column name returns that column as a Series.
print(df['two'])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


a    1
b    2
c    3
d    4
Name: two, dtype: int64


### Column Addition

In [38]:
d = dict(
    one=pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    two=pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
)
df = pd.DataFrame(data=d)
print(df)
print('\n')

# Add a column from a new Series; it is aligned on the index, so row 'd' gets NaN.
df['three'] = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(df)
print('\n')

# Add a column computed from two existing columns.
df['four'] = df['one'].add(df['three'])
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


   one  two  three  four
a  1.0    1   10.0  11.0
b  2.0    2   20.0  22.0
c  3.0    3   30.0  33.0
d  NaN    4    NaN   NaN


### Column Deletion

In [41]:
d = dict(
    one=pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    two=pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
    three=pd.Series([10, 20, 30], index=['a', 'b', 'c']),
)
df = pd.DataFrame(data=d)
print(df)
print('\n')

# The del statement removes a column in place.
del df['two']
print(df)
print('\n')

# DataFrame.pop also removes the column in place; the column it returns is not used here.
df.pop('three')
print(df)
print('\n')

   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


   one  three
a  1.0   10.0
b  2.0   20.0
c  3.0   30.0
d  NaN    NaN


   one
a  1.0
b  2.0
c  3.0
d  NaN




### Row Selection

In [6]:
# Selection by Label
data_dict = dict(
    one=pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    two=pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
)
df = pd.DataFrame(data=data_dict)
print(df)
print('\n')

# .loc looks a row up by its index label and returns it as a Series.
print(df.loc['c'])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


one    3.0
two    3.0
Name: c, dtype: float64


In [7]:
# Selection by integer position
data_dict = dict(
    one=pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    two=pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
)
df = pd.DataFrame(data=data_dict)
print(df)
print('\n')

# .iloc looks a row up by position; position 3 is the row labelled 'd'.
print(df.iloc[3])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


one    NaN
two    4.0
Name: d, dtype: float64


### Slice Rows
- Slicing with the plain `df[start:stop]` syntax selects rows, not columns.

In [8]:
d = dict(
    one=pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    two=pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
)
df = pd.DataFrame(data=d)
print(df)
print('\n')

# A positional slice on the frame picks rows: 2:3 is just the row at position 2.
print(df[2:3])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


   one  two
c  3.0    3


### Addition of Rows
- Add new rows to a DataFrame by appending another DataFrame's rows at the end. Older pandas versions provide the __append__ method for this; it was removed in pandas 2.0, where __pd.concat__ does the same job.

In [11]:
df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])
print(df)
print('\n')
# Append the rows of df2 to the end of df.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and likewise keeps each frame's original index labels (0, 1, 0, 1).
df = pd.concat([df, df2])
print(df)

   a  b
0  1  2
1  3  4


   a  b
0  1  2
1  3  4
0  5  6
1  7  8


### Deletion of Rows
- Use index label to delete or drop rows from a DataFrame. If label is duplicated, then multiple rows will be dropped.

In [2]:
df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])

# Stack the two frames (DataFrame.append was removed in pandas 2.0; pd.concat
# is the replacement). The index labels are kept, so label 0 now appears twice.
df = pd.concat([df, df2])
print(df)
print('\n')

# drop removes every row carrying the label 0, i.e. both of them.
df = df.drop(0)
print(df)

   a  b
0  1  2
1  3  4
0  5  6
1  7  8


   a  b
1  3  4
1  7  8


# Panel
The names for the 3 axes are intended to give some semantic meaning to describing operations involving panel data. They are:
- __items__ − axis 0, each item corresponds to a DataFrame contained inside.
- __major_axis__ − axis 1, it is the index (rows) of each of the DataFrames.
- __minor_axis__ − axis 2, it is the columns of each of the DataFrames.

### Creating the Panel
#### pandas.Panel(data, items, major_axis, minor_axis, dtype, copy)
- __data__ ----> Data takes various forms like ndarray, series, map, lists, dict, constants and also another DataFrame
- __items__ ----> axis=0
- __major_axis__ ----> axis=1
- __minor_axis__ ----> axis=2
- __dtype__ ---->	Data type of each column
- __copy__ ----> Copy data. Default, false

# Basic Functionality

### Series Basic Functionality

In [5]:
import numpy as np

# Four random draws; the values differ on every run because no seed is set.
s = pd.Series(data=np.random.randn(4))

In [7]:
# Returns a list of the row axis labels (a one-element list holding the index).
print(s.axes)

[RangeIndex(start=0, stop=4, step=1)]


In [9]:
# Returns a Boolean saying whether the object is empty; s has 4 elements, so this prints False.
print(s.empty)

False


In [11]:
# Returns the number of dimensions of the object.
# By definition, a Series is a 1D data structure, so it returns 1.
print(s.ndim)

1


In [15]:
# Returns the size (number of elements) of the Series.
print(s.size)
print('\n')
# len() reports the same count.
print(len(s))

4


4


In [18]:
# Returns the actual data in the Series as an array (without the index labels).
print(s.values)

[-0.31488133  0.10406499  1.28126161  0.16232898]


In [20]:
# head() returns the first n rows (observe the index values).
# The default number of elements to display is five, but you may pass a custom number.
print(s.head(2))

0   -0.314881
1    0.104065
dtype: float64


In [22]:
# tail() returns the last n rows (observe the index values).
# The default number of elements to display is five, but you may pass a custom number.
print(s.tail(2))

2    1.281262
3    0.162329
dtype: float64


### DataFrame Basic Functionality

In [28]:
# Sample frame used by the DataFrame attribute examples: one string column,
# one integer column and one float column, seven rows.
data = dict(
    Name=pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    Age=pd.Series([25, 26, 25, 23, 30, 29, 23]),
    Rating=pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
)
df = pd.DataFrame(data=data)
print(df)

    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20
5  Smith   29    4.60
6   Jack   23    3.80


In [29]:
# Returns the transpose of the DataFrame. The rows and columns will interchange.
print(df.T)

           0      1      2     3      4      5     6
Name     Tom  James  Ricky   Vin  Steve  Smith  Jack
Age       25     26     25    23     30     29    23
Rating  4.23   3.24   3.98  2.56    3.2    4.6   3.8


In [31]:
# Returns a list with the row axis labels first and the column axis labels second.
print(df.axes)

[RangeIndex(start=0, stop=7, step=1), Index(['Name', 'Age', 'Rating'], dtype='object')]


In [38]:
# Returns the data type of each column.
print(df.dtypes)

Name       object
Age         int64
Rating    float64
dtype: object


In [40]:
# Returns a Boolean saying whether the object is empty;
# True indicates that the object is empty.
print(df.empty)

False


In [42]:
# Returns the number of dimensions of the object. By definition, DataFrame is a 2D object.
print(df.ndim)

2


In [44]:
# Returns a tuple (a, b) representing the dimensionality of the DataFrame,
# where a is the number of rows and b is the number of columns.
print(df.shape)

(7, 3)


In [46]:
# Returns the number of elements in the DataFrame (rows x columns; 7 x 3 = 21 here).
print(df.size)

21


In [48]:
# Returns the actual data in the DataFrame as an ndarray.
print(df.values)

[['Tom' 25 4.23]
 ['James' 26 3.24]
 ['Ricky' 25 3.98]
 ['Vin' 23 2.56]
 ['Steve' 30 3.2]
 ['Smith' 29 4.6]
 ['Jack' 23 3.8]]


In [49]:
# To view a small sample of a DataFrame object, use the head() and tail() methods.
# head() returns the first n rows (observe the index values).
# The default number of elements to display is 5, but you may pass a custom number.
print(df.head())

    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20


In [51]:
# tail() returns the last n rows (observe the index values).
# The default number of elements to display is 5, but you may pass a custom number.
print(df.tail())

    Name  Age  Rating
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20
5  Smith   29    4.60
6   Jack   23    3.80


# Descriptive Statistics

In [53]:
# Sample frame for the statistics examples: 'Name' is a string column,
# 'Age' and 'Rating' are numeric.
data = dict(
    Name=pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    Age=pd.Series([25, 26, 25, 23, 30, 29, 23]),
    Rating=pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
)
df = pd.DataFrame(data=data)
print(df)

    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20
5  Smith   29    4.60
6   Jack   23    3.80


In [56]:
# Returns the sum of the values for the requested axis. By default, axis is index (axis=0).
# Each column is summed individually (strings are concatenated).
print(df.sum())
print('\n')
# Row-wise sum (axis=1). numeric_only=True leaves the string column 'Name' out;
# without it pandas 2.x raises TypeError when it tries to add a str to a number
# (older versions silently skipped the column).
print(df.sum(axis=1, numeric_only=True))

Name      TomJamesRickyVinSteveSmithJack
Age                                  181
Rating                             25.61
dtype: object


0    29.23
1    29.24
2    28.98
3    25.56
4    33.20
5    33.60
6    26.80
dtype: float64


In [58]:
# Returns the average value of each numeric column.
# numeric_only=True skips the string column 'Name'; pandas 2.x no longer drops
# non-numeric columns automatically and raises TypeError otherwise.
print(df.mean(numeric_only=True))

Age       25.857143
Rating     3.658571
dtype: float64


In [59]:
# Returns the standard deviation of each numeric column.
# numeric_only=True skips the string column 'Name'; pandas 2.x no longer drops
# non-numeric columns automatically and raises otherwise.
print(df.std(numeric_only=True))

Age       2.734262
Rating    0.698628
dtype: float64


In [60]:
# Returns the number of non-null values in each column.
print(df.count())

Name      7
Age       7
Rating    7
dtype: int64


In [61]:
# Returns the median of each numeric column.
# numeric_only=True skips the string column 'Name'; pandas 2.x no longer drops
# non-numeric columns automatically and raises TypeError otherwise.
print(df.median(numeric_only=True))

Age       25.0
Rating     3.8
dtype: float64


In [63]:
# Returns the most frequent value(s) of each column. Columns with fewer modes
# than the longest one are padded with NaN, as the Age column shows below.
print(df.mode())

    Name   Age  Rating
0   Jack  23.0    2.56
1  James  25.0    3.20
2  Ricky   NaN    3.24
3  Smith   NaN    3.80
4  Steve   NaN    3.98
5    Tom   NaN    4.23
6    Vin   NaN    4.60


In [64]:
# Returns the minimum of each column (for the string column, the first name in sort order).
print(df.min())

Name      Jack
Age         23
Rating    2.56
dtype: object


In [65]:
# Returns the maximum of each column (for the string column, the last name in sort order).
print(df.max())

Name      Vin
Age        30
Rating    4.6
dtype: object


In [66]:
# abs() is not defined for the string column 'Name', so this call raises.
# The TypeError is the point of the demonstration: catch and print it so that
# "Restart & Run All" continues past this cell.
try:
    print(df.abs())
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: bad operand type for abs(): 'str'

In [67]:
# Returns the product of the values in each numeric column.
# numeric_only=True skips the string column 'Name'; pandas 2.x no longer drops
# non-numeric columns automatically and raises TypeError otherwise.
print(df.prod(numeric_only=True))

Age       7.478738e+09
Rating    7.810877e+03
dtype: float64


In [68]:
# Cumulative sum down each column; for the string column the names are concatenated row by row.
print(df.cumsum())

                             Name  Age Rating
0                             Tom   25   4.23
1                        TomJames   51   7.47
2                   TomJamesRicky   76  11.45
3                TomJamesRickyVin   99  14.01
4           TomJamesRickyVinSteve  129  17.21
5      TomJamesRickyVinSteveSmith  158  21.81
6  TomJamesRickyVinSteveSmithJack  181  25.61


In [69]:
# A cumulative product cannot be computed for the string column 'Name', so
# this call raises. The TypeError is the point of the demonstration: catch and
# print it so that "Restart & Run All" continues past this cell.
try:
    print(df.cumprod())
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: can't multiply sequence by non-int of type 'str'

#### Note − Since a DataFrame is a heterogeneous data structure, generic operations don’t work with all functions.

- Functions like __sum()__, __cumsum()__ work with both numeric and character (or) string data elements without any error. Though in practice character aggregations are rarely used, these functions do not throw any exception.
- Functions like __abs()__, __cumprod()__ throw exception when the DataFrame contains character or string data because such operations cannot be performed.

### Summarizing Data

In [74]:
# describe() returns a DataFrame of summary statistics; with no arguments only
# the numeric columns (Age, Rating) are summarized.
print(df.describe())

             Age    Rating
count   7.000000  7.000000
mean   25.857143  3.658571
std     2.734262  0.698628
min    23.000000  2.560000
25%    24.000000  3.220000
50%    25.000000  3.800000
75%    27.500000  4.105000
max    30.000000  4.600000


This function gives the count, mean, std, min, max and quartile (25%, 50%, 75%) values. By default it excludes the character columns and summarizes only the numeric columns. __'include'__ is the argument used to specify which columns should be considered for summarizing. It takes a list of values; by default, 'number'.

- __object__ − Summarizes String columns
- __number__ − Summarizes Numeric columns
- __all__ − Summarizes all columns together (Should not pass it as a list value)

In [75]:
# Summarize only the string (object) columns: count, number of unique values,
# most frequent value and its frequency.
print(df.describe(include='object'))

        Name
count      7
unique     7
top     Jack
freq       1


In [76]:
# Summarize only the numeric columns; here this gives the same table as describe() with no arguments.
print(df.describe(include='number'))

             Age    Rating
count   7.000000  7.000000
mean   25.857143  3.658571
std     2.734262  0.698628
min    23.000000  2.560000
25%    24.000000  3.220000
50%    25.000000  3.800000
75%    27.500000  4.105000
max    30.000000  4.600000


In [77]:
# Summarize every column together; statistics that do not apply to a column's
# type are shown as NaN. Note that 'all' is passed as a plain string, not a list.
print(df.describe(include='all'))

        Name        Age    Rating
count      7   7.000000  7.000000
unique     7        NaN       NaN
top     Jack        NaN       NaN
freq       1        NaN       NaN
mean     NaN  25.857143  3.658571
std      NaN   2.734262  0.698628
min      NaN  23.000000  2.560000
25%      NaN  24.000000  3.220000
50%      NaN  25.000000  3.800000
75%      NaN  27.500000  4.105000
max      NaN  30.000000  4.600000


# Function Application 