In [1]:
import pandas as pd

# Pandas - Data Structures
- __Series__ - 1-dimensional homogeneous array, size-immutable.
- __Data Frame__ - 2 Dimension size-mutable tabular structure with potentially heterogeneously typed columns.
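- __Panel__ - 3 Dimension size-mutable array. (Note: Panel was deprecated in pandas 0.20 and removed in pandas 0.25; it is listed here for historical reference only.)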
- DataFrame is a container of Series, Panel is a container of DataFrame.
- All Pandas data structures are value mutable (can be changed) and except Series all are size mutable. Series is size immutable.

# Pandas - Series

### pandas.Series( data, index, dtype, copy)
- __data__ - takes various forms like ndarray, list, dictionary,constants
- __index__ - Index values must be hashable and the same length as data (non-unique values are allowed). Defaults to `np.arange(n)`, i.e. `0, 1, ..., n-1`, if no index is passed.
- __dtype__ - is for data type. If None, data type will be inferred
- __copy__ - Copy data. Default False

In [3]:
# Creating an empty Series.
# The dtype is passed explicitly: with no data and no dtype, pandas 1.x emits
# a DeprecationWarning and pandas 2.x infers `object` instead of `float64`,
# so the printed result would otherwise depend on the installed version.
s = pd.Series(dtype='float64')
print(s)

Series([], dtype: float64)


### Create a Series from ndarray
- If data is an ndarray, then index passed must be of the same length. If no index is passed, then by default index will be __range(n)__ where __n__ is array length, i.e., __[0, 1, 2, ..., len(array)-1]__.

In [4]:
import numpy as np

# Nine consecutive integers (0..8) serve as the Series values.
array = np.arange(0, 9)

# No index is supplied, so pandas assigns the default positional labels.
s = pd.Series(data=array)
print(s)

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
dtype: int32


In [7]:
# Same values as before, this time with explicit index labels.
import numpy as np

array = np.arange(0, 9)

# The second argument of pd.Series is the index; here it is named explicitly.
s = pd.Series(data=array, index=np.arange(10, 19))
print(s)

# The labels 10..18 passed as the index replace the default 0..8 in the output.

10    0
11    1
12    2
13    3
14    4
15    5
16    6
17    7
18    8
dtype: int32


### Create a Series from dict
- A __dict__ can be passed as input and if no index is specified, then the dictionary keys are used to construct the index. With Python >= 3.6 and pandas >= 0.23 the keys are kept in the dict's insertion order; older versions sorted them. If __index__ is passed, the values in data corresponding to the labels in the index will be pulled out.

In [9]:
# Map each label to its float value.
data = dict(a=0.0, b=1.0, c=2.0)

s = pd.Series(data=data)
print(s)

# Observe: the dictionary keys become the index labels of the Series.

a    0.0
b    1.0
c    2.0
dtype: float64


In [11]:
data = dict(a=0.0, b=1.0, c=2.0)

# Request the labels in a custom order, including one ('d') that is not a key.
labels = ['b', 'c', 'd', 'a']
s = pd.Series(data=data, index=labels)
print(s)

# Observe: the requested index order is kept, and the label with no matching
# key in the dict is filled with NaN (Not a Number).

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64


### Create a Series from Scalar
- If data is a scalar value, an index must be provided. The value will be repeated to match the length of __index__

In [12]:
# A scalar is broadcast to every label of the supplied index.
labels = [0, 1, 2, 3]
s = pd.Series(data=5, index=labels)
print(s)

0    5
1    5
2    5
3    5
dtype: int64


### Accessing Data from Series with Position, Slicing, Index(Label)

In [17]:
# A small labelled Series used to demonstrate positional, label and slice access.
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(s)
print('\n')

# Positional access goes through .iloc. A bare s[3] on a label-indexed Series
# relies on a positional fallback that recent pandas deprecates, so it is
# spelled out explicitly here.
print(s.iloc[3])  # Element at position 3 (the 4th element)
print('\n')

print(s['b'])  # Access by label
print('\n')

print(s.iloc[3:])  # Everything from position 3 to the end
print('\n')

print(s.iloc[:3])  # The first 3 elements (positions 0, 1, 2)
print('\n')

print(s.iloc[-3:])  # The last 3 elements

a    1
b    2
c    3
d    4
e    5
dtype: int64


4


2


d    4
e    5
dtype: int64


a    1
b    2
c    3
dtype: int64


c    3
d    4
e    5
dtype: int64


# Pandas - DataFrame

### pandas.DataFrame( data, index, columns, dtype, copy)
- __data__ - takes various forms like __ndarray__, __series__, __map__, __lists__, __dict__, __constants__ and also another __DataFrame__
- __index__ - Row labels to use for the resulting frame. Optional; defaults to _np.arange(n)_ if no index is passed.
- __columns__ - Column labels to use for the resulting frame. Optional; defaults to _np.arange(n)_ if no column labels are passed.
- __dtype__ - Data type of each column.
- __copy__ - Whether to copy the data from the input. Default False.

In [18]:
# A DataFrame built with no data has no rows and no columns.
df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


### Creating a DataFrame from __Lists__
- The DataFrame can be created using a single list or a list of lists.

In [19]:
# A flat list becomes a single column labelled 0.
data = list(range(10, 15))
df = pd.DataFrame(data=data)
print(df)

    0
0  10
1  11
2  12
3  13
4  14


In [24]:
# Each inner list is one row: [name, age].
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]

# Without column names the columns get the default labels 0 and 1.
df = pd.DataFrame(data)
print(df)
print('\n')

# Name the columns and store Age as float.
# Passing dtype=float to the constructor applies it to every column; the
# string 'Name' column cannot be cast, which recent pandas rejects with an
# error. Casting only the numeric column with astype avoids that.
df_new = pd.DataFrame(data, columns=['Name', 'Age']).astype({'Age': float})
print(df_new)

        0   1
0    Alex  10
1     Bob  12
2  Clarke  13


     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0


### Create a DataFrame from Dict of ndarrays / Lists
- All the ndarrays must be of the same length. If index is passed, then the length of the index should be equal to the length of the arrays.<br>If no index is passed, then by default, index will be range(n), where n is the array length.

In [27]:
# Two equal-length lists, one per column.
Name_List = ['Tom', 'Jack', 'Steve', 'Ricky']
Age_List = [28, 34, 29, 42]

# By default the dictionary keys are used as the column names.
data_dict = dict(Name=Name_List, Age=Age_List)

df = pd.DataFrame(data=data_dict)
print(df)

    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42


In [26]:
Name_List = ['Tom', 'Jack', 'Steve', 'Ricky']
Age_List = [28, 34, 29, 42]
data_dict = dict(Name=Name_List, Age=Age_List)

# One row label per list element; the labels replace the default 0..n-1 index.
row_labels = ['rank1', 'rank2', 'rank3', 'rank4']
df = pd.DataFrame(data=data_dict, index=row_labels)
print(df)

        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29
rank4  Ricky   42


### Create a DataFrame from List of Dicts
- List of Dictionaries can be passed as input data to create a DataFrame. The __dictionary keys__ are by default taken as __column names__.

In [28]:
# Each dict is one row; a key missing from a row ('c' in the first) yields NaN.
data = [dict(a=1, b=2), dict(a=5, b=10, c=20)]

df = pd.DataFrame(data=data)
print(df)

   a   b     c
0  1   2   NaN
1  5  10  20.0


In [30]:
data = [dict(a=1, b=2), dict(a=5, b=10, c=20)]
row_labels = ['first', 'second']

# Both requested columns exist as dictionary keys, so their values are kept.
df1 = pd.DataFrame(data=data, index=row_labels, columns=['a', 'b'])

# 'b1' is not a key in any of the dicts, so that column is filled with NaN.
df2 = pd.DataFrame(data=data, index=row_labels, columns=['a', 'b1'])

print(df1)
print('\n')
print(df2)

        a   b
first   1   2
second  5  10


        a  b1
first   1 NaN
second  5 NaN


### Create a DataFrame from Dict of Series
- Dictionary of Series can be passed to form a DataFrame. The resultant index is the union of all the series indexes passed.

In [33]:
# The two Series have different indexes; the frame's index is their union,
# and labels missing from a Series ('c' for Name) become NaN.
names = pd.Series(['Shri', 'Harsha'], index=['a', 'b'])
ages = pd.Series([10, 11, 12], index=['a', 'b', 'c'])
data = {'Name': names, 'Age': ages}

df = pd.DataFrame(data=data)
print(df)

     Name  Age
a    Shri   10
b  Harsha   11
c     NaN   12


### Column Selection

In [35]:
# Two Series of different lengths, aligned on their labels.
first = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
second = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': first, 'two': second}

df = pd.DataFrame(data=d)

print(df)
print('\n')

# Indexing the frame with a column name returns that column as a Series.
print(df['two'])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


a    1
b    2
c    3
d    4
Name: two, dtype: int64


### Column Addition

In [38]:
first = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
second = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': first, 'two': second}

df = pd.DataFrame(data=d)
print(df)
print('\n')

# Add a new column from a Series; it is aligned on the index, so row 'd'
# (absent from the new Series) receives NaN.
third = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
df['three'] = third

print(df)
print('\n')

# Add a new column computed element-wise from two existing columns.
df['four'] = df['one'].add(df['three'])
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


   one  two  three  four
a  1.0    1   10.0  11.0
b  2.0    2   20.0  22.0
c  3.0    3   30.0  33.0
d  NaN    4    NaN   NaN


### Column Deletion

In [41]:
first = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
second = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
third = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
d = {'one': first, 'two': second, 'three': third}

df = pd.DataFrame(data=d)
print(df)
print('\n')

# Remove a column with the del statement.
del df['two']
print(df)
print('\n')

# Remove a column with DataFrame.pop; the popped column is returned, but the
# return value is not used here.
df.pop('three')
print(df)
print('\n')

   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


   one  three
a  1.0   10.0
b  2.0   20.0
c  3.0   30.0
d  NaN    NaN


   one
a  1.0
b  2.0
c  3.0
d  NaN




### Row Selection