## Pandas

### Series

A Series is very similar to a NumPy array; in fact, it is built on top of one. Unlike a plain array, however, a Series can have axis labels, so it can be indexed by those labels instead of only by integer position, and it can hold any Python object.

In [13]:
import numpy as np
import pandas as pd

In [3]:
# NumPy array holding the values for the Series examples
arr = np.array([1, 2, 3])

# Index labels, one per value
labels = 'a b c'.split()

# Plain Python list (note: this name shadows the built-in `list`)
list = [10, 20, 30]

# Dictionary mapping each label to a value (note: this name shadows the built-in `dir`)
dir = dict(a=10, b=20, c=30)


In [4]:
# Build one Series with the default integer index and one with explicit labels
ser1 = pd.Series(arr)
ser2 = pd.Series(data=arr, index=labels)
print(ser1, ser2, sep='\n')

# Look up single values by their index label
print(ser1[0], ser2['a'], sep='\n')

0    1
1    2
2    3
dtype: int32
a    1
b    2
c    3
dtype: int32
1
1


In [5]:
# Creating a Series from a list
pd.Series(list)

0    10
1    20
2    30
dtype: int64

In [6]:
# Creating a Series from a dictionary; the keys become the index labels
pd.Series(dir)

a    10
b    20
c    30
dtype: int64

In [7]:
# Index labels may repeat; selecting a repeated label returns every match
ser3 = pd.Series(data=[1, 2, 3, 4], index='A B D D'.split())
print(ser3)
ser3['D']

A    1
B    2
D    3
D    4
dtype: int64


D    3
D    4
dtype: int64

# DataFrames

A DataFrame is a two-dimensional data structure with the following features:
1. Columns can be of different data types.
2. Size is mutable.
3. Labeled axes (both rows and columns).
4. Arithmetic operations can be performed on rows and columns.

In [14]:
# Seed NumPy's global random number generator so np.random draws are repeatable
np.random.seed(101)

In [17]:
# Creating the data frame.
# Re-seed in this cell so that re-running it always yields the same values;
# otherwise every extra run draws the next numbers from the global generator.
np.random.seed(101)
df = pd.DataFrame(
    np.random.randn(5, 4),
    index='A B C D E'.split(),
    columns='W X Y Z'.split(),
)
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


### Selection and Indexing

In [18]:
# Selecting a single column by name returns a Series
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [20]:
# Passing a list of column names selects several columns at once
df[['W','Z']]

Unnamed: 0,W,Z
A,0.302665,-1.159119
B,-0.134841,0.184502
C,0.807706,0.329646
D,-0.497104,0.484752
E,-0.116773,1.996652


In [38]:
# Each column of a DataFrame is a Series
print (type(df['W']))

<class 'pandas.core.series.Series'>


In [39]:
# Add a column holding the element-wise sum of Y, Z and W
df['new'] = df['Y'].add(df['Z']).add(df['W'])

In [40]:
# Display the frame, now including the 'new' column
df

Unnamed: 0,W,X,Y,Z,new
A,0.302665,1.693723,-1.706086,-1.159119,-2.56254
B,-0.134841,0.390528,0.166905,0.184502,0.216566
C,0.807706,0.07296,0.638787,0.329646,1.776139
D,-0.497104,-0.75407,-0.943406,0.484752,-0.955759
E,-0.116773,1.901755,0.238127,1.996652,2.118006


In [41]:
# Drop the 'new' column. drop() returns a new frame here; df itself is unchanged.
df.drop(columns='new')

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [42]:
# df still contains 'new' because the previous drop was not applied in place
df

Unnamed: 0,W,X,Y,Z,new
A,0.302665,1.693723,-1.706086,-1.159119,-2.56254
B,-0.134841,0.390528,0.166905,0.184502,0.216566
C,0.807706,0.07296,0.638787,0.329646,1.776139
D,-0.497104,-0.75407,-0.943406,0.484752,-0.955759
E,-0.116773,1.901755,0.238127,1.996652,2.118006


In [43]:
# Passing inplace=True makes drop() modify df itself instead of returning a copy
df.drop(columns='new', inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [46]:
# Locating a row by its index label
df.loc['A']

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [48]:
# Locating a row by its integer position
df.iloc[0]

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [49]:
# Selecting a single element by row label and column label
df.loc['A','Z']

-1.1591194155484297

In [51]:
# Selecting a subset of rows and columns by label
df.loc[['A','C'],['X','Y']]

Unnamed: 0,X,Y
A,1.693723,-1.706086
C,0.07296,0.638787
