# Pandas



### Pandas is one of the core tools used in machine learning workflows.
### It is used for data cleaning and analysis.
### Other features: exploring, cleaning, transforming, and visualizing data.
### Pandas is an open-source Python package built on top of NumPy.

In [1]:
import pandas as pd

In [2]:
a = pd.Series([1,2,3,4,5,6])
a

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [3]:
type(a)

pandas.core.series.Series

In [4]:
# accessing an element of the Series by its index label
a[3]

4

In [5]:
a = pd.Series(['a','b','z'])
a

0    a
1    b
2    z
dtype: object

In [6]:
# Date index: one entry per day from 14 June 2000 through 25 June 2000 (inclusive).
# ISO 8601 (YYYY-MM-DD) strings are used so the day/month order is unambiguous
# and does not depend on pandas guessing a day-first format.
date = pd.date_range(start='2000-06-14', end='2000-06-25')
date

DatetimeIndex(['2000-06-14', '2000-06-15', '2000-06-16', '2000-06-17',
               '2000-06-18', '2000-06-19', '2000-06-20', '2000-06-21',
               '2000-06-22', '2000-06-23', '2000-06-24', '2000-06-25'],
              dtype='datetime64[ns]', freq='D')

In [7]:
type(date)

pandas.core.indexes.datetimes.DatetimeIndex

## Pandas Dataframe

In [8]:
# creating array of data
name = ['Deepanshu','Gunjan','Vikrant','Amrita','Neha']

In [9]:
domain = ['AI/ML','Java','JS','Python','AI/ML']

In [10]:
experience = [3,2,4,1,3]

In [11]:
# creating a dictionary from the lists
data = {'Name': name,
        'Domain':domain,
        'Experience':experience
}

In [12]:
# passing dictionary to DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Domain,Experience
0,Deepanshu,AI/ML,3
1,Gunjan,Java,2
2,Vikrant,JS,4
3,Amrita,Python,1
4,Neha,AI/ML,3


In [13]:
# zip function: groups the elements found at the same position in each list into one tuple per position

zipped = list(zip(name,domain,experience))
zipped

[('Deepanshu', 'AI/ML', 3),
 ('Gunjan', 'Java', 2),
 ('Vikrant', 'JS', 4),
 ('Amrita', 'Python', 1),
 ('Neha', 'AI/ML', 3)]

In [14]:
import numpy as np

In [15]:
a = np.random.random([5,4])
a

array([[0.45158197, 0.03016642, 0.00358231, 0.37253811],
       [0.78177851, 0.13681304, 0.69017415, 0.90955555],
       [0.34495701, 0.45509816, 0.91434293, 0.67273532],
       [0.23215006, 0.57116316, 0.748195  , 0.03922744],
       [0.41285724, 0.56758941, 0.38631141, 0.69299754]])

In [16]:
df1 = pd.DataFrame(a) 
df1

Unnamed: 0,0,1,2,3
0,0.451582,0.030166,0.003582,0.372538
1,0.781779,0.136813,0.690174,0.909556
2,0.344957,0.455098,0.914343,0.672735
3,0.23215,0.571163,0.748195,0.039227
4,0.412857,0.567589,0.386311,0.692998


In [17]:
df1 = pd.DataFrame(data = a, columns='W X Y Z'.split(), index = 'A B C D E'.split())
df1

Unnamed: 0,W,X,Y,Z
A,0.451582,0.030166,0.003582,0.372538
B,0.781779,0.136813,0.690174,0.909556
C,0.344957,0.455098,0.914343,0.672735
D,0.23215,0.571163,0.748195,0.039227
E,0.412857,0.567589,0.386311,0.692998


In [18]:
df1.drop('Z', axis=1)
# axis=1 means drop a column; by default (axis=0) drop works on rows.
# This returns a new DataFrame and leaves df1 itself unchanged.

Unnamed: 0,W,X,Y
A,0.451582,0.030166,0.003582
B,0.781779,0.136813,0.690174
C,0.344957,0.455098,0.914343
D,0.23215,0.571163,0.748195
E,0.412857,0.567589,0.386311


In [19]:
df1

Unnamed: 0,W,X,Y,Z
A,0.451582,0.030166,0.003582,0.372538
B,0.781779,0.136813,0.690174,0.909556
C,0.344957,0.455098,0.914343,0.672735
D,0.23215,0.571163,0.748195,0.039227
E,0.412857,0.567589,0.386311,0.692998


In [20]:
# to make the drop permanent (modify df1 itself), pass inplace=True
df1.drop('Z', axis=1, inplace = True)
df1

Unnamed: 0,W,X,Y
A,0.451582,0.030166,0.003582
B,0.781779,0.136813,0.690174
C,0.344957,0.455098,0.914343
D,0.23215,0.571163,0.748195
E,0.412857,0.567589,0.386311


In [21]:
# indexing with [] and a column name selects that column
df1['W']

A    0.451582
B    0.781779
C    0.344957
D    0.232150
E    0.412857
Name: W, dtype: float64

In [22]:
# loc is label-based: passing an index label returns the corresponding row
df1.loc['A']

W    0.451582
X    0.030166
Y    0.003582
Name: A, dtype: float64

In [23]:
# to fetch rows or columns by integer position we use iloc (integer location)
df1.iloc[2]

W    0.344957
X    0.455098
Y    0.914343
Name: C, dtype: float64

In [24]:
df1.loc['A','W']

0.451581965075469

In [25]:
df1.loc[['A','B'],['W','X']]

Unnamed: 0,W,X
A,0.451582,0.030166
B,0.781779,0.136813


In [26]:
# Draw random integers from the half-open range [10, 90): the lower bound is
# inclusive, the upper bound is exclusive. `size` is the shape of the result,
# here a 1-D array of 20 values.
val1 = np.random.randint(10, 90, size=(20,))
val1

array([47, 56, 30, 23, 13, 37, 51, 68, 44, 41, 79, 37, 82, 13, 67, 89, 57,
       89, 62, 82])

In [27]:
# randomly pick 20 elements (with replacement) from the given list
val2 = np.random.choice(['Ansh','vans','veer','shiv'],20)
val2

array(['shiv', 'shiv', 'veer', 'Ansh', 'veer', 'shiv', 'vans', 'veer',
       'vans', 'shiv', 'Ansh', 'shiv', 'shiv', 'vans', 'Ansh', 'veer',
       'veer', 'vans', 'Ansh', 'Ansh'], dtype='<U4')

In [28]:
zipped = list(zip(val1,val2))
zipped

[(47, 'shiv'),
 (56, 'shiv'),
 (30, 'veer'),
 (23, 'Ansh'),
 (13, 'veer'),
 (37, 'shiv'),
 (51, 'vans'),
 (68, 'veer'),
 (44, 'vans'),
 (41, 'shiv'),
 (79, 'Ansh'),
 (37, 'shiv'),
 (82, 'shiv'),
 (13, 'vans'),
 (67, 'Ansh'),
 (89, 'veer'),
 (57, 'veer'),
 (89, 'vans'),
 (62, 'Ansh'),
 (82, 'Ansh')]

In [29]:
# creating dataframe
df = pd.DataFrame(data=zipped,columns=['Age','Name'])
df

Unnamed: 0,Age,Name
0,47,shiv
1,56,shiv
2,30,veer
3,23,Ansh
4,13,veer
5,37,shiv
6,51,vans
7,68,veer
8,44,vans
9,41,shiv


In [30]:
type(df)

pandas.core.frame.DataFrame

In [31]:
# head() returns the first 5 rows by default
df.head() 

Unnamed: 0,Age,Name
0,47,shiv
1,56,shiv
2,30,veer
3,23,Ansh
4,13,veer


In [32]:
df.head(2)

Unnamed: 0,Age,Name
0,47,shiv
1,56,shiv


In [33]:
# tail() returns the last rows (5 by default)
df.tail()

Unnamed: 0,Age,Name
15,89,veer
16,57,veer
17,89,vans
18,62,Ansh
19,82,Ansh


In [34]:
df.tail(3)

Unnamed: 0,Age,Name
17,89,vans
18,62,Ansh
19,82,Ansh


In [35]:
#return shape
df.shape

(20, 2)

In [36]:
# returns the column labels as an Index; its dtype is 'object' because the labels are strings
df.columns

Index(['Age', 'Name'], dtype='object')

### Accessing DataFrame

In [37]:
df['Name'] # pass column name and will access that column

0     shiv
1     shiv
2     veer
3     Ansh
4     veer
5     shiv
6     vans
7     veer
8     vans
9     shiv
10    Ansh
11    shiv
12    shiv
13    vans
14    Ansh
15    veer
16    veer
17    vans
18    Ansh
19    Ansh
Name: Name, dtype: object

In [38]:
df.info() # Will give you complete Dataframe information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Age     20 non-null     int64 
 1   Name    20 non-null     object
dtypes: int64(1), object(1)
memory usage: 448.0+ bytes


In [39]:
df.describe()

Unnamed: 0,Age
count,20.0
mean,53.35
std,23.930326
min,13.0
25%,37.0
50%,53.5
75%,70.75
max,89.0


In [40]:
df.set_index('Name',inplace=True)
df

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
shiv,47
shiv,56
veer,30
Ansh,23
veer,13
shiv,37
vans,51
veer,68
vans,44
shiv,41


In [41]:
df.sort_index(axis=0, ascending=True)

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Ansh,82
Ansh,67
Ansh,23
Ansh,79
Ansh,62
shiv,82
shiv,37
shiv,47
shiv,37
shiv,56


In [42]:
df = pd.DataFrame({'Num':[1,2,3,4],'value':[444,555,666,444],'text':['abc','def','abc','ght']})
df

Unnamed: 0,Num,value,text
0,1,444,abc
1,2,555,def
2,3,666,abc
3,4,444,ght


In [43]:
df['value'].unique()

array([444, 555, 666], dtype=int64)

In [44]:
# number of unique values in the 'value' column
df['value'].nunique()

3

In [45]:
df['value'].value_counts()

444    2
555    1
666    1
Name: value, dtype: int64

In [46]:
def mult(x):
    """Return ``x`` multiplied by 2.

    Works for any operand that supports ``x * 2`` (numbers are doubled,
    sequences such as strings and lists are repeated twice).
    """
    doubled = x * 2
    return doubled

In [47]:
df['Num'].apply(mult)

0    2
1    4
2    6
3    8
Name: Num, dtype: int64

In [48]:
df['Num'].sum()

10

In [49]:
df = pd.DataFrame({'col1': [1,2,3,np.nan],
                   'col2': [np.nan,555,666,444]
})

In [50]:
df

Unnamed: 0,col1,col2
0,1.0,
1,2.0,555.0
2,3.0,666.0
3,,444.0


In [51]:
df.isnull()

Unnamed: 0,col1,col2
0,False,True
1,False,False
2,False,False
3,True,False


In [52]:
# drop every row that contains at least one null value (returns a new DataFrame)
df.dropna()

Unnamed: 0,col1,col2
1,2.0,555.0
2,3.0,666.0


In [53]:
df = pd.DataFrame({'col1': [1,2,3,np.nan],
                   'col2': [np.nan,555,666,np.nan],
                   'col3': ['qwe','abc','pqr','gdn']
})
df

Unnamed: 0,col1,col2,col3
0,1.0,,qwe
1,2.0,555.0,abc
2,3.0,666.0,pqr
3,,,gdn


In [54]:
# replace every null value with 'ML' (returns a new DataFrame)
df.fillna('ML')

Unnamed: 0,col1,col2,col3
0,1,ML,qwe
1,2,555,abc
2,3,666,pqr
3,ML,ML,gdn


In [55]:
# drop every column that contains at least one null value
df.dropna(axis=1)

Unnamed: 0,col3
0,qwe
1,abc
2,pqr
3,gdn


In [56]:
# thresh=2 keeps only the rows that have at least 2 non-null values; rows with fewer are dropped
df.dropna(thresh = 2)

Unnamed: 0,col1,col2,col3
0,1.0,,qwe
1,2.0,555.0,abc
2,3.0,666.0,pqr
