# Pandas



### Pandas is one of the core tools used in machine learning workflows.
### It is used for data cleaning and analysis.
### Other features: exploring, cleaning, transforming, and visualizing data.
### Pandas is an open-source Python package built on top of NumPy.

In [1]:
import pandas as pd

In [2]:
# Build a Series from a plain Python list; pandas supplies the default 0..n-1 index.
a = pd.Series(data=[1, 2, 3, 4, 5, 6])
a

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [3]:
type(a)

pandas.core.series.Series

In [4]:
# accessing elements from a Series
a[3]

4

In [5]:
# A Series can hold strings too; each character of 'abz' becomes one element.
a = pd.Series(list('abz'))
a

0    a
1    b
2    z
dtype: object

In [6]:
# Date index: one entry per calendar day from 14 to 25 June 2000 (both ends included).
# The bounds are written as ISO 8601 (YYYY-MM-DD) strings because day-first strings
# such as '14-06-2000' are ambiguous to the date parser.
date = pd.date_range(start='2000-06-14', end='2000-06-25')
date

DatetimeIndex(['2000-06-14', '2000-06-15', '2000-06-16', '2000-06-17',
               '2000-06-18', '2000-06-19', '2000-06-20', '2000-06-21',
               '2000-06-22', '2000-06-23', '2000-06-24', '2000-06-25'],
              dtype='datetime64[ns]', freq='D')

In [7]:
type(date)

pandas.core.indexes.datetimes.DatetimeIndex

## Pandas Dataframe

In [8]:
# creating lists of data
name = ['Deepanshu','Gunjan','Vikrant','Amrita','Neha']

In [9]:
domain = ['AI/ML','Java','JS','Python','AI/ML']

In [10]:
experience = [3,2,4,1,3]

In [11]:
# creating a dictionary from the lists
data = {'Name': name,
        'Domain':domain,
        'Experience':experience
}

In [12]:
# passing dictionary to DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Domain,Experience
0,Deepanshu,AI/ML,3
1,Gunjan,Java,2
2,Vikrant,JS,4
3,Amrita,Python,1
4,Neha,AI/ML,3


In [13]:
# zip function: pairs up the elements at the same position in each list and makes one tuple per position

zipped = list(zip(name,domain,experience))
zipped

[('Deepanshu', 'AI/ML', 3),
 ('Gunjan', 'Java', 2),
 ('Vikrant', 'JS', 4),
 ('Amrita', 'Python', 1),
 ('Neha', 'AI/ML', 3)]

In [14]:
import numpy as np

In [15]:
# 5x4 array of random floats (5 rows, 4 columns).
a = np.random.random(size=(5, 4))
a

array([[0.46476612, 0.87324269, 0.82275691, 0.0070718 ],
       [0.14813534, 0.66198363, 0.75683911, 0.28584597],
       [0.71864111, 0.81906685, 0.22071165, 0.89162087],
       [0.4248475 , 0.20570935, 0.20827695, 0.7958653 ],
       [0.66359296, 0.69941124, 0.30244239, 0.15540184]])

In [16]:
df1 = pd.DataFrame(a) 
df1

Unnamed: 0,0,1,2,3
0,0.464766,0.873243,0.822757,0.007072
1,0.148135,0.661984,0.756839,0.285846
2,0.718641,0.819067,0.220712,0.891621
3,0.424847,0.205709,0.208277,0.795865
4,0.663593,0.699411,0.302442,0.155402


In [17]:
# Rebuild the frame with labelled axes: rows A-E and columns W-Z.
df1 = pd.DataFrame(a, index=list('ABCDE'), columns=list('WXYZ'))
df1

Unnamed: 0,W,X,Y,Z
A,0.464766,0.873243,0.822757,0.007072
B,0.148135,0.661984,0.756839,0.285846
C,0.718641,0.819067,0.220712,0.891621
D,0.424847,0.205709,0.208277,0.795865
E,0.663593,0.699411,0.302442,0.155402


In [18]:
df1.drop('Z', axis=1)
# axis=1 refers to columns; by default (axis=0) drop works on rows

Unnamed: 0,W,X,Y
A,0.464766,0.873243,0.822757
B,0.148135,0.661984,0.756839
C,0.718641,0.819067,0.220712
D,0.424847,0.205709,0.208277
E,0.663593,0.699411,0.302442


In [19]:
df1

Unnamed: 0,W,X,Y,Z
A,0.464766,0.873243,0.822757,0.007072
B,0.148135,0.661984,0.756839,0.285846
C,0.718641,0.819067,0.220712,0.891621
D,0.424847,0.205709,0.208277,0.795865
E,0.663593,0.699411,0.302442,0.155402


In [20]:
# to drop the column permanently, pass inplace=True
df1.drop('Z', axis=1, inplace = True)
df1

Unnamed: 0,W,X,Y
A,0.464766,0.873243,0.822757
B,0.148135,0.661984,0.756839
C,0.718641,0.819067,0.220712
D,0.424847,0.205709,0.208277
E,0.663593,0.699411,0.302442


In [21]:
# accessing a column's values by its name
df1['W']

A    0.464766
B    0.148135
C    0.718641
D    0.424847
E    0.663593
Name: W, dtype: float64

In [22]:
# loc selects by index label and returns the corresponding row
df1.loc['A']

W    0.464766
X    0.873243
Y    0.822757
Name: A, dtype: float64

In [23]:
# to fetch rows or columns by integer position, use iloc (integer location)
df1.iloc[2]

W    0.718641
X    0.819067
Y    0.220712
Name: C, dtype: float64

In [24]:
# Draw 20 random integers between low (10) and high (90) as a 1-D array.
val1 = np.random.randint(10, 90, size=20)
val1

array([30, 17, 15, 23, 12, 66, 58, 35, 69, 40, 13, 47, 81, 43, 51, 30, 18,
       68, 77, 86])

In [25]:
# Sample 20 entries at random from the given list of names.
val2 = np.random.choice(a=['Ansh', 'vans', 'veer', 'shiv'], size=20)
val2

array(['veer', 'Ansh', 'Ansh', 'veer', 'shiv', 'veer', 'veer', 'shiv',
       'Ansh', 'vans', 'veer', 'vans', 'shiv', 'vans', 'shiv', 'veer',
       'shiv', 'veer', 'shiv', 'shiv'], dtype='<U4')

In [26]:
zipped = list(zip(val1,val2))
zipped

[(30, 'veer'),
 (17, 'Ansh'),
 (15, 'Ansh'),
 (23, 'veer'),
 (12, 'shiv'),
 (66, 'veer'),
 (58, 'veer'),
 (35, 'shiv'),
 (69, 'Ansh'),
 (40, 'vans'),
 (13, 'veer'),
 (47, 'vans'),
 (81, 'shiv'),
 (43, 'vans'),
 (51, 'shiv'),
 (30, 'veer'),
 (18, 'shiv'),
 (68, 'veer'),
 (77, 'shiv'),
 (86, 'shiv')]

In [27]:
# creating dataframe
df = pd.DataFrame(data=zipped,columns=['Age','Name'])
df

Unnamed: 0,Age,Name
0,30,veer
1,17,Ansh
2,15,Ansh
3,23,veer
4,12,shiv
5,66,veer
6,58,veer
7,35,shiv
8,69,Ansh
9,40,vans


In [28]:
type(df)

pandas.core.frame.DataFrame

In [29]:
# head() returns the first 5 rows by default
df.head() 

Unnamed: 0,Age,Name
0,30,veer
1,17,Ansh
2,15,Ansh
3,23,veer
4,12,shiv


In [30]:
df.head(2)

Unnamed: 0,Age,Name
0,30,veer
1,17,Ansh


In [31]:
# tail() returns the last rows (5 by default)
df.tail()

Unnamed: 0,Age,Name
15,30,veer
16,18,shiv
17,68,veer
18,77,shiv
19,86,shiv


In [32]:
df.tail(3)

Unnamed: 0,Age,Name
17,68,veer
18,77,shiv
19,86,shiv


In [33]:
# returns the shape as (rows, columns)
df.shape

(20, 2)

In [34]:
# returns the column names; dtype='object' is what pandas reports for string labels
df.columns

Index(['Age', 'Name'], dtype='object')

### Accessing DataFrame

In [35]:
df['Name'] # pass column name and will access that column

0     veer
1     Ansh
2     Ansh
3     veer
4     shiv
5     veer
6     veer
7     shiv
8     Ansh
9     vans
10    veer
11    vans
12    shiv
13    vans
14    shiv
15    veer
16    shiv
17    veer
18    shiv
19    shiv
Name: Name, dtype: object

In [36]:
df.info() # Will give you complete Dataframe information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Age     20 non-null     int64 
 1   Name    20 non-null     object
dtypes: int64(1), object(1)
memory usage: 448.0+ bytes


In [37]:
df.describe()

Unnamed: 0,Age
count,20.0
mean,43.95
std,24.411979
min,12.0
25%,21.75
50%,41.5
75%,66.5
max,86.0


In [38]:
df.set_index('Name',inplace=True)
df

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
veer,30
Ansh,17
Ansh,15
veer,23
shiv,12
veer,66
veer,58
shiv,35
Ansh,69
vans,40


In [39]:
df.sort_index(axis=0, ascending=True)

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Ansh,17
Ansh,15
Ansh,69
shiv,86
shiv,18
shiv,12
shiv,51
shiv,35
shiv,77
shiv,81
