# Pandas



### Pandas is one of the tools in Machine Learning 
### It is used for data cleaning and analysis. 
### Other features: exploring, cleaning, transforming and visualizing data.
### Pandas is an open-source Python package built on top of NumPy

In [1]:
import pandas as pd

In [2]:
# build a Series holding the integers 1 through 6 (default 0-based index)
a = pd.Series(list(range(1, 7)))
a

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [3]:
type(a)

pandas.core.series.Series

In [4]:
# accessing elements from a series
a[3]

4

In [5]:
a = pd.Series(['a','b','z'])
a

0    a
1    b
2    z
dtype: object

In [6]:
# Date index: a daily DatetimeIndex from 14 June 2000 to 25 June 2000 (both ends included).
# The bounds are written in ISO 8601 (YYYY-MM-DD) so the parse is unambiguous
# and does not rely on pandas guessing a day-first format.
date = pd.date_range(start='2000-06-14', end='2000-06-25')
date

DatetimeIndex(['2000-06-14', '2000-06-15', '2000-06-16', '2000-06-17',
               '2000-06-18', '2000-06-19', '2000-06-20', '2000-06-21',
               '2000-06-22', '2000-06-23', '2000-06-24', '2000-06-25'],
              dtype='datetime64[ns]', freq='D')

In [7]:
type(date)

pandas.core.indexes.datetimes.DatetimeIndex

## Pandas Dataframe

In [8]:
# creating lists of data
name = ['Deepanshu','Gunjan','Vikrant','Amrita','Neha']

In [9]:
domain = ['AI/ML','Java','JS','Python','AI/ML']

In [10]:
experience = [3,2,4,1,3]

In [11]:
# creating a dictionary from the lists
data = {'Name': name,
        'Domain':domain,
        'Experience':experience
}

In [12]:
# passing dictionary to DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Domain,Experience
0,Deepanshu,AI/ML,3
1,Gunjan,Java,2
2,Vikrant,JS,4
3,Amrita,Python,1
4,Neha,AI/ML,3


In [13]:
# zip: take the element at each position from every list and group them into one tuple per position

zipped = [*zip(name, domain, experience)]
zipped

[('Deepanshu', 'AI/ML', 3),
 ('Gunjan', 'Java', 2),
 ('Vikrant', 'JS', 4),
 ('Amrita', 'Python', 1),
 ('Neha', 'AI/ML', 3)]

In [14]:
import numpy as np

In [15]:
a = np.random.random([5,4])
a

array([[0.09587545, 0.56297389, 0.73219429, 0.10897103],
       [0.42518   , 0.48050684, 0.4421099 , 0.74561501],
       [0.50834952, 0.09593832, 0.49487053, 0.98965218],
       [0.61704408, 0.89919933, 0.7723343 , 0.57905651],
       [0.01516904, 0.68241009, 0.19442336, 0.45695484]])

In [16]:
df1 = pd.DataFrame(a) 
df1

Unnamed: 0,0,1,2,3
0,0.095875,0.562974,0.732194,0.108971
1,0.42518,0.480507,0.44211,0.745615
2,0.50835,0.095938,0.494871,0.989652
3,0.617044,0.899199,0.772334,0.579057
4,0.015169,0.68241,0.194423,0.456955


In [17]:
# wrap the array in a DataFrame with column labels W..Z and row labels A..E
df1 = pd.DataFrame(data=a, index=list('ABCDE'), columns=list('WXYZ'))
df1

Unnamed: 0,W,X,Y,Z
A,0.095875,0.562974,0.732194,0.108971
B,0.42518,0.480507,0.44211,0.745615
C,0.50835,0.095938,0.494871,0.989652
D,0.617044,0.899199,0.772334,0.579057
E,0.015169,0.68241,0.194423,0.456955


In [18]:
df1.drop('Z', axis=1)
# axis=1 refers to columns; by default (axis=0) drop works on rows. This returns a new DataFrame and leaves df1 unchanged.

Unnamed: 0,W,X,Y
A,0.095875,0.562974,0.732194
B,0.42518,0.480507,0.44211
C,0.50835,0.095938,0.494871
D,0.617044,0.899199,0.772334
E,0.015169,0.68241,0.194423


In [19]:
df1

Unnamed: 0,W,X,Y,Z
A,0.095875,0.562974,0.732194,0.108971
B,0.42518,0.480507,0.44211,0.745615
C,0.50835,0.095938,0.494871,0.989652
D,0.617044,0.899199,0.772334,0.579057
E,0.015169,0.68241,0.194423,0.456955


In [20]:
# to drop permanently (modify df1 itself), pass inplace=True
df1.drop('Z', axis=1, inplace = True)
df1

Unnamed: 0,W,X,Y
A,0.095875,0.562974,0.732194
B,0.42518,0.480507,0.44211
C,0.50835,0.095938,0.494871
D,0.617044,0.899199,0.772334
E,0.015169,0.68241,0.194423


In [21]:
# accessing a column's values by its label
df1['W']

A    0.095875
B    0.425180
C    0.508350
D    0.617044
E    0.015169
Name: W, dtype: float64

In [22]:
# loc selects by label: here it returns the row whose index label is 'A'
df1.loc['A']

W    0.095875
X    0.562974
Y    0.732194
Name: A, dtype: float64

In [23]:
# to fetch by integer position we use iloc (integer location); here it returns the third row
df1.iloc[2]

W    0.508350
X    0.095938
Y    0.494871
Name: C, dtype: float64

In [24]:
df1.loc['A','W']

0.09587545476830528

In [25]:
df1.loc[['A','B'],['W','X']]

Unnamed: 0,W,X
A,0.095875,0.562974
B,0.42518,0.480507


In [26]:
# draw random integers from low (inclusive) up to high (exclusive); size sets the output shape
val1 = np.random.randint(low =10,high=90,size=[20,])
val1

array([48, 40, 22, 88, 12, 52, 49, 66, 67, 48, 66, 78, 78, 36, 75, 54, 74,
       78, 67, 13])

In [27]:
# randomly pick 20 elements from the given list (values may repeat)
val2 = np.random.choice(['Ansh','vans','veer','shiv'],20)
val2

array(['Ansh', 'Ansh', 'veer', 'shiv', 'Ansh', 'vans', 'Ansh', 'vans',
       'veer', 'Ansh', 'Ansh', 'veer', 'Ansh', 'vans', 'vans', 'shiv',
       'Ansh', 'vans', 'vans', 'vans'], dtype='<U4')

In [28]:
zipped = list(zip(val1,val2))
zipped

[(48, 'Ansh'),
 (40, 'Ansh'),
 (22, 'veer'),
 (88, 'shiv'),
 (12, 'Ansh'),
 (52, 'vans'),
 (49, 'Ansh'),
 (66, 'vans'),
 (67, 'veer'),
 (48, 'Ansh'),
 (66, 'Ansh'),
 (78, 'veer'),
 (78, 'Ansh'),
 (36, 'vans'),
 (75, 'vans'),
 (54, 'shiv'),
 (74, 'Ansh'),
 (78, 'vans'),
 (67, 'vans'),
 (13, 'vans')]

In [29]:
# creating dataframe
df = pd.DataFrame(data=zipped,columns=['Age','Name'])
df

Unnamed: 0,Age,Name
0,48,Ansh
1,40,Ansh
2,22,veer
3,88,shiv
4,12,Ansh
5,52,vans
6,49,Ansh
7,66,vans
8,67,veer
9,48,Ansh


In [30]:
type(df)

pandas.core.frame.DataFrame

In [31]:
# head() returns the first 5 rows by default
df.head() 

Unnamed: 0,Age,Name
0,48,Ansh
1,40,Ansh
2,22,veer
3,88,shiv
4,12,Ansh


In [32]:
df.head(2)

Unnamed: 0,Age,Name
0,48,Ansh
1,40,Ansh


In [33]:
# tail() returns the last 5 rows by default
df.tail()

Unnamed: 0,Age,Name
15,54,shiv
16,74,Ansh
17,78,vans
18,67,vans
19,13,vans


In [34]:
df.tail(3)

Unnamed: 0,Age,Name
17,78,vans
18,67,vans
19,13,vans


In [35]:
#return shape
df.shape

(20, 2)

In [36]:
# returns the column labels as an Index (its dtype is 'object' because the labels are strings)
df.columns

Index(['Age', 'Name'], dtype='object')

### Accessing DataFrame

In [37]:
df['Name'] # pass column name and will access that column

0     Ansh
1     Ansh
2     veer
3     shiv
4     Ansh
5     vans
6     Ansh
7     vans
8     veer
9     Ansh
10    Ansh
11    veer
12    Ansh
13    vans
14    vans
15    shiv
16    Ansh
17    vans
18    vans
19    vans
Name: Name, dtype: object

In [38]:
df.info() # Will give you complete Dataframe information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Age     20 non-null     int64 
 1   Name    20 non-null     object
dtypes: int64(1), object(1)
memory usage: 448.0+ bytes


In [39]:
df.describe()

Unnamed: 0,Age
count,20.0
mean,55.55
std,22.234338
min,12.0
25%,46.0
50%,60.0
75%,74.25
max,88.0


In [40]:
df.set_index('Name',inplace=True)
df

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Ansh,48
Ansh,40
veer,22
shiv,88
Ansh,12
vans,52
Ansh,49
vans,66
veer,67
Ansh,48


In [41]:
df.sort_index(axis=0, ascending=True)

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Ansh,48
Ansh,74
Ansh,78
Ansh,66
Ansh,49
Ansh,48
Ansh,12
Ansh,40
shiv,88
shiv,54


In [42]:
# small demo frame; the 'value' and 'text' columns each contain a repeated entry
df = pd.DataFrame(
    dict(
        Num=[1, 2, 3, 4],
        value=[444, 555, 666, 444],
        text=['abc', 'def', 'abc', 'ght'],
    )
)
df

Unnamed: 0,Num,value,text
0,1,444,abc
1,2,555,def
2,3,666,abc
3,4,444,ght


In [43]:
df['value'].unique()

array([444, 555, 666], dtype=int64)

In [44]:
# number of unique values in columns value
df['value'].nunique()

3

In [45]:
df['value'].value_counts()

444    2
555    1
666    1
Name: value, dtype: int64

In [46]:
def mult(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

In [47]:
df['Num'].apply(mult)

0    2
1    4
2    6
3    8
Name: Num, dtype: int64

In [48]:
df['Num'].sum()

10

In [49]:
# two numeric columns, each containing one missing (NaN) entry
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, np.nan],
        col2=[np.nan, 555, 666, 444],
    )
)

In [50]:
df

Unnamed: 0,col1,col2
0,1.0,
1,2.0,555.0
2,3.0,666.0
3,,444.0


In [51]:
df.isnull()

Unnamed: 0,col1,col2
0,False,True
1,False,False
2,False,False
3,True,False


In [52]:
# drop null value rows
df.dropna()

Unnamed: 0,col1,col2
1,2.0,555.0
2,3.0,666.0


In [53]:
# two numeric columns with missing (NaN) entries plus a fully populated text column
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, np.nan],
        col2=[np.nan, 555, 666, np.nan],
        col3=['qwe', 'abc', 'pqr', 'gdn'],
    )
)
df

Unnamed: 0,col1,col2,col3
0,1.0,,qwe
1,2.0,555.0,abc
2,3.0,666.0,pqr
3,,,gdn


In [54]:
# fill null values
df.fillna('ML')

Unnamed: 0,col1,col2,col3
0,1,ML,qwe
1,2,555,abc
2,3,666,pqr
3,ML,ML,gdn


In [55]:
# drop column-wise: remove every column that contains at least one null value
df.dropna(axis=1)

Unnamed: 0,col3
0,qwe
1,abc
2,pqr
3,gdn


In [56]:
# thresh=2 keeps only the rows that have at least 2 non-null values; rows with fewer are dropped
df.dropna(thresh = 2)

Unnamed: 0,col1,col2,col3
0,1.0,,qwe
1,2.0,555.0,abc
2,3.0,666.0,pqr


## Reading and writing CSV (Comma Separated Values)

In [57]:
# raw column data for the weather table: ten consecutive dates (as strings)
# followed by three lists of ten readings each
a = [f'2021-06-{day}' for day in range(16, 26)]
b = [23, 25, 29, 30, 35, 22, 22, 23, 25, 16]
c = [6, 3, 7, 3, 2, 9, 5, 6, 4, 3]
d = 'Rain Sunny Snow Snow Sunny Fog Sunny Snow Snow Sunny'.split()

In [58]:
# assemble the weather table, one column per list
df = pd.DataFrame(dict(Day=a, Temperature=b, WindSpeed=c, Event=d))
df

Unnamed: 0,Day,Temperature,WindSpeed,Event
0,2021-06-16,23,6,Rain
1,2021-06-17,25,3,Sunny
2,2021-06-18,29,7,Snow
3,2021-06-19,30,3,Snow
4,2021-06-20,35,2,Sunny
5,2021-06-21,22,9,Fog
6,2021-06-22,22,5,Sunny
7,2021-06-23,23,6,Snow
8,2021-06-24,25,4,Snow
9,2021-06-25,16,3,Sunny


In [59]:
# Creating a CSV File
df.to_csv('weather.csv',index=False)

In [60]:
#Read from CSV File
pd.read_csv('weather.csv')

Unnamed: 0,Day,Temperature,WindSpeed,Event
0,2021-06-16,23,6,Rain
1,2021-06-17,25,3,Sunny
2,2021-06-18,29,7,Snow
3,2021-06-19,30,3,Snow
4,2021-06-20,35,2,Sunny
5,2021-06-21,22,9,Fog
6,2021-06-22,22,5,Sunny
7,2021-06-23,23,6,Snow
8,2021-06-24,25,4,Snow
9,2021-06-25,16,3,Sunny


## Reading and writing from and to Excel file

In [61]:
# Creating Excel File from DataFrame
df.to_excel('weather.xlsx', sheet_name='Data', index=False)

In [62]:
#Reading Excel File
pd.read_excel('weather.xlsx')

Unnamed: 0,Day,Temperature,WindSpeed,Event
0,2021-06-16,23,6,Rain
1,2021-06-17,25,3,Sunny
2,2021-06-18,29,7,Snow
3,2021-06-19,30,3,Snow
4,2021-06-20,35,2,Sunny
5,2021-06-21,22,9,Fog
6,2021-06-22,22,5,Sunny
7,2021-06-23,23,6,Snow
8,2021-06-24,25,4,Snow
9,2021-06-25,16,3,Sunny
