In [1]:
#Pandas is an open-source Python Library providing high-performance data manipulation 
# & analysis tool using its powerful data structures
# pandas deals with dataframe, series, panel

In [2]:
!pip install pandas



In [3]:
#import the pandas library and aliasing as pd
import pandas as pd
import numpy as np

In [6]:
#Without passing index
#numpy array
arr = np.array(['a','b','c','d'])
print(arr)

['a' 'b' 'c' 'd']


In [5]:
#creating the series using numpy array
s = pd.Series(arr)
print(s)

0    a
1    b
2    c
3    d
dtype: object


In [7]:
#check the type
type(s)

pandas.core.series.Series

In [8]:
#by passing index.
#numpy array
arr = np.array(['a','b','c','d'])

#creating series
s = pd.Series(arr,index=[100,101,102,103])
s

100    a
101    b
102    c
103    d
dtype: object

In [9]:
#dictionary  #key default index
# Ex1)
data = {'a' : 0, 'b' : 1, 'c' : 2}
s = pd.Series(data)
s

a    0
b    1
c    2
dtype: int64

In [10]:
# Create a Series from Scalar
s = pd.Series(5, index=[0, 1, 2, 3])
s

0    5
1    5
2    5
3    5
dtype: int64

In [11]:
#Accesing data from series
# By using position
# Data in the series can be accessed similar to that in an ndarray.
s = pd.Series([1,2,3,4,5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [12]:
#retrieve the first element
s[0]

1

In [13]:
# first 3 elements
s[0:3] 

0    1
1    2
2    3
dtype: int64

In [14]:
#By using keys
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [15]:
#retrieve a single element
s['a']

1

In [16]:
#to get indexes
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [17]:
# to get values
s.values

array([1, 2, 3, 4, 5], dtype=int64)

In [18]:
#DataFrame : A Data frame is a two-dimensional data structure
# A basic DataFrame, which can be created is an Empty Dataframe.
import pandas as pd
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


In [19]:
#Create a DataFrame from Dict of lists/ndArray
#all ndarray must be same length
#if index passed : len(index)==len(array=) if I not passed , default index will be in range n

In [20]:
# Ex1) without index
name = ['Siya', 'Riya', 'Sam', 'Raj']
age = [22,23,24,25]

# creating dictionary
data = {'Name':name,'Age':age}
print(data)
# passing dictionary to create df
df = pd.DataFrame(data)
df

{'Name': ['Siya', 'Riya', 'Sam', 'Raj'], 'Age': [22, 23, 24, 25]}


Unnamed: 0,Name,Age
0,Siya,22
1,Riya,23
2,Sam,24
3,Raj,25


In [21]:
#Ex2) with index
name = ['Siya', 'Riya', 'Sam', 'Raj']
age = [22,23,24,25]

# my new indexes
indexes = ['rank1','rank2','rank3','rank4']

#this is my dictionary
data = {'Name':name,'Age':age}

#passing dictionary to create dataframe
df = pd.DataFrame(data, index=indexes)
df

Unnamed: 0,Name,Age
rank1,Siya,22
rank2,Riya,23
rank3,Sam,24
rank4,Raj,25


In [22]:
df['Age']

rank1    22
rank2    23
rank3    24
rank4    25
Name: Age, dtype: int64

In [23]:
df.Age

rank1    22
rank2    23
rank3    24
rank4    25
Name: Age, dtype: int64

In [24]:
# say suppose you want to add city column 
city = ['Banglore','Pune','Hyderabad','Mumbai']

#adding new column named 'City'
df["City"] = city
df

Unnamed: 0,Name,Age,City
rank1,Siya,22,Banglore
rank2,Riya,23,Pune
rank3,Sam,24,Hyderabad
rank4,Raj,25,Mumbai


In [25]:
# 1) Using del keyword
del df["City"]
df

Unnamed: 0,Name,Age
rank1,Siya,22
rank2,Riya,23
rank3,Sam,24
rank4,Raj,25


In [26]:
# 2) Using pop function
df.pop('Age')
df

Unnamed: 0,Name
rank1,Siya
rank2,Riya
rank3,Sam
rank4,Raj


In [27]:
#Example
import numpy as np
import pandas as pd

#np arrays
random = np.random.randint(low = 0, high =100, size = 20)
name = np.random.choice(['A', 'B', 'C', 'D', 'E'],size =  20)
choice = np.random.choice([10,11,13,12,14],size = 20)

#creating dictionary
dict1 = {
    'Random':random, 
    'Name':name,
    'Choice':choice
}

#creating dataframe
df = pd.DataFrame(dict1)
df

Unnamed: 0,Random,Name,Choice
0,71,C,14
1,69,C,11
2,38,C,12
3,7,C,13
4,48,E,11
5,57,D,11
6,25,E,11
7,99,C,14
8,36,A,10
9,94,C,12


In [28]:
# to check the type
type(df)

pandas.core.frame.DataFrame

In [29]:
#to get the dimension or shape
df.shape

(20, 3)

In [30]:
# to get the column names
df.columns

Index(['Random', 'Name', 'Choice'], dtype='object')

In [31]:
#to get the values
df.values

array([[71, 'C', 14],
       [69, 'C', 11],
       [38, 'C', 12],
       [7, 'C', 13],
       [48, 'E', 11],
       [57, 'D', 11],
       [25, 'E', 11],
       [99, 'C', 14],
       [36, 'A', 10],
       [94, 'C', 12],
       [54, 'E', 10],
       [90, 'A', 11],
       [30, 'D', 12],
       [12, 'E', 11],
       [33, 'D', 14],
       [61, 'A', 10],
       [66, 'A', 12],
       [30, 'D', 11],
       [47, 'C', 11],
       [9, 'B', 13]], dtype=object)

In [32]:
#to get the first 5 rows
df.head()

Unnamed: 0,Random,Name,Choice
0,71,C,14
1,69,C,11
2,38,C,12
3,7,C,13
4,48,E,11


In [33]:
# to get the last 5 rows
df.tail()

Unnamed: 0,Random,Name,Choice
15,61,A,10
16,66,A,12
17,30,D,11
18,47,C,11
19,9,B,13


In [34]:
#to get the information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Random  20 non-null     int32 
 1   Name    20 non-null     object
 2   Choice  20 non-null     int32 
dtypes: int32(2), object(1)
memory usage: 448.0+ bytes


In [35]:
# it will describe only the columns which contains numerical values
df.describe()

Unnamed: 0,Random,Choice
count,20.0,20.0
mean,48.8,11.7
std,27.331493,1.301821
min,7.0,10.0
25%,30.0,11.0
50%,47.5,11.0
75%,66.75,12.25
max,99.0,14.0


In [36]:
# similarly you can check for individual columns
df['Random'].describe()

count    20.000000
mean     48.800000
std      27.331493
min       7.000000
25%      30.000000
50%      47.500000
75%      66.750000
max      99.000000
Name: Random, dtype: float64

In [37]:
#Dealing with csv files
cars_data = pd.read_csv('CarPrice_Assignment.csv')

In [38]:
cars_data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [39]:
cars_data.shape

(205, 26)

In [40]:
# earlier created dataframe
df

Unnamed: 0,Random,Name,Choice
0,71,C,14
1,69,C,11
2,38,C,12
3,7,C,13
4,48,E,11
5,57,D,11
6,25,E,11
7,99,C,14
8,36,A,10
9,94,C,12


In [42]:
df.to_csv('file.csv')

In [43]:
df

Unnamed: 0,Random,Name,Choice
0,71,C,14
1,69,C,11
2,38,C,12
3,7,C,13
4,48,E,11
5,57,D,11
6,25,E,11
7,99,C,14
8,36,A,10
9,94,C,12


In [44]:
df.set_index('Random',inplace = True)   #return new dataframe
df

Unnamed: 0_level_0,Name,Choice
Random,Unnamed: 1_level_1,Unnamed: 2_level_1
71,C,14
69,C,11
38,C,12
7,C,13
48,E,11
57,D,11
25,E,11
99,C,14
36,A,10
94,C,12


In [45]:
df.sort_index(ascending=True)

Unnamed: 0_level_0,Name,Choice
Random,Unnamed: 1_level_1,Unnamed: 2_level_1
7,C,13
9,B,13
12,E,11
25,E,11
30,D,11
30,D,12
33,D,14
36,A,10
38,C,12
47,C,11


In [46]:
df

Unnamed: 0_level_0,Name,Choice
Random,Unnamed: 1_level_1,Unnamed: 2_level_1
71,C,14
69,C,11
38,C,12
7,C,13
48,E,11
57,D,11
25,E,11
99,C,14
36,A,10
94,C,12


In [47]:
df.sort_values(by='Choice', ascending = True)

Unnamed: 0_level_0,Name,Choice
Random,Unnamed: 1_level_1,Unnamed: 2_level_1
36,A,10
54,E,10
61,A,10
69,C,11
30,D,11
48,E,11
57,D,11
25,E,11
47,C,11
90,A,11


In [48]:
#np arrays
random = np.random.randint(low = 0, high =100, size = 20)
name = np.random.choice(['A', 'B', 'C', 'D', 'E'],size =  20)
choice = np.random.choice([10,11,13,12,14],size = 20)

#creating dictionary
dict1 = {
    'Random':random, 
    'Name':name,
    'Choice':choice
}

#creating dataframe
df = pd.DataFrame(dict1)
df

Unnamed: 0,Random,Name,Choice
0,11,D,14
1,2,B,13
2,6,D,14
3,92,E,13
4,38,C,13
5,90,A,11
6,78,A,10
7,92,E,13
8,94,B,11
9,94,B,13


In [49]:
# to access the first element in Random Column
df.iloc[0,0]

11

In [None]:
# to access the 5th element in Choice Column
df.iloc[4,2]

In [50]:
# to access the 11th element in the name column
df.iloc[11,1]

'A'

In [51]:
# similarly you can do the slicing here as well

# to get first 5 rows 
df.iloc[0:5]

Unnamed: 0,Random,Name,Choice
0,11,D,14
1,2,B,13
2,6,D,14
3,92,E,13
4,38,C,13


In [52]:
# to get first 10 rows and first 2 columns
df.iloc[:10,:2]  #start:stop:step

Unnamed: 0,Random,Name
0,11,D
1,2,B
2,6,D
3,92,E
4,38,C
5,90,A
6,78,A
7,92,E
8,94,B
9,94,B


In [53]:
#creating dictionary
dict1 = {
    'Letter':['a','b'],
    'Number':[1,2]
}

#creating dataframe1
d1 = pd.DataFrame(dict1)
d1

Unnamed: 0,Letter,Number
0,a,1
1,b,2


In [54]:
#creating dict2
dict2 = {
    'Letter':['c','d'],
    'Number':[3,4],
    'Animal':['lion','tiger']
}

#creating dataframe2
d2 = pd.DataFrame(dict2)
d2

Unnamed: 0,Letter,Number,Animal
0,c,3,lion
1,d,4,tiger


In [55]:
pd.concat([d1,d2], axis =0)  #row wise concatination

Unnamed: 0,Letter,Number,Animal
0,a,1,
1,b,2,
0,c,3,lion
1,d,4,tiger


In [56]:
# to get the indexes in order
pd.concat([d1,d2], axis =0, ignore_index=True)

Unnamed: 0,Letter,Number,Animal
0,a,1,
1,b,2,
2,c,3,lion
3,d,4,tiger


In [57]:
pd.concat([d1,d2], axis = 1)  #column wise concatination

Unnamed: 0,Letter,Number,Letter.1,Number.1,Animal
0,a,1,c,3,lion
1,b,2,d,4,tiger


In [58]:
#Merging
dict1 = {
    "city" : ["A","B","C","D","E"],
    "temperature" : [32,33,34,35,36]
}

df1 = pd.DataFrame(dict1)
df1

Unnamed: 0,city,temperature
0,A,32
1,B,33
2,C,34
3,D,35
4,E,36


In [59]:
dict2 = {
    "city" : ["C","D","E","F","G"],
    "humidity" : [65,66,67,68,69]
}

df2 = pd.DataFrame(dict2)
df2

Unnamed: 0,city,humidity
0,C,65
1,D,66
2,E,67
3,F,68
4,G,69


In [60]:
df = pd.merge(df1,df2, on='city')
df
# observe by default it is performing inner join

Unnamed: 0,city,temperature,humidity
0,C,34,65
1,D,35,66
2,E,36,67


In [61]:
pd.merge(df1,df2, on='city', how ='outer')

Unnamed: 0,city,temperature,humidity
0,A,32.0,
1,B,33.0,
2,C,34.0,65.0
3,D,35.0,66.0
4,E,36.0,67.0
5,F,,68.0
6,G,,69.0


In [62]:
#Creating new series
data = pd.Series([0, 1, 2, 3, 4, 5,np.nan, 6, 7, 8])
data

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
5    5.0
6    NaN
7    6.0
8    7.0
9    8.0
dtype: float64

In [63]:
#to detect missing values
data.isnull()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8    False
9    False
dtype: bool

In [64]:
# Detect existing (non-missing) values.
data.notnull()

0     True
1     True
2     True
3     True
4     True
5     True
6    False
7     True
8     True
9     True
dtype: bool

In [65]:
# creating a dataframe
dict1 = {
    0:[1,4,7,np.nan],
    1:[2,5,np.nan,np.nan],
    2:[3,np.nan,np.nan,np.nan],
    3:[np.nan,np.nan,np.nan,np.nan]
}

df = pd.DataFrame(dict1)
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [66]:
#drops an entire row if one nan value is present
df.dropna()

Unnamed: 0,0,1,2,3


In [67]:
# deletes the rows in which all the values are null
df.dropna(how='all')

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,


In [68]:
# deletes the columns in which all the values are null
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,5.0,
2,7.0,,
3,,,


In [69]:
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [70]:
# threshold Require that many 'non-nan' values.
df.dropna(thresh=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,


In [71]:
df.dropna(thresh=1,axis = 1)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,5.0,
2,7.0,,
3,,,


In [72]:
# Filling null values
df_fill = df.fillna(10)
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,10.0
1,4.0,5.0,10.0,10.0
2,7.0,10.0,10.0,10.0
3,10.0,10.0,10.0,10.0


In [73]:
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [74]:
# to fill perticular values in perticular columns
df_fill = df.fillna({0: 10, 1: 20, 2: 30, 3: 40})
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,40.0
1,4.0,5.0,30.0,40.0
2,7.0,20.0,30.0,40.0
3,10.0,20.0,30.0,40.0


In [75]:
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [76]:
# fills the nan value with previous observation/row value
df_fill = df.fillna(method='ffill')
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,3.0,
2,7.0,5.0,3.0,
3,7.0,5.0,3.0,


In [77]:
#if you specify the limit,then only for those many 'NaN' values in a column it will fill the values
df_fill = df.fillna(method='ffill', limit = 2)
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,3.0,
2,7.0,5.0,3.0,
3,7.0,5.0,,


In [78]:
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [79]:
#fills the nan value with previous column's value when axis = 1
df_fill = df.fillna(axis = 1, method='ffill',limit = 1)
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,3.0
1,4.0,5.0,5.0,
2,7.0,7.0,,
3,,,,


In [80]:
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [81]:
df_mean = df.fillna(df.mean())  #by mean
df_mean

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,3.0,
2,7.0,3.5,3.0,
3,4.0,3.5,3.0,


In [82]:
#.replace(old_val,new_val) : replaces the old values with new values
ser1 = pd.Series([1,2,100,4,5,100,7,8,100])
ser1

0      1
1      2
2    100
3      4
4      5
5    100
6      7
7      8
8    100
dtype: int64

In [83]:
# to replace 100 with 1
ser1.replace(100, 1)

0    1
1    2
2    1
3    4
4    5
5    1
6    7
7    8
8    1
dtype: int64

In [84]:
ser1

0      1
1      2
2    100
3      4
4      5
5    100
6      7
7      8
8    100
dtype: int64

In [85]:
# to replace multiple values with one single value
ser1.replace([100, 1, 2], 50)

0    50
1    50
2    50
3     4
4     5
5    50
6     7
7     8
8    50
dtype: int64

In [86]:
ser1

0      1
1      2
2    100
3      4
4      5
5    100
6      7
7      8
8    100
dtype: int64

In [87]:
#get_dummies():  it is one of the method to convert categorical variable to numerical variables
ser2 = pd.Series(list('abcdeabcd'))
ser2

0    a
1    b
2    c
3    d
4    e
5    a
6    b
7    c
8    d
dtype: object

In [88]:
pd.get_dummies(ser2)

Unnamed: 0,a,b,c,d,e
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,0,1
5,1,0,0,0,0
6,0,1,0,0,0
7,0,0,1,0,0
8,0,0,0,1,0
