# Pandas


Pandas is a Python library for working with datasets.
It provides functions to explore, analyze, clean, and manipulate data.

In [1]:
import pandas as pd

## 1. Pandas has two fundamental data structures
(i) a Series and 
(ii) a DataFrame

In [3]:
# A Series built from a list of strings; pandas stores the text with dtype `object`.
data = ['Toyota', 'Honda', 'BMW']
series = pd.Series(data)
series

0    Toyota
1     Honda
2       BMW
dtype: object

In [4]:
# A list of integers is stored with the numeric dtype `int64`.
data = [10, 20, 50]
series = pd.Series(data)
series

0    10
1    20
2    50
dtype: int64

In [5]:
# Mixing a string, an int and a float in one Series results in the catch-all `object` dtype.
data = ['Toyota', 50, 67.90]
series = pd.Series(data)
series

0    Toyota
1        50
2      67.9
dtype: object

In [6]:
# `series` is an instance of the pandas Series class.
type(series)

pandas.core.series.Series

In [8]:
# Each key becomes a column label and each list supplies that column's values.
data = dict(
    Name=['John', 'Vision', 'Tony'],
    Age=[12, 45, 67],
    City=['Paris', 'London', 'Delhi'],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,John,12,Paris
1,Vision,45,London
2,Tony,67,Delhi


In [9]:
# Columns may come from different containers: here a Series, a plain list and
# an inline list literal are combined into one DataFrame.
s2 = [12, 45, 67]
s1 = pd.Series(['John', 'Vision', 'Tony'])
frame_columns = {"Name": s1, "Age": s2, "City": ['Paris', 'London', 'Delhi']}

df = pd.DataFrame(frame_columns)
df

Unnamed: 0,Name,Age,City
0,John,12,Paris
1,Vision,45,London
2,Tony,67,Delhi


## 2. Importing and Exporting CSV files


In [139]:
# '../' refers to the parent directory. Forward slashes work on every operating
# system and avoid the invalid escape sequences ('\D', '\c') that a backslash
# path produces inside a normal (non-raw) string literal.
data = pd.read_csv('../Datasets/carsales.csv')
data.head()

# The file contains only 5 columns. The index (from 0) is generated automatically by pandas.

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [17]:
# to_csv() writes the file and returns None when it is given a path, so its
# result is not assigned to anything.
data.to_csv('export.csv')

# Reading the exported file back in
csv = pd.read_csv('export.csv')
csv.head()

Unnamed: 0.1,Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,0,Toyota,White,150043,4,"$4,000.00"
1,1,Honda,Red,87899,4,"$5,000.00"
2,2,Toyota,Blue,32549,3,"$7,000.00"
3,3,BMW,Black,11179,5,"$22,000.00"
4,4,Nissan,White,213095,4,"$3,500.00"


In [20]:
# In the CSV exported above, the extra 'Unnamed: 0' column is the row index that
# pandas wrote to the file. Pass index=False to to_csv() to leave the row index
# out; if the parameter is omitted or set to True, the index is written as the
# first column of the exported file.
# to_csv() returns None when it is given a path, so its result is not assigned.
data.to_csv('export.csv', index=False)

# Reading the exported file back in
csv = pd.read_csv('export.csv')
csv.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


## 3. Attributes for describing data: (i) shape  (ii) columns  (iii) index  (iv) dtypes  (v) values

In [21]:
# Display the whole DataFrame (all 10 rows).
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [36]:
# shape is an attribute (no parentheses) that returns a tuple with the
# dimensions of the DataFrame as (rows, columns)
shape = data.shape
print(shape)
type(shape)

(10, 5)


tuple

In [26]:
# columns returns an Index object that contains the column labels of the DataFrame
columns = data.columns
print(columns)
type(columns)

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')


pandas.core.indexes.base.Index

In [27]:
# index returns an Index object that contains the row labels of the DataFrame;
# for the automatically generated labels shown here it is a RangeIndex
index = data.index
print(index)
type(index)

RangeIndex(start=0, stop=10, step=1)


pandas.core.indexes.range.RangeIndex

In [71]:
# values returns a 2D NumPy array holding only the cell values; the row index
# and the column labels are not included
values = data.values
print(values)
type(values)

[['Toyota' 'White' 150043 4 '$4,000.00']
 ['Honda' 'Red' 87899 4 '$5,000.00']
 ['Toyota' 'Blue' 32549 3 '$7,000.00']
 ['BMW' 'Black' 11179 5 '$22,000.00']
 ['Nissan' 'White' 213095 4 '$3,500.00']
 ['Toyota' 'Green' 99213 4 '$4,500.00']
 ['Honda' 'Blue' 45698 4 '$7,500.00']
 ['Honda' 'Blue' 54738 4 '$7,000.00']
 ['Toyota' 'White' 60000 4 '$6,250.00']
 ['Nissan' 'White' 31600 4 '$9,700.00']]


numpy.ndarray

In [28]:
# dtypes returns a Series that maps each column label to that column's data type
dtypes = data.dtypes
print(dtypes)
type(dtypes)

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object


pandas.core.series.Series

## 4. Functions for describing data (work for both Series and DataFrame)

In [35]:
# info() prints a summary of the DataFrame (index range, columns, non-null
# counts, dtypes, memory usage) and returns None, so `info` below holds None
# and print(info) just prints "None".
info = data.info()
print(info)
type(info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 528.0+ bytes
None


NoneType

In [37]:
# describe() generates descriptive statistics; for this mixed DataFrame only
# the numerical columns are included in the result
data.describe()

Unnamed: 0,Odometer (KM),Doors
count,10.0,10.0
mean,78601.4,4.0
std,61983.471735,0.471405
min,11179.0,3.0
25%,35836.25,4.0
50%,57369.0,4.0
75%,96384.5,4.0
max,213095.0,5.0


In [59]:
# sum() adds up the numeric columns and concatenates the values of the
# string (object) columns
data.sum()

Make             ToyotaHondaToyotaBMWNissanToyotaHondaHondaToyo...
Colour               WhiteRedBlueBlackWhiteGreenBlueBlueWhiteWhite
Odometer (KM)                                               786014
Doors                                                           40
Price            $4,000.00$5,000.00$7,000.00$22,000.00$3,500.00...
dtype: object

In [83]:
# count(), min() and max() work on every column, but mean(), std() and median()
# are only defined for numbers. numeric_only=True restricts them to the numeric
# columns explicitly; relying on pandas to silently drop the text columns is
# deprecated (FutureWarning) and raises a TypeError in pandas 2.x.
separator = '---------------------------------------'
print(data.count())
print(separator)
print(data.mean(numeric_only=True))
print(separator)
print(data.std(numeric_only=True))
print(separator)
print(data.min())
print(separator)
print(data.max())
print(separator)
print(data.median(numeric_only=True))

Make             10
Colour           10
Odometer (KM)    10
Doors            10
Price            10
dtype: int64
---------------------------------------
Odometer (KM)    78601.4
Doors                4.0
dtype: float64
---------------------------------------
Odometer (KM)    61983.471735
Doors                0.471405
dtype: float64
---------------------------------------
Make                    BMW
Colour                Black
Odometer (KM)         11179
Doors                     3
Price            $22,000.00
dtype: object
---------------------------------------
Make                Toyota
Colour               White
Odometer (KM)       213095
Doors                    5
Price            $9,700.00
dtype: object
---------------------------------------
Odometer (KM)    57369.0
Doors                4.0
dtype: float64


  print(data.mean())
  print(data.std())
  print(data.median())


##### In the same way, the above functions can also be used on a single column (i.e. a Series)

In [53]:
# For a text column, describe() reports count, unique, top (most frequent
# value) and freq instead of numeric statistics.
data['Make'].describe()

count         10
unique         4
top       Toyota
freq           4
Name: Make, dtype: object

In [78]:
# Randomly returns one column (axis=1 samples along the columns)
data.sample(axis=1)

Unnamed: 0,Doors
0,4
1,4
2,3
3,5
4,4
5,4
6,4
7,4
8,4
9,4


In [79]:
# Randomly returns one row (axis=0 samples along the rows)
data.sample(axis=0) # axis=0 is the default, so this is the same as data.sample()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
7,Honda,Blue,54738,4,"$7,000.00"


In [80]:
# Reminder of how the `axis` argument is interpreted by sample() in the cells above.
"""Note:
axis = 0 ==> row
axis = 1 ==> column
"""

'Note:\naxis = 0 ==> row\naxis = 1 ==> column\n'

In [82]:
# len() of a DataFrame gives its number of rows
len(data)

10

In [85]:
# Show the last two rows.
data.tail(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


## 5. Data Viewing functions

In [86]:
'''head() displays the first five rows of a DataFrame by default. To display a different number of rows, pass that number inside
the parentheses.'''
data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [87]:
# Show only the first two rows.
data.head(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"


In [88]:
'''tail() displays the last five rows of a DataFrame by default. To display a different number of rows, pass that number inside
the parentheses.'''
data.tail()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [89]:
# Show only the last two rows.
data.tail(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [94]:
'''sample() randomly displays the specified number of rows (or columns).
The default number is one.
axis=0 selects random rows (default)
axis=1 selects random columns'''
data.sample(3)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
9,Nissan,White,31600,4,"$9,700.00"
6,Honda,Blue,45698,4,"$7,500.00"


In [97]:
# Returns two randomly chosen columns (with all of their rows).
data.sample(2, axis=1)

Unnamed: 0,Doors,Make
0,4,Toyota
1,4,Honda
2,3,Toyota
3,5,BMW
4,4,Nissan
5,4,Toyota
6,4,Honda
7,4,Honda
8,4,Toyota
9,4,Nissan


##### Providing custom index

In [99]:
'''We can also specify our own index in a Series or DataFrame. This first Series uses the default index for comparison.'''
sr = pd.Series(['A', 'B', 'C', 'D', 'E'])
sr

0    A
1    B
2    C
3    D
4    E
dtype: object

In [100]:
# Providing a custom index (the default index starts at 0 and increments by one
# for each record). Custom labels need not be ordered or unique: label 3 is
# used twice here.
sr = pd.Series(['A', 'B', 'C', 'D', 'E'], index=[2, 3, 1, 3, 5])
sr

2    A
3    B
1    C
3    D
5    E
dtype: object

In [111]:
# Passing index= together with an existing DataFrame does NOT relabel its rows.
# Instead pandas looks up each requested label in the existing index and returns
# those rows (a reindex), so labels that are repeated in the list produce
# repeated rows, as the output shows.
# To attach new labels to the existing rows, something like
# data.set_axis(new_labels, axis=0) would be needed instead.
newdf = data
newdf = pd.DataFrame(newdf, index=[1, 4, 5, 6, 3, 4, 7, 4, 1, 9])
newdf

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
1,Honda,Red,87899,4,"$5,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
4,Nissan,White,213095,4,"$3,500.00"
1,Honda,Red,87899,4,"$5,000.00"
9,Nissan,White,31600,4,"$9,700.00"


##### loc: the "l" stands for label-based indexing

In [112]:
# Display the full DataFrame as a reference for the loc examples that follow.
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [None]:
"""The loc indexer is used to access and modify data in a DataFrame using label-based indexing. It allows you to select rows and
columns based on their labels, rather than their numeric positions.
General form: df.loc[row_labels, column_labels]"""

In [113]:
# Selecting a single row by its label; the row comes back as a Series whose
# index is the column labels
data.loc[9]

Make                Nissan
Colour               White
Odometer (KM)        31600
Doors                    4
Price            $9,700.00
Name: 9, dtype: object

In [116]:
# Selecting multiple rows: pass a list of row labels (any number of them)
data.loc[[0, 4, 9]]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
4,Nissan,White,213095,4,"$3,500.00"
9,Nissan,White,31600,4,"$9,700.00"


In [117]:
# Selecting a single column: ':' keeps every row, and the column comes back as a Series
data.loc[:, 'Price']

0     $4,000.00
1     $5,000.00
2     $7,000.00
3    $22,000.00
4     $3,500.00
5     $4,500.00
6     $7,500.00
7     $7,000.00
8     $6,250.00
9     $9,700.00
Name: Price, dtype: object

In [119]:
# Selecting multiple columns: pass a list of column labels (any number of them)
data.loc[:, ['Price', 'Doors']]

Unnamed: 0,Price,Doors
0,"$4,000.00",4
1,"$5,000.00",4
2,"$7,000.00",3
3,"$22,000.00",5
4,"$3,500.00",4
5,"$4,500.00",4
6,"$7,500.00",4
7,"$7,000.00",4
8,"$6,250.00",4
9,"$9,700.00",4


In [121]:
# Selecting specific rows and columns; the result follows the order in which
# the labels are listed
data.loc[[0, 4, 7, 9], ['Price', 'Doors', 'Make']]

Unnamed: 0,Price,Doors,Make
0,"$4,000.00",4,Toyota
4,"$3,500.00",4,Nissan
7,"$7,000.00",4,Honda
9,"$9,700.00",4,Nissan


In [125]:
# Select rows based on a condition: keep only the rows whose Colour is not 'White'
data.loc[data['Colour'] != 'White']

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"


In [127]:
# Keep only the rows whose odometer reading is at most 60000 km
data.loc[data['Odometer (KM)'] <= 60000]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [129]:
# Modify a specific value: the cell at row label 2, column 'Colour'.
# The assignment changes `data` itself.
data.loc[2, 'Colour'] = 'Red'
data.head(3)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Red,32549,3,"$7,000.00"


In [142]:
# Modify an entire row: every column of the row labelled 5 is set to 0,
# including the text columns. The assignment changes `data` itself.
data.loc[5, :] = 0  
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,0,0,0,0,0
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [156]:
# Modify an entire column: the list supplies one value per row (10 values for
# the 10 rows). The assignment changes `data` itself.
data.loc[:, 'Doors'] = [1, 2,3, 4, 5, 8, 34, 2, 12, 11] 
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,1,"$4,000.00"
1,Honda,Red,87899,2,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,4,"$22,000.00"
4,Nissan,White,213095,5,"$3,500.00"
5,0,0,0,8,0
6,Honda,Blue,45698,34,"$7,500.00"
7,Honda,Blue,54738,2,"$7,000.00"
8,Toyota,White,60000,12,"$6,250.00"
9,Nissan,White,31600,11,"$9,700.00"


##### Slicing in loc

In [147]:
# Select a range of rows; unlike an ordinary Python slice, the end label (5) is included
data.loc[2:5] 

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,0,0,0,0,0


In [149]:
# Select every third row between labels 2 and 7 (the last value is the step size)
data.loc[2:7:3]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
2,Toyota,Blue,32549,3,"$7,000.00"
5,0,0,0,0,0


In [150]:
# Select a range of columns by label; both end labels are included
data.loc[:, 'Colour':'Doors']

Unnamed: 0,Colour,Odometer (KM),Doors
0,White,150043,4
1,Red,87899,4
2,Blue,32549,3
3,Black,11179,5
4,White,213095,4
5,0,0,0
6,Blue,45698,4
7,Blue,54738,4
8,White,60000,4
9,White,31600,4


In [151]:
# Select every other column (step of 2 across all columns), keeping every row
data.loc[:, ::2]

Unnamed: 0,Make,Odometer (KM),Price
0,Toyota,150043,"$4,000.00"
1,Honda,87899,"$5,000.00"
2,Toyota,32549,"$7,000.00"
3,BMW,11179,"$22,000.00"
4,Nissan,213095,"$3,500.00"
5,0,0,0
6,Honda,45698,"$7,500.00"
7,Honda,54738,"$7,000.00"
8,Toyota,60000,"$6,250.00"
9,Nissan,31600,"$9,700.00"


In [152]:
# Select a range of rows and a range of columns at once (both ends inclusive)
data.loc[1:5, 'Odometer (KM)':'Price']

Unnamed: 0,Odometer (KM),Doors,Price
1,87899,4,"$5,000.00"
2,32549,3,"$7,000.00"
3,11179,5,"$22,000.00"
4,213095,4,"$3,500.00"
5,0,0,0


In [154]:
# Select every other row from label 1 to 5 and every other column from 'Colour' to 'Price'
data.loc[1:5:2, 'Colour':'Price':2]

Unnamed: 0,Colour,Doors
1,Red,4
3,Black,5
5,0,0


In [157]:
# Display the full DataFrame as a reference for the iloc examples that follow.
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,1,"$4,000.00"
1,Honda,Red,87899,2,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,4,"$22,000.00"
4,Nissan,White,213095,5,"$3,500.00"
5,0,0,0,8,0
6,Honda,Blue,45698,34,"$7,500.00"
7,Honda,Blue,54738,2,"$7,000.00"
8,Toyota,White,60000,12,"$6,250.00"
9,Nissan,White,31600,11,"$9,700.00"


##### iloc: the "i" stands for integer (position-based) indexing

In [158]:
# Select the rows at integer positions 0, 2 and 4
data.iloc[[0, 2, 4]]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,1,"$4,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
4,Nissan,White,213095,5,"$3,500.00"


In [159]:
# Select every row and the columns at integer positions 1 and 3 (Colour and Doors)
data.iloc[:, [1, 3]]

Unnamed: 0,Colour,Doors
0,White,1
1,Red,2
2,Blue,3
3,Black,4
4,White,5
5,0,8
6,Blue,34
7,Blue,2
8,White,12
9,White,11


In [161]:
# iloc slices exclude the end position: this returns the rows at positions 1 to 4
data.iloc[1:5]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
1,Honda,Red,87899,2,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,4,"$22,000.00"
4,Nissan,White,213095,5,"$3,500.00"


In [162]:
# loc slices include the end label: this returns the rows labelled 1 to 5
data.loc[1:5]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
1,Honda,Red,87899,2,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,4,"$22,000.00"
4,Nissan,White,213095,5,"$3,500.00"
5,0,0,0,8,0


In [166]:
# data["Doors"] is the equivalent bracket syntax and returns the same column.
data.Doors # Attribute access does not work when the column name contains a space, e.g. 'Odometer (KM)'

0     1
1     2
2     3
3     4
4     5
5     8
6    34
7     2
8    12
9    11
Name: Doors, dtype: int64