In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, np.nan, 30, np.nan],
    'City': ['New York', 'Los Angeles', np.nan, 'Chicago']
                  })

In [4]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,
3,David,,Chicago


In [5]:
df['Age']

0    25.0
1     NaN
2    30.0
3     NaN
Name: Age, dtype: float64

In [6]:
df['Name']

0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object

In [7]:
df['City']

0       New York
1    Los Angeles
2            NaN
3        Chicago
Name: City, dtype: object

In [8]:
df[['Age','Name']]

Unnamed: 0,Age,Name
0,25.0,Alice
1,,Bob
2,30.0,Charlie
3,,David


## Filling the NULL Values 

In [11]:
df['Age'].fillna(df['Age'].mean(),inplace=True)

In [12]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,27.5,Los Angeles
2,Charlie,30.0,
3,David,27.5,Chicago


In [13]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, np.nan, 30, np.nan],
    'City': ['New York', 'Los Angeles', np.nan, 'Chicago']
})

In [14]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,
3,David,,Chicago


In [15]:
df['City'].fillna('Chicago',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['City'].fillna('Chicago',inplace=True)


In [16]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,Chicago
3,David,,Chicago


## Filling null values using BACKWARD Fill - bfill

In [17]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, np.nan, 30, np.nan],
    'City': ['New York', 'Los Angeles', np.nan, 'Chicago']
})

In [18]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,
3,David,,Chicago


In [19]:
df['Age'].fillna(method ='bfill')

  df['Age'].fillna(method ='bfill')


0    25.0
1    30.0
2    30.0
3     NaN
Name: Age, dtype: float64

In [20]:
df.fillna(method='bfill')

  df.fillna(method='bfill')


Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,Los Angeles
2,Charlie,30.0,Chicago
3,David,,Chicago


## Filling null values using FORWARD Fill - ffill

In [22]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, np.nan, 30, np.nan],
    'City': ['New York', 'Los Angeles', np.nan, 'Chicago']
})

In [23]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,
3,David,,Chicago


In [24]:
df['City'].fillna(method='ffill')

  df['City'].fillna(method='ffill')


0       New York
1    Los Angeles
2    Los Angeles
3        Chicago
Name: City, dtype: object

In [25]:
df.fillna(method='ffill')

  df.fillna(method='ffill')


Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,25.0,Los Angeles
2,Charlie,30.0,Los Angeles
3,David,30.0,Chicago


## PERCENTAGE of Null values and NUMBER of Null values

In [26]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,
3,David,,Chicago


In [27]:
df.shape

(4, 3)

In [28]:
rows,columns = df.shape

In [29]:
rows

4

In [30]:
columns

3

In [31]:
df.isna()

Unnamed: 0,Name,Age,City
0,False,False,False
1,False,True,False
2,False,False,True
3,False,True,False


### Count of null values

In [32]:
df.isna().sum()

Name    0
Age     2
City    1
dtype: int64

### Percentage of null values

In [33]:
df.isna().sum()/rows

Name    0.00
Age     0.50
City    0.25
dtype: float64

In [34]:
(df.isna().sum()/rows)*100

Name     0.0
Age     50.0
City    25.0
dtype: float64

In [35]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, np.nan, 30, np.nan],
    'City': ['New York', 'Los Angeles', np.nan, 'Chicago']
})

In [36]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,
3,David,,Chicago


# DROPPING null values

## 1st Scenario - how = "any" axis = 1

In [37]:
df.dropna(how= "any", axis = 1)

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,David


## 2nd Scneraio - how = "all" axis = 1

In [38]:
df.dropna(how ="all",axis=1)

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,
3,David,,Chicago


## 3rd Scenario - how = "any" axis=0

In [39]:
df.dropna(how = "any", axis =0)

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York


## 4th Scenario - how = "all" axis = 0

In [40]:
df.dropna(how = "all",axis=0)

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,30.0,
3,David,,Chicago


## Examples for dropping null values using new dataset

In [41]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', np.nan, 'Hannah', 'Ivy', 'Jack'],
    'Age': [25, np.nan, 30, np.nan, 40, 28, np.nan, 33, 26, 29],
    'City': ['New York', 'Los Angeles', np.nan, 'Chicago', 'Houston', 'Miami', np.nan, 'Dallas', 'Austin', 'Seattle'],
    'Score': [85, 90, 95, 88, np.nan, 76, np.nan, 92, 80, 89],
    'Remarks': [np.nan]*10  # Entire column is null
})

In [42]:
df

Unnamed: 0,Name,Age,City,Score,Remarks
0,Alice,25.0,New York,85.0,
1,Bob,,Los Angeles,90.0,
2,Charlie,30.0,,95.0,
3,David,,Chicago,88.0,
4,Eve,40.0,Houston,,
5,Frank,28.0,Miami,76.0,
6,,,,,
7,Hannah,33.0,Dallas,92.0,
8,Ivy,26.0,Austin,80.0,
9,Jack,29.0,Seattle,89.0,


In [43]:
df.dropna(how="any" ,axis = 1).shape

(10, 0)

In [44]:
df.dropna(how="all" ,axis = 1).shape

(10, 4)

In [45]:
df.dropna(how="any" ,axis = 0).shape

(0, 5)

In [46]:
df.dropna(how="all" ,axis = 0).shape

(9, 5)

## MEMORY OPTIMIZATION

In [47]:
import pandas as pd

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Hannah', 'Ivy', 'Jack'],
    'Age': [25, 27, 30, 32, 40, 28, 31, 33, 26, 29],
    'City': ['New York', 'Los Angeles', 'San Francisco', 'Chicago', 'Houston', 'Miami', 'Boston', 'Dallas', 'Austin', 'Seattle'],
    'Score': [85, 90, 95, 88, 91, 76, 87, 92, 80, 89],
    'Remarks': ['Good', 'Excellent', 'Good', 'Average', 'Very Good', 'Fair', 'Good', 'Excellent', 'Average', 'Good']
})

In [48]:
df

Unnamed: 0,Name,Age,City,Score,Remarks
0,Alice,25,New York,85,Good
1,Bob,27,Los Angeles,90,Excellent
2,Charlie,30,San Francisco,95,Good
3,David,32,Chicago,88,Average
4,Eve,40,Houston,91,Very Good
5,Frank,28,Miami,76,Fair
6,Grace,31,Boston,87,Good
7,Hannah,33,Dallas,92,Excellent
8,Ivy,26,Austin,80,Average
9,Jack,29,Seattle,89,Good


#### to know how much memory is used by each column

In [49]:
df.memory_usage()

Index      132
Name        80
Age         80
City        80
Score       80
Remarks     80
dtype: int64

In [50]:
df.dtypes

Name       object
Age         int64
City       object
Score       int64
Remarks    object
dtype: object

In [51]:
df['Age'].min()

np.int64(25)

In [52]:
df['Age'].max()

np.int64(40)

### Converting int64 datatype to int8 datatype

In [53]:
df['Age'] = df['Age'].astype(np.int8)

In [54]:
df.dtypes

Name       object
Age          int8
City       object
Score       int64
Remarks    object
dtype: object

In [55]:
df.memory_usage()

Index      132
Name        80
Age         10
City        80
Score       80
Remarks     80
dtype: int64

In [1]:
import pandas as pd
df=pd.read_excel("C:/Users/cherry/Downloads/Sample - Superstore.xls")

In [4]:
df.shape

(9994, 21)

In [5]:
df.memory_usage()

Index              132
Row ID           79952
Order ID         79952
Order Date       79952
Ship Date        79952
Ship Mode        79952
Customer ID      79952
Customer Name    79952
Segment          79952
Country          79952
City             79952
State            79952
Postal Code      79952
Region           79952
Product ID       79952
Category         79952
Sub-Category     79952
Product Name     79952
Sales            79952
Quantity         79952
Discount         79952
Profit           79952
dtype: int64

In [6]:
df.memory_usage().sum()

np.int64(1679124)

In [7]:
df.memory_usage().sum()/(1024*1024)

np.float64(1.6013374328613281)

In [8]:
import numpy as np
df['Row ID']=df['Row ID'].astype(np.int8)

In [9]:
df.dtypes

Row ID                     int8
Order ID                 object
Order Date       datetime64[ns]
Ship Date        datetime64[ns]
Ship Mode                object
Customer ID              object
Customer Name            object
Segment                  object
Country                  object
City                     object
State                    object
Postal Code               int64
Region                   object
Product ID               object
Category                 object
Sub-Category             object
Product Name             object
Sales                   float64
Quantity                  int64
Discount                float64
Profit                  float64
dtype: object

In [10]:
df.memory_usage().sum()/(1024*1024)

np.float64(1.5346202850341797)

# Label Encoding and Dummy Variable

In [11]:
import pandas as pd

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Hannah', 'Ivy', 'Jack'],
    'Age': [25, 27, 30, 32, 40, 28, 31, 33, 26, 29],
    'City': ['New York', 'Los Angeles', 'San Francisco', 'Chicago', 'Houston', 'Miami', 'Boston', 'Dallas', 'Austin', 'Seattle'],
    'Score': [85, 90, 95, 88, 91, 76, 87, 92, 80, 89],
    'Remarks': ['Good', 'Excellent', 'Good', 'Average', 'Very Good', 'Fair', 'Good', 'Excellent', 'Average', 'Good']
})

In [12]:
df

Unnamed: 0,Name,Age,City,Score,Remarks
0,Alice,25,New York,85,Good
1,Bob,27,Los Angeles,90,Excellent
2,Charlie,30,San Francisco,95,Good
3,David,32,Chicago,88,Average
4,Eve,40,Houston,91,Very Good
5,Frank,28,Miami,76,Fair
6,Grace,31,Boston,87,Good
7,Hannah,33,Dallas,92,Excellent
8,Ivy,26,Austin,80,Average
9,Jack,29,Seattle,89,Good


In [13]:
df['Remarks']

0         Good
1    Excellent
2         Good
3      Average
4    Very Good
5         Fair
6         Good
7    Excellent
8      Average
9         Good
Name: Remarks, dtype: object

## Using pd.get_dummies()

In [14]:
pd.get_dummies(df['Remarks'])

Unnamed: 0,Average,Excellent,Fair,Good,Very Good
0,False,False,False,True,False
1,False,True,False,False,False
2,False,False,False,True,False
3,True,False,False,False,False
4,False,False,False,False,True
5,False,False,True,False,False
6,False,False,False,True,False
7,False,True,False,False,False
8,True,False,False,False,False
9,False,False,False,True,False


In [15]:
pd.get_dummies(df['Remarks'])+0

Unnamed: 0,Average,Excellent,Fair,Good,Very Good
0,0,0,0,1,0
1,0,1,0,0,0
2,0,0,0,1,0
3,1,0,0,0,0
4,0,0,0,0,1
5,0,0,1,0,0
6,0,0,0,1,0
7,0,1,0,0,0
8,1,0,0,0,0
9,0,0,0,1,0


In [16]:
import pandas as pd

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Hannah', 'Ivy', 'Jack'],
    'Age': [25, 27, 30, 32, 40, 28, 31, 33, 26, 29],
    'City': ['New York', 'Los Angeles', 'San Francisco', 'Chicago', 'Houston', 'Miami', 'Boston', 'Dallas', 'Austin', 'Seattle'],
    'Score': [85, 90, 95, 88, 91, 76, 87, 92, 80, 89],
    'Remarks': ['Good', 'Excellent', 'Good', 'Average', 'Very Good', 'Fair', 'Good', 'Excellent', 'Average', 'Good']
})

In [17]:
df

Unnamed: 0,Name,Age,City,Score,Remarks
0,Alice,25,New York,85,Good
1,Bob,27,Los Angeles,90,Excellent
2,Charlie,30,San Francisco,95,Good
3,David,32,Chicago,88,Average
4,Eve,40,Houston,91,Very Good
5,Frank,28,Miami,76,Fair
6,Grace,31,Boston,87,Good
7,Hannah,33,Dallas,92,Excellent
8,Ivy,26,Austin,80,Average
9,Jack,29,Seattle,89,Good


In [18]:
df['City']

0         New York
1      Los Angeles
2    San Francisco
3          Chicago
4          Houston
5            Miami
6           Boston
7           Dallas
8           Austin
9          Seattle
Name: City, dtype: object

In [19]:
pd.get_dummies(df['City'])

Unnamed: 0,Austin,Boston,Chicago,Dallas,Houston,Los Angeles,Miami,New York,San Francisco,Seattle
0,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False
3,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False
5,False,False,False,False,False,False,True,False,False,False
6,False,True,False,False,False,False,False,False,False,False
7,False,False,False,True,False,False,False,False,False,False
8,True,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,True


In [20]:
import pandas as pd

df = pd.DataFrame({
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago'],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male']
})

In [21]:
df

Unnamed: 0,City,Gender
0,New York,Male
1,Los Angeles,Female
2,Chicago,Female
3,New York,Male
4,Chicago,Male


## Using pd.factorize() 

In [25]:
pd.factorize(df['City'])

(array([0, 1, 2, 0, 2]),
 Index(['New York', 'Los Angeles', 'Chicago'], dtype='object'))

In [26]:
pd.factorize(df['City'])[0]

array([0, 1, 2, 0, 2])

In [27]:
df['City label']=pd.factorize(df['City'])[0]

In [28]:
df

Unnamed: 0,City,Gender,City label
0,New York,Male,0
1,Los Angeles,Female,1
2,Chicago,Female,2
3,New York,Male,0
4,Chicago,Male,2


# OUTLIERS

## Examples on Sales Super Dataset 

In [29]:
import pandas as pd
df=pd.read_excel("C:/Users/cherry/Downloads/Sample - Superstore.xls")

In [30]:
df

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.9600,2,0.00,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.9400,3,0.00,219.5820
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.6200,2,0.00,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.0310
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.3680,2,0.20,2.5164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989,9990,CA-2014-110422,2014-01-21,2014-01-23,Second Class,TB-21400,Tom Boeckenhauer,Consumer,United States,Miami,...,33180,South,FUR-FU-10001889,Furniture,Furnishings,Ultra Door Pull Handle,25.2480,3,0.20,4.1028
9990,9991,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,FUR-FU-10000747,Furniture,Furnishings,Tenex B1-RE Series Chair Mats for Low Pile Car...,91.9600,2,0.00,15.6332
9991,9992,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone,258.5760,2,0.20,19.3932
9992,9993,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",29.6000,4,0.00,13.3200


In [32]:
df.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [33]:
df.shape

(9994, 21)

In [34]:
df['Sales'].min()

np.float64(0.44399999999999995)

In [35]:
df['Sales'].max()

np.float64(22638.48)

In [37]:
np.quantile(df['Sales'],0.25)

np.float64(17.28)

In [38]:
np.quantile(df['Sales'],0.50)

np.float64(54.489999999999995)

In [39]:
np.quantile(df['Sales'],0.75)

np.float64(209.94)

# Interquartile Range 
### IQR - The interquartile range is calculated by subtracting the first quartile from the third quartile. 
### IQR = Q3 - Q1

In [42]:
iqr = np.quantile(df['Sales'],0.75)-np.quantile(df['Sales'],0.25)

In [43]:
iqr

np.float64(192.66)

## OUTLIERS 
### anything below minimum value (Q1 - 1.5*IQR) and above maximum value ( Q3 + 1.5*IQR)

In [46]:
np.quantile(df['Sales'],0.75)+1.5*iqr

np.float64(498.93)

In [47]:
np.quantile(df['Sales'],0.25)-1.5*iqr

np.float64(-271.71000000000004)

### Calculating the percentage of outliers

In [48]:
(df['Sales'][df['Sales']>np.quantile(df['Sales'],0.75)+1.5*(iqr)].shape[0]/df.shape[0])*100

11.677006203722232

In [59]:
(df['Sales'][df['Sales']>np.quantile(df['Sales'],0.75)+1.5*iqr])

1        731.9400
3        957.5775
7        907.1520
10      1706.1840
11       911.4240
          ...    
9931     683.3320
9942     998.8200
9947    1925.8800
9948    2405.2000
9968     735.9800
Name: Sales, Length: 1167, dtype: float64

In [60]:
(df['Sales'][df['Sales']>np.quantile(df['Sales'],0.75)+1.5*iqr]).shape

(1167,)

In [61]:
(df['Sales'][df['Sales']>np.quantile(df['Sales'],0.75)+1.5*iqr]).shape[0]

1167

In [62]:
(df['Sales'][df['Sales']>np.quantile(df['Sales'],0.75)+1.5*iqr].shape[0]/df.shape[0])

0.11677006203722233

In [63]:
(df['Sales'][df['Sales']>np.quantile(df['Sales'],0.75)+1.5*iqr].shape[0]/df.shape[0])*100

11.677006203722232

## Number of unique values

In [64]:
df['Region'].nunique()

4

### If you want to know what are the unique values in Region

In [67]:
df['Region'].unique()

array(['South', 'West', 'Central', 'East'], dtype=object)

### If you want to know the value count of each unique value

In [71]:
df['Region'].value_counts()

Region
West       3203
East       2848
Central    2323
South      1620
Name: count, dtype: int64

In [69]:
df.shape

(9994, 21)

### calculating percentage of each unique value by using " normalize = True "

In [72]:
(df['Region'].value_counts(normalize=True))

Region
West       0.320492
East       0.284971
Central    0.232439
South      0.162097
Name: proportion, dtype: float64

In [76]:
(df['Region'].value_counts(normalize=True))*100

Region
West       32.049230
East       28.497098
Central    23.243946
South      16.209726
Name: proportion, dtype: float64

### If you want specified number of rows from first then you need use "head". By default head gives first 5 rows

In [77]:
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [78]:
df.head(2)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582


### If you want specified number of rows from last then you need use "tail". By default tail gives last 5 rows

In [79]:
df.tail()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
9989,9990,CA-2014-110422,2014-01-21,2014-01-23,Second Class,TB-21400,Tom Boeckenhauer,Consumer,United States,Miami,...,33180,South,FUR-FU-10001889,Furniture,Furnishings,Ultra Door Pull Handle,25.248,3,0.2,4.1028
9990,9991,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,FUR-FU-10000747,Furniture,Furnishings,Tenex B1-RE Series Chair Mats for Low Pile Car...,91.96,2,0.0,15.6332
9991,9992,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone,258.576,2,0.2,19.3932
9992,9993,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",29.6,4,0.0,13.32
9993,9994,CA-2017-119914,2017-05-04,2017-05-09,Second Class,CC-12220,Chris Cortes,Consumer,United States,Westminster,...,92683,West,OFF-AP-10002684,Office Supplies,Appliances,"Acco 7-Outlet Masterpiece Power Center, Wihtou...",243.16,2,0.0,72.948


In [81]:
df.tail(3)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
9991,9992,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone,258.576,2,0.2,19.3932
9992,9993,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",29.6,4,0.0,13.32
9993,9994,CA-2017-119914,2017-05-04,2017-05-09,Second Class,CC-12220,Chris Cortes,Consumer,United States,Westminster,...,92683,West,OFF-AP-10002684,Office Supplies,Appliances,"Acco 7-Outlet Masterpiece Power Center, Wihtou...",243.16,2,0.0,72.948
