In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Superstore workbook; with no sheet_name given, read_excel reads the first sheet.
df = pd.read_excel(io="Superstore.xls")

In [3]:
df.shape

(9994, 11)

In [4]:
df.dtypes

Order ID                 object
Order Date       datetime64[ns]
Customer Name            object
City                     object
State                    object
Region                   object
Category                 object
Sales                   float64
Quantity                  int64
Discount                float64
Profit                  float64
dtype: object

In [5]:
df.head(2)

Unnamed: 0,Order ID,Order Date,Customer Name,City,State,Region,Category,Sales,Quantity,Discount,Profit
0,CA-2016-152156,2016-11-08,Claire Gute,Henderson,Kentucky,South,Furniture,261.96,2,0.0,41.9136
1,CA-2016-152156,2016-11-08,Claire Gute,Henderson,Kentucky,South,Furniture,731.94,3,0.0,219.582


### Select_dtypes

In [6]:
# Keep only the columns stored as float64 or int64.
numeric_dtypes = ['float64', 'int64']
df.select_dtypes(include=numeric_dtypes)

Unnamed: 0,Sales,Quantity,Discount,Profit
0,261.9600,2,0.00,41.9136
1,731.9400,3,0.00,219.5820
2,14.6200,2,0.00,6.8714
3,957.5775,5,0.45,-383.0310
4,22.3680,2,0.20,2.5164
...,...,...,...,...
9989,25.2480,3,0.20,4.1028
9990,91.9600,2,0.00,15.6332
9991,258.5760,2,0.20,19.3932
9992,29.6000,4,0.00,13.3200


In [7]:
# Keep only the object-dtype columns.
df.select_dtypes(include=["object"])

Unnamed: 0,Order ID,Customer Name,City,State,Region,Category
0,CA-2016-152156,Claire Gute,Henderson,Kentucky,South,Furniture
1,CA-2016-152156,Claire Gute,Henderson,Kentucky,South,Furniture
2,CA-2016-138688,Darrin Van Huff,Los Angeles,California,West,Office Supplies
3,US-2015-108966,Sean O'Donnell,Fort Lauderdale,Florida,South,Furniture
4,US-2015-108966,Sean O'Donnell,Fort Lauderdale,Florida,South,Office Supplies
...,...,...,...,...,...,...
9989,CA-2014-110422,Tom Boeckenhauer,Miami,Florida,South,Furniture
9990,CA-2017-121258,Dave Brooks,Costa Mesa,California,West,Furniture
9991,CA-2017-121258,Dave Brooks,Costa Mesa,California,West,Technology
9992,CA-2017-121258,Dave Brooks,Costa Mesa,California,West,Office Supplies


In [8]:
# Drop the object-dtype columns and keep everything else.
df.select_dtypes(exclude=["object"])

Unnamed: 0,Order Date,Sales,Quantity,Discount,Profit
0,2016-11-08,261.9600,2,0.00,41.9136
1,2016-11-08,731.9400,3,0.00,219.5820
2,2016-06-12,14.6200,2,0.00,6.8714
3,2015-10-11,957.5775,5,0.45,-383.0310
4,2015-10-11,22.3680,2,0.20,2.5164
...,...,...,...,...,...
9989,2014-01-21,25.2480,3,0.20,4.1028
9990,2017-02-26,91.9600,2,0.00,15.6332
9991,2017-02-26,258.5760,2,0.20,19.3932
9992,2017-02-26,29.6000,4,0.00,13.3200


### Query Function

In [9]:
df.head(2)

Unnamed: 0,Order ID,Order Date,Customer Name,City,State,Region,Category,Sales,Quantity,Discount,Profit
0,CA-2016-152156,2016-11-08,Claire Gute,Henderson,Kentucky,South,Furniture,261.96,2,0.0,41.9136
1,CA-2016-152156,2016-11-08,Claire Gute,Henderson,Kentucky,South,Furniture,731.94,3,0.0,219.582


In [10]:
# Rows where Sales exceeds 1000 and State equals "Florida" (both conditions must hold).
df.query(expr='Sales > 1000 & State == "Florida"')

Unnamed: 0,Order ID,Order Date,Customer Name,City,State,Region,Category,Sales,Quantity,Discount,Profit
990,CA-2015-153549,2015-03-29,Sara Luxemburg,Jacksonville,Florida,South,Furniture,1166.92,5,0.2,131.2785
1046,CA-2016-169103,2016-03-08,Zuschuss Carroll,Miami,Florida,South,Technology,1363.96,5,0.2,85.2475
2697,CA-2014-145317,2014-03-18,Sean Miller,Jacksonville,Florida,South,Technology,22638.48,6,0.5,-1811.0784
3118,CA-2015-121720,2015-06-11,Jim Epp,Lakeland,Florida,South,Furniture,1123.92,5,0.2,-182.637
3124,CA-2015-121720,2015-06-11,Jim Epp,Lakeland,Florida,South,Office Supplies,1036.624,2,0.2,51.8312
3768,CA-2014-153913,2014-12-16,Ken Black,Hialeah,Florida,South,Furniture,1013.832,9,0.2,101.3832
4297,CA-2017-129021,2017-08-23,Patrick O'Brill,Tallahassee,Florida,South,Technology,4367.896,13,0.2,327.5922
4519,CA-2017-107167,2017-06-10,Neil Ducich,Jacksonville,Florida,South,Office Supplies,1347.52,8,0.2,84.22
5185,CA-2015-134719,2015-10-10,John Dryer,Jacksonville,Florida,South,Office Supplies,1801.632,6,0.2,-337.806
6048,CA-2015-105571,2015-11-07,Christine Phan,Miami,Florida,South,Office Supplies,1345.485,5,0.7,-1031.5385


### Chaining Operations with pipe

In [11]:
# Read the "Salary" sheet of the same workbook into its own frame.
salary = pd.read_excel(io="Superstore.xls", sheet_name="Salary")

In [12]:
salary

Unnamed: 0,Name,Salary_USD,Department
0,Ashok,60000.0,HR
1,Eric,70000.0,Engineering
2,Nature,,Marketing
3,Virat,80000.0,Sport
4,Corey,65000.0,HR
5,Lucas,40000.0,Sales
6,Matt,55000.0,Engineering
7,Vivek,80000.0,GenAI
8,Abhisheak,,GenAI


In [13]:
def convert_salary_to_eur(salary,conversion_rate=0.85):
    """Return a new frame with the USD salary converted to EUR.

    The result carries every column of ``salary`` plus ``Sales_EUR``, computed
    as ``Salary_USD * conversion_rate``. The column keeps the name
    ``Sales_EUR`` because ``filter_by_salary`` reads it under that name.

    Parameters
    ----------
    salary : pandas.DataFrame
        Frame containing a ``Salary_USD`` column.
    conversion_rate : float, default 0.85
        Multiplier applied to ``Salary_USD``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # assign() builds a new frame instead of writing the column into the
    # caller's object, so the step has no side effect when used in a pipe chain.
    return salary.assign(Sales_EUR=salary["Salary_USD"] * conversion_rate)

def filter_by_salary(salary, min_salary):
    """Keep only the rows whose ``Sales_EUR`` value is strictly above ``min_salary``."""
    above_threshold = salary['Sales_EUR'].gt(min_salary)
    return salary.loc[above_threshold]

def drop_na(salary):
    """Return ``salary`` without the rows that contain at least one missing value."""
    complete_rows = salary.dropna(axis=0, how="any")
    return complete_rows

In [14]:
# Each stage receives the previous stage's frame as its first argument.
salary_transformed = (
    salary
    .pipe(drop_na)                       # discard rows containing NaN first
    .pipe(convert_salary_to_eur, 0.90)   # add the EUR column at a 0.90 rate
    .pipe(filter_by_salary, 50000)       # keep rows whose EUR value exceeds 50000
)

In [16]:
salary

Unnamed: 0,Name,Salary_USD,Department
0,Ashok,60000.0,HR
1,Eric,70000.0,Engineering
2,Nature,,Marketing
3,Virat,80000.0,Sport
4,Corey,65000.0,HR
5,Lucas,40000.0,Sales
6,Matt,55000.0,Engineering
7,Vivek,80000.0,GenAI
8,Abhisheak,,GenAI


In [15]:
salary_transformed

Unnamed: 0,Name,Salary_USD,Department,Sales_EUR
0,Ashok,60000.0,HR,54000.0
1,Eric,70000.0,Engineering,63000.0
3,Virat,80000.0,Sport,72000.0
4,Corey,65000.0,HR,58500.0
7,Vivek,80000.0,GenAI,72000.0


### Exploding a list-like column

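Syntax: `df.explode('column_with_lists')` — each element of a list-like cell becomes its own row, and the other column values and the index label are repeated for every new row.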

In [17]:
# Sample records; every 'Projects' entry is a list of project names.
data = dict(
    Name=['Vivek', 'Ram', 'Abhishek'],
    Projects=[
        ['HR', 'Marketing'],
        ['Engineering', 'Data Science'],
        ['Marketing'],
    ],
    Salary_USD=[60000, 70000, 80000],
)

In [18]:
data

{'Name': ['Vivek', 'Ram', 'Abhishek'],
 'Projects': [['HR', 'Marketing'],
  ['Engineering', 'Data Science'],
  ['Marketing']],
 'Salary_USD': [60000, 70000, 80000]}

In [19]:
# Turn the mapping into a DataFrame: one column per key, one row per list position.
data = pd.DataFrame(data=data)

In [20]:
data

Unnamed: 0,Name,Projects,Salary_USD
0,Vivek,"[HR, Marketing]",60000
1,Ram,"[Engineering, Data Science]",70000
2,Abhishek,[Marketing],80000


In [21]:
# Give every element of each 'Projects' list a row of its own.
df_exploded = data.explode(column='Projects')

In [22]:
df_exploded

Unnamed: 0,Name,Projects,Salary_USD
0,Vivek,HR,60000
0,Vivek,Marketing,60000
1,Ram,Engineering,70000
1,Ram,Data Science,70000
2,Abhishek,Marketing,80000


### ApplyMap - Element-wise Operations

In [25]:
salary.head()

Unnamed: 0,Name,Salary_USD,Department
0,Ashok,60000.0,HR
1,Eric,70000.0,Engineering
2,Nature,,Marketing
3,Virat,80000.0,Sport
4,Corey,65000.0,HR


In [26]:
# Replace every string cell with its length; non-string cells (numbers, NaN) pass through.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map, so use
# map when this pandas version provides it and fall back to applymap on older versions.
elementwise = salary.map if hasattr(salary, "map") else salary.applymap
elementwise(lambda x: len(x) if isinstance(x, str) else x)

Unnamed: 0,Name,Salary_USD,Department
0,5,60000.0,2
1,4,70000.0,11
2,6,,9
3,5,80000.0,5
4,5,65000.0,2
5,5,40000.0,5
6,4,55000.0,11
7,5,80000.0,5
8,9,,5


### Creating new columns with the pandas `assign` method

In [28]:
df.head(1)

Unnamed: 0,Order ID,Order Date,Customer Name,City,State,Region,Category,Sales,Quantity,Discount,Profit
0,CA-2016-152156,2016-11-08,Claire Gute,Henderson,Kentucky,South,Furniture,261.96,2,0.0,41.9136


In [29]:
# assign() returns a new frame with Total_Cost = Sales * Quantity; df itself is unchanged.
df.assign(Total_Cost=lambda frame: frame['Sales'].mul(frame['Quantity'])).head()

Unnamed: 0,Order ID,Order Date,Customer Name,City,State,Region,Category,Sales,Quantity,Discount,Profit,Total_Cost
0,CA-2016-152156,2016-11-08,Claire Gute,Henderson,Kentucky,South,Furniture,261.96,2,0.0,41.9136,523.92
1,CA-2016-152156,2016-11-08,Claire Gute,Henderson,Kentucky,South,Furniture,731.94,3,0.0,219.582,2195.82
2,CA-2016-138688,2016-06-12,Darrin Van Huff,Los Angeles,California,West,Office Supplies,14.62,2,0.0,6.8714,29.24
3,US-2015-108966,2015-10-11,Sean O'Donnell,Fort Lauderdale,Florida,South,Furniture,957.5775,5,0.45,-383.031,4787.8875
4,US-2015-108966,2015-10-11,Sean O'Donnell,Fort Lauderdale,Florida,South,Office Supplies,22.368,2,0.2,2.5164,44.736
