# Pandas Advanced Functions

In [79]:
import pandas as pd
import numpy as np
import datetime as dt
import warnings
warnings.filterwarnings("ignore")

In [80]:
# Load the auto-sales dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Auto Sales data.csv')

In [81]:
# Preview the first three rows of the raw data.
df.head(3)

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,DAYS_SINCE_LASTORDER,STATUS,PRODUCTLINE,MSRP,PRODUCTCODE,CUSTOMERNAME,PHONE,ADDRESSLINE1,CITY,POSTALCODE,COUNTRY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,24/02/2018,828,Shipped,Motorcycles,95,S10_1678,Land of Toys Inc.,2125557818,897 Long Airport Avenue,NYC,10022,USA,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,07/05/2018,757,Shipped,Motorcycles,95,S10_1678,Reims Collectables,26.47.1555,59 rue de l'Abbaye,Reims,51100,France,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,01/07/2018,703,Shipped,Motorcycles,95,S10_1678,Lyon Souveniers,+33 1 46 62 7555,27 rue du Colonel Pierre Avia,Paris,75508,France,Da Cunha,Daniel,Medium


In [82]:
# Keep only the columns used in the rest of the notebook.
df = df[[
    'QUANTITYORDERED',
    'PRICEEACH',
    'SALES',
    'ORDERDATE',
    'DAYS_SINCE_LASTORDER',
    'STATUS',
    'PRODUCTLINE',
    'MSRP',
    'CITY',
    'COUNTRY',
    'DEALSIZE',
]]

In [83]:
# Confirm that only the selected columns remain.
df.head(3)

Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,ORDERDATE,DAYS_SINCE_LASTORDER,STATUS,PRODUCTLINE,MSRP,CITY,COUNTRY,DEALSIZE
0,30,95.7,2871.0,24/02/2018,828,Shipped,Motorcycles,95,NYC,USA,Small
1,34,81.35,2765.9,07/05/2018,757,Shipped,Motorcycles,95,Reims,France,Small
2,41,94.74,3884.34,01/07/2018,703,Shipped,Motorcycles,95,Paris,France,Medium


In [84]:
# (number of rows, number of columns)
df.shape

(2747, 11)

### Data type of each column

In [85]:
# Note that ORDERDATE is still a plain string (object) column at this point.
df.dtypes

QUANTITYORDERED           int64
PRICEEACH               float64
SALES                   float64
ORDERDATE                object
DAYS_SINCE_LASTORDER      int64
STATUS                   object
PRODUCTLINE              object
MSRP                      int64
CITY                     object
COUNTRY                  object
DEALSIZE                 object
dtype: object

### Count of Nulls

In [86]:
# isnull() flags missing cells; summing per column gives the count of nulls in each.
df.isnull().sum()

QUANTITYORDERED         0
PRICEEACH               0
SALES                   0
ORDERDATE               0
DAYS_SINCE_LASTORDER    0
STATUS                  0
PRODUCTLINE             0
MSRP                    0
CITY                    0
COUNTRY                 0
DEALSIZE                0
dtype: int64

### Unique values in a column

In [87]:
# Distinct COUNTRY values, returned as a NumPy array.
df.COUNTRY.unique()

array(['USA', 'France', 'Norway', 'Australia', 'Finland', 'Austria', 'UK',
       'Spain', 'Sweden', 'Singapore', 'Canada', 'Japan', 'Italy',
       'Denmark', 'Belgium', 'Philippines', 'Germany', 'Switzerland',
       'Ireland'], dtype=object)

### Number of unique values in a column

In [88]:
# Count of distinct COUNTRY values.
df.COUNTRY.nunique()

19

### Check for duplicates

subset: by default all columns are compared to identify duplicate rows; when given, only the specified column(s) are compared

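keep: which occurrence of each duplicate to retain ('first' by default, 'last', or False to drop every duplicate); the remaining duplicates are removed, judged in the order they occur in the dataset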

In [89]:
# One row per PRODUCTLINE: the first occurrence is retained (keep='first' is the default),
# and the result is a new frame -- df itself is not modified.
df.drop_duplicates(subset=['PRODUCTLINE'], keep='first')

Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,ORDERDATE,DAYS_SINCE_LASTORDER,STATUS,PRODUCTLINE,MSRP,CITY,COUNTRY,DEALSIZE
0,30,95.7,2871.0,24/02/2018,828,Shipped,Motorcycles,95,NYC,USA,Small
24,26,207.87,5404.62,29/01/2018,878,Shipped,Classic Cars,214,Stavern,Norway,Medium
200,27,125.74,3394.98,29/01/2018,1054,Shipped,Trucks and Buses,136,Stavern,Norway,Medium
454,39,123.29,4808.31,10/01/2018,1327,Shipped,Vintage Cars,102,NYC,USA,Medium
526,36,146.65,5279.4,17/02/2018,1361,Shipped,Planes,157,Bergamo,Italy,Medium
897,41,83.44,3421.04,17/02/2018,1732,Shipped,Ships,86,Bergamo,Italy,Medium
1023,38,113.95,4330.1,11/02/2018,1864,Shipped,Trains,100,Kobenhavn,Denmark,Medium


### Boolean list of duplicated rows for filtering purposes

In [90]:
# Boolean mask: True for every row whose PRODUCTLINE already appeared earlier
# (keep='first' is the default, so the first occurrence is marked False).
df.duplicated(subset=['PRODUCTLINE'], keep='first')

0       False
1        True
2        True
3        True
4        True
        ...  
2742     True
2743     True
2744     True
2745     True
2746     True
Length: 2747, dtype: bool

In [91]:
### Change dtypes

### Pandas Date Time

In [92]:
# Build a one-row DataFrame from a standard-library datetime object.
pd.DataFrame({'dates': [dt.datetime.now()]})

Unnamed: 0,dates
0,2026-01-11 18:32:07.475957


In [93]:
# 'O' (object): the order dates are still stored as strings here.
df.ORDERDATE.dtype

dtype('O')

In [94]:
# Convert ORDERDATE (day/month/year strings such as '24/02/2018') to datetime64.
# The format is given explicitly: without it pandas has to guess the layout, and
# ambiguous values such as '07/05/2018' can be read month-first. Any resulting
# warning would also be hidden by the blanket warnings filter set at the top.
df = df.assign(date_pd = pd.to_datetime(df.ORDERDATE, format='%d/%m/%Y'))
df.dtypes

QUANTITYORDERED                  int64
PRICEEACH                      float64
SALES                          float64
ORDERDATE                       object
DAYS_SINCE_LASTORDER             int64
STATUS                          object
PRODUCTLINE                     object
MSRP                             int64
CITY                            object
COUNTRY                         object
DEALSIZE                        object
date_pd                 datetime64[ns]
dtype: object

In [95]:
# strptime ("string parse time"):  str      -> datetime
# strftime ("string format time"): datetime -> str
# Parse each ORDERDATE string element-wise with the standard-library parser.
df['date_dt'] = df['ORDERDATE'].map(lambda raw: dt.datetime.strptime(raw, '%d/%m/%Y'))
df.dtypes

QUANTITYORDERED                  int64
PRICEEACH                      float64
SALES                          float64
ORDERDATE                       object
DAYS_SINCE_LASTORDER             int64
STATUS                          object
PRODUCTLINE                     object
MSRP                             int64
CITY                            object
COUNTRY                         object
DEALSIZE                        object
date_pd                 datetime64[ns]
date_dt                 datetime64[ns]
dtype: object

In [96]:
# Both conversion routes (pd.to_datetime and strptime) should show the same date.
df.head(1)

Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,ORDERDATE,DAYS_SINCE_LASTORDER,STATUS,PRODUCTLINE,MSRP,CITY,COUNTRY,DEALSIZE,date_pd,date_dt
0,30,95.7,2871.0,24/02/2018,828,Shipped,Motorcycles,95,NYC,USA,Small,2018-02-24,2018-02-24


### More Date time 

In [97]:
# One-row demo frame:
#   date  - current timestamp (datetime)
#   date2 - the same timestamp formatted to whole seconds (string)
#   date3 - the same timestamp cast to string
#   date4 - a fixed reference date built from a standard-library datetime
d = pd.DataFrame({'date': [dt.datetime.now()]}).assign(
    date2=lambda frame: frame['date'].dt.strftime("%Y-%m-%d %H:%M:%S"),
    date3=lambda frame: frame['date'].astype(str),
    date4=dt.datetime(2025, 1, 1),
)
d

Unnamed: 0,date,date2,date3,date4
0,2026-01-11 18:32:07.649399,2026-01-11 18:32:07,2026-01-11 18:32:07.649399,2025-01-01


In [98]:
# date2 and date3 are strings; date and date4 are datetime columns.
d.dtypes

date     datetime64[ns]
date2            object
date3            object
date4    datetime64[us]
dtype: object

In [99]:
# Parse the formatted string back into a datetime, using the same format string
# that strftime produced it with.
dt.datetime.strptime(d.date2[0], "%Y-%m-%d %H:%M:%S")

datetime.datetime(2026, 1, 11, 18, 32, 7)

In [100]:
# Subtracting one datetime column from another yields a timedelta column.
d.date - d.date4

0   375 days 18:32:07.649399
dtype: timedelta64[ns]

In [101]:
# The .dt accessor exposes datetime components (day, hour, ...) element-wise.
d.date.dt.day, d.date.dt.hour

(0    11
 Name: date, dtype: int32,
 0    18
 Name: date, dtype: int32)

In [102]:
import pytz
# Define timezone.
# NOTE: 'Etc/GMT+6' uses the POSIX sign convention, which is inverted relative to
# the usual notation -- this zone is six hours *behind* UTC (offset -06:00).
tz = pytz.timezone('Etc/GMT+6')
# The datetime module is imported as `dt` in this notebook, so the class must be
# reached as dt.datetime; a bare `datetime` is undefined on a fresh kernel.
current_time = dt.datetime.now(tz)
current_time

datetime.datetime(2026, 1, 11, 7, 2, 7, 744514, tzinfo=<StaticTzInfo 'Etc/GMT+6'>)

In [103]:
# A timezone-aware datetime keeps its UTC offset when stored in a DataFrame.
pd.DataFrame({'date tz': [current_time]})

Unnamed: 0,date tz
0,2026-01-11 07:02:07.744514-06:00


### Group by

In [104]:
# Number of SALES records for every (DEALSIZE, STATUS) combination.
# as_index=False keeps the grouping keys as ordinary columns.
(
    df[['DEALSIZE', 'SALES', 'STATUS']]
    .groupby(['DEALSIZE', 'STATUS'], as_index=False)
    .count()
)

Unnamed: 0,DEALSIZE,STATUS,SALES
0,Large,Disputed,5
1,Large,In Process,3
2,Large,On Hold,5
3,Large,Resolved,1
4,Large,Shipped,138
5,Medium,Cancelled,33
6,Medium,Disputed,5
7,Medium,In Process,18
8,Medium,On Hold,24
9,Medium,Resolved,26


### Aggregate and Group by

In [105]:
# Named aggregation: each keyword becomes an output column,
# defined as (source column, aggregation function).
agg = (
    df[['DEALSIZE', 'SALES', 'STATUS', 'QUANTITYORDERED']]
    .groupby(['DEALSIZE', 'STATUS'], as_index=False)
    .agg(
        MeanSales=('SALES', 'mean'),
        CountSales=('SALES', 'count'),
        SumOrders=('QUANTITYORDERED', 'sum'),
    )
)
agg

Unnamed: 0,DEALSIZE,STATUS,MeanSales,CountSales,SumOrders
0,Large,Disputed,8549.422,5,296
1,Large,In Process,9050.786667,3,153
2,Large,On Hold,8588.138,5,373
3,Large,Resolved,8884.8,1,45
4,Large,Shipped,8240.807536,138,6327
5,Medium,Cancelled,4168.948788,33,1247
6,Medium,Disputed,4432.966,5,184
7,Medium,In Process,4654.825,18,762
8,Medium,On Hold,4430.909583,24,1097
9,Medium,Resolved,3998.726923,26,1010


### Pivot

In [106]:
# pivot() only reshapes -- it does not aggregate. Every (index, columns) pair must
# therefore be unique, which holds here because `agg` is already grouped by
# DEALSIZE and STATUS. Combinations absent from `agg` show up as NaN.
pvt = agg.pivot(
    index='DEALSIZE',
    columns='STATUS',
    values='MeanSales',
)
pvt

STATUS,Cancelled,Disputed,In Process,On Hold,Resolved,Shipped
DEALSIZE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Large,,8549.422,9050.786667,8588.138,8884.8,8240.807536
Medium,4168.948788,4432.966,4654.825,4430.909583,3998.726923,4406.593524
Small,2107.858148,1825.23,1689.5375,1979.778,1893.329,2072.816164


### Pivot Table

In [107]:
# pivot_table() aggregates while reshaping, so it can be applied to the raw rows
# directly; duplicate (DEALSIZE, STATUS) pairs are reduced with the mean.
pvtb = df.pivot_table(
    index='DEALSIZE',
    columns='STATUS',
    values='SALES',
    aggfunc='mean',
)
pvtb

STATUS,Cancelled,Disputed,In Process,On Hold,Resolved,Shipped
DEALSIZE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Large,,8549.422,9050.786667,8588.138,8884.8,8240.807536
Medium,4168.948788,4432.966,4654.825,4430.909583,3998.726923,4406.593524
Small,2107.858148,1825.23,1689.5375,1979.778,1893.329,2072.816164


### Value Counts, Sort Values, map & np.where Function

In [108]:
# Bucket PRICEEACH into three labels with nested np.where calls:
# the inner call is the "else" branch of the outer one, so prices of exactly
# 100 or 200 fall into the middle bucket.
df['Price Category'] = np.where(
    df['PRICEEACH'] < 100,
    'Less than 100',
    np.where(df['PRICEEACH'] > 200, 'More than 200', 'Between 100 and 200'),
)
                               

In [109]:
# Frequency of each price bucket, most common first.
df['Price Category'].value_counts()

Price Category
Less than 100          1479
Between 100 and 200    1212
More than 200            56
Name: count, dtype: int64

In [110]:
# Translate each price-bucket label into an ordinal code via a lookup dict;
# Series.map replaces every value with the dict entry for it.
maps = {
    'Less than 100': 1,
    'Between 100 and 200': 2,
    'More than 200': 3,
}
df['Mapped Column'] = df['Price Category'].map(maps)

In [111]:
# Check the two new columns next to the prices they were derived from.
df.head(3)

Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,ORDERDATE,DAYS_SINCE_LASTORDER,STATUS,PRODUCTLINE,MSRP,CITY,COUNTRY,DEALSIZE,date_pd,date_dt,Price Category,Mapped Column
0,30,95.7,2871.0,24/02/2018,828,Shipped,Motorcycles,95,NYC,USA,Small,2018-02-24,2018-02-24,Less than 100,1
1,34,81.35,2765.9,07/05/2018,757,Shipped,Motorcycles,95,Reims,France,Small,2018-05-07,2018-05-07,Less than 100,1
2,41,94.74,3884.34,01/07/2018,703,Shipped,Motorcycles,95,Paris,France,Medium,2018-07-01,2018-07-01,Less than 100,1


In [112]:
# The counts should mirror those of 'Price Category', one code per label.
df['Mapped Column'].value_counts()

Mapped Column
1    1479
2    1212
3      56
Name: count, dtype: int64

In [113]:
# Sort by quantity (largest first); ties are broken by unit price (cheapest first).
# The `ascending` list lines up one-to-one with the `by` list.
df.sort_values(
    by=['QUANTITYORDERED', 'PRICEEACH'],
    ascending=[False, True],
)

Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,ORDERDATE,DAYS_SINCE_LASTORDER,STATUS,PRODUCTLINE,MSRP,CITY,COUNTRY,DEALSIZE,date_pd,date_dt,Price Category,Mapped Column
399,97,93.28,9048.16,14/04/2020,447,Shipped,Classic Cars,115,Strasbourg,France,Large,2020-04-14,2020-04-14,Less than 100,1
2511,85,88.75,7543.75,03/04/2020,2570,On Hold,Planes,99,Newark,USA,Large,2020-04-03,2020-04-03,Less than 100,1
2614,77,92.00,7084.00,03/04/2020,2673,On Hold,Planes,80,Newark,USA,Large,2020-04-03,2020-04-03,Less than 100,1
1656,76,94.50,7182.00,22/04/2020,1696,On Hold,Classic Cars,90,San Jose,USA,Large,2020-04-22,2020-04-22,Less than 100,1
1930,76,154.47,11739.70,14/04/2020,1978,Shipped,Classic Cars,140,Strasbourg,France,Large,2020-04-14,2020-04-14,Between 100 and 200,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2434,11,103.21,1135.31,03/04/2020,2493,On Hold,Planes,91,Newark,USA,Small,2020-04-03,2020-04-03,Between 100 and 200,2
871,10,88.14,881.40,30/05/2020,873,In Process,Vintage Cars,101,Bruxelles,Belgium,Small,2020-05-30,2020-05-30,Less than 100,1
298,10,109.22,1092.20,17/05/2020,313,Shipped,Classic Cars,117,Salzburg,Austria,Small,2020-05-17,2020-05-17,Between 100 and 200,2
1238,6,90.19,541.14,22/04/2020,1278,On Hold,Vintage Cars,92,San Jose,USA,Small,2020-04-22,2020-04-22,Less than 100,1


### str with isin/contains

In [114]:
# Distribution of order statuses, used as a reference for the filters below.
df.STATUS.value_counts()

STATUS
Shipped       2541
Cancelled       60
Resolved        47
On Hold         44
In Process      41
Disputed        14
Name: count, dtype: int64

In [115]:
# Substring match: keep rows whose STATUS contains 'Hold'
# (the pattern is interpreted as a regular expression by default).
df.loc[df['STATUS'].str.contains('Hold')].shape

(44, 15)

In [116]:
# Exact membership test: keep rows whose STATUS equals one of the listed values.
df.loc[df['STATUS'].isin(['On Hold', 'Resolved'])].shape

(91, 15)

## Join/Merge & Concat

In [117]:
# Lookup table: one row per distinct deal size plus a surrogate integer id.
df1 = pd.DataFrame(df.DEALSIZE.unique(), columns = ['dealsize'])
# Size the id range from the table itself rather than hard-coding three deal
# sizes, so the cell keeps working if the data gains or loses a category.
df1['deal_id'] = np.arange(1, len(df1) + 1)

# The key column is named differently on each side, hence left_on/right_on;
# both key columns are kept in the merged result.
df_merged = pd.merge(df[['DEALSIZE', 'QUANTITYORDERED']], df1, left_on = 'DEALSIZE', right_on = 'dealsize', how = 'inner')
df_merged.head()

Unnamed: 0,DEALSIZE,QUANTITYORDERED,dealsize,deal_id
0,Small,30,Small,1
1,Small,34,Small,1
2,Medium,41,Medium,2
3,Medium,45,Medium,2
4,Medium,36,Medium,2


In [118]:
# Stack the lookup table on top of itself (row-wise). The original index labels
# are carried over, so the result contains repeated labels 0, 1, 2.
df_concat = pd.concat((df1, df1), axis='index')
df_concat

Unnamed: 0,dealsize,deal_id
0,Small,1
1,Medium,2
2,Large,3
0,Small,1
1,Medium,2
2,Large,3


## any and all

In [119]:
# NOTE: df1 is rebound here and no longer refers to the deal-size lookup table
# built in the merge section.
df1 = df[['QUANTITYORDERED', 'MSRP']]
print('Initial Shape', len(df1))

# all(axis=1): rows where BOTH columns exceed 45.
print('Sum True with all:', df1.gt(45).all(axis=1).sum())

# any(axis=1): rows where AT LEAST ONE column exceeds 45.
print('Sum True with any:', df1.gt(45).any(axis=1).sum())

Initial Shape 2747
Sum True with all: 439
Sum True with any: 2598
