# Advance Functions of Pandas

* *The following files helps to understand the code better:*
* *Reduce the memory intake and ease our effort to get desired output.*

* *This notebook demonstrates advanced functions in Pandas to enhance efficiency and optimize memory usage, making it easier to obtain the desired output.*

# Import Library

* Essential libraries are imported to perform operations effectively.

In [3]:
import pandas as pd

# Load Dataset

* The dataset is loaded to perform advanced data manipulation and analysis.

In [4]:
df = pd.read_excel('global_superstore/Global Superstore.xls')

In [5]:
df.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'City', 'State', 'Country',
       'Postal Code', 'Market', 'Region', 'Product ID', 'Category',
       'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount',
       'Profit', 'Shipping Cost', 'Order Priority'],
      dtype='object')

In [6]:
df.shape

(51290, 24)

In [7]:
df.dtypes

Row ID                     int64
Order ID                  object
Order Date        datetime64[ns]
Ship Date         datetime64[ns]
Ship Mode                 object
Customer ID               object
Customer Name             object
Segment                   object
City                      object
State                     object
Country                   object
Postal Code              float64
Market                    object
Region                    object
Product ID                object
Category                  object
Sub-Category              object
Product Name              object
Sales                    float64
Quantity                   int64
Discount                 float64
Profit                   float64
Shipping Cost            float64
Order Priority            object
dtype: object

In [8]:
df.isnull().sum()

Row ID                0
Order ID              0
Order Date            0
Ship Date             0
Ship Mode             0
Customer ID           0
Customer Name         0
Segment               0
City                  0
State                 0
Country               0
Postal Code       41296
Market                0
Region                0
Product ID            0
Category              0
Sub-Category          0
Product Name          0
Sales                 0
Quantity              0
Discount              0
Profit                0
Shipping Cost         0
Order Priority        0
dtype: int64

In [9]:
df.describe()

Unnamed: 0,Row ID,Order Date,Ship Date,Postal Code,Sales,Quantity,Discount,Profit,Shipping Cost
count,51290.0,51290,51290,9994.0,51290.0,51290.0,51290.0,51290.0,51290.0
mean,25645.5,2013-05-11 21:26:49.155781120,2013-05-15 20:42:42.745174528,55190.379428,246.490581,3.476545,0.142908,28.610982,26.375818
min,1.0,2011-01-01 00:00:00,2011-01-03 00:00:00,1040.0,0.444,1.0,0.0,-6599.978,0.002
25%,12823.25,2012-06-19 00:00:00,2012-06-23 00:00:00,23223.0,30.758625,2.0,0.0,0.0,2.61
50%,25645.5,2013-07-08 00:00:00,2013-07-12 00:00:00,56430.5,85.053,3.0,0.0,9.24,7.79
75%,38467.75,2014-05-22 00:00:00,2014-05-26 00:00:00,90008.0,251.0532,5.0,0.2,36.81,24.45
max,51290.0,2014-12-31 00:00:00,2015-01-07 00:00:00,99301.0,22638.48,14.0,0.85,8399.976,933.57
std,14806.29199,,,32063.69335,487.565361,2.278766,0.21228,174.340972,57.29681


In [10]:
df.describe(include='all')

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
count,51290.0,51290,51290,51290,51290,51290,51290,51290,51290,51290,...,51290,51290,51290,51290,51290.0,51290.0,51290.0,51290.0,51290.0,51290
unique,,25035,,,4,1590,795,3,3636,1094,...,10292,3,17,3788,,,,,,4
top,,CA-2014-100111,,,Standard Class,PO-18850,Muhammed Yedwab,Consumer,New York City,California,...,OFF-AR-10003651,Office Supplies,Binders,Staples,,,,,,Medium
freq,,14,,,30775,97,108,26518,915,2001,...,35,31273,6152,227,,,,,,29433
mean,25645.5,,2013-05-11 21:26:49.155781120,2013-05-15 20:42:42.745174528,,,,,,,...,,,,,246.490581,3.476545,0.142908,28.610982,26.375818,
min,1.0,,2011-01-01 00:00:00,2011-01-03 00:00:00,,,,,,,...,,,,,0.444,1.0,0.0,-6599.978,0.002,
25%,12823.25,,2012-06-19 00:00:00,2012-06-23 00:00:00,,,,,,,...,,,,,30.758625,2.0,0.0,0.0,2.61,
50%,25645.5,,2013-07-08 00:00:00,2013-07-12 00:00:00,,,,,,,...,,,,,85.053,3.0,0.0,9.24,7.79,
75%,38467.75,,2014-05-22 00:00:00,2014-05-26 00:00:00,,,,,,,...,,,,,251.0532,5.0,0.2,36.81,24.45,
max,51290.0,,2014-12-31 00:00:00,2015-01-07 00:00:00,,,,,,,...,,,,,22638.48,14.0,0.85,8399.976,933.57,


#  ***Performing Advance Functions***

* This section covers several advanced Pandas functionalities.

In [11]:
df[['Segment', 'City', 'State']]

Unnamed: 0,Segment,City,State
0,Consumer,New York City,New York
1,Corporate,Wollongong,New South Wales
2,Consumer,Brisbane,Queensland
3,Home Office,Berlin,Berlin
4,Consumer,Dakar,Dakar
...,...,...,...
51285,Corporate,Kure,Hiroshima
51286,Consumer,Houston,Texas
51287,Home Office,Oxnard,California
51288,Home Office,Valinhos,São Paulo


In [12]:
df[(df['Segment'] == 'Consumer') & (df['Category'] == 'Technology')]

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
0,32298,CA-2012-124891,2012-07-31,2012-07-31,Same Day,RH-19495,Rick Hansen,Consumer,New York City,New York,...,TEC-AC-10003033,Technology,Accessories,Plantronics CS510 - Over-the-Head monaural Wir...,2309.650,7,0.0,762.1845,933.57,Critical
2,25330,IN-2013-71249,2013-10-17,2013-10-18,First Class,CR-12730,Craig Reiter,Consumer,Brisbane,Queensland,...,TEC-PH-10004664,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.9710,915.49,Medium
4,47221,SG-2013-4320,2013-11-05,2013-11-06,Same Day,RH-9495,Rick Hansen,Consumer,Dakar,Dakar,...,TEC-SHA-10000501,Technology,Copiers,"Sharp Wireless Fax, High-Speed",2832.960,8,0.0,311.5200,903.04,Critical
12,45794,SA-2011-1830,2011-12-27,2011-12-29,Second Class,MM-7260,Magdelene Morse,Consumer,Jizan,Jizan,...,TEC-CIS-10001717,Technology,Phones,"Cisco Smart Phone, with Caller ID",2616.960,4,0.0,1151.4000,832.41,Critical
21,31784,CA-2011-154627,2011-10-29,2011-10-31,First Class,SA-20830,Sue Ann Reed,Consumer,Chicago,Illinois,...,TEC-PH-10001363,Technology,Phones,Apple iPhone 5S,2735.952,6,0.2,341.9940,752.51,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50885,40460,CA-2012-164007,2012-06-08,2012-06-12,Standard Class,MG-17695,Maureen Gnade,Consumer,Chicago,Illinois,...,TEC-AC-10003433,Technology,Accessories,Maxell 4.7GB DVD+R 5/Pack,2.376,3,0.2,0.7425,0.18,Medium
51007,44682,SO-2011-1410,2011-12-16,2011-12-21,Standard Class,CS-1860,Cari Schnelling,Consumer,Hargeysa,Woqooyi Galbeed,...,TEC-MEM-10001163,Technology,Accessories,"Memorex Mouse, Erganomic",25.980,1,0.0,1.0200,0.15,Medium
51095,45007,KZ-2011-7910,2011-07-20,2011-07-20,Same Day,MM-8055,Michelle Moray,Consumer,Pavlodar,Pavlodar,...,TEC-LOG-10002589,Technology,Accessories,"Logitech Memory Card, USB",31.077,1,0.7,-55.9530,0.12,Medium
51143,39197,CA-2014-128363,2014-08-14,2014-08-19,Standard Class,DC-12850,Dan Campbell,Consumer,Memphis,Tennessee,...,TEC-AC-10003709,Technology,Accessories,Maxell 4.7GB DVD-R 5/Pack,1.584,2,0.2,0.4752,0.10,Medium


In [14]:
df[(df['Profit'] > 1000) & (df['Sales'] > 1000)]

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
8,40155,CA-2014-135909,2014-10-14,2014-10-21,Standard Class,JW-15220,Jane Waco,Corporate,Sacramento,California,...,OFF-BI-10003527,Office Supplies,Binders,Fellowes PB500 Electric Punch Plastic Comb Bin...,5083.96,5,0.2,1906.4850,867.69000,Low
12,45794,SA-2011-1830,2011-12-27,2011-12-29,Second Class,MM-7260,Magdelene Morse,Consumer,Jizan,Jizan,...,TEC-CIS-10001717,Technology,Phones,"Cisco Smart Phone, with Caller ID",2616.96,4,0.0,1151.4000,832.41000,Critical
14,27704,IN-2013-73951,2013-06-06,2013-06-08,Second Class,PF-19120,Peter Fuller,Consumer,Mudanjiang,Heilongjiang,...,OFF-AP-10003500,Office Supplies,Appliances,"KitchenAid Microwave, White",3701.52,12,0.0,1036.0800,804.54000,Critical
17,12069,ES-2014-1651774,2014-09-08,2014-09-14,Standard Class,PJ-18835,Patrick Jones,Corporate,Prato,Tuscany,...,OFF-AP-10004512,Office Supplies,Appliances,"Hoover Stove, Red",7958.58,14,0.0,3979.0800,778.32000,Low
23,13528,ES-2013-2860574,2013-02-27,2013-03-01,Second Class,LB-16795,Laurel Beltran,Home Office,Edinburgh,Scotland,...,OFF-AP-10003590,Office Supplies,Appliances,"KitchenAid Refrigerator, Black",5273.70,10,0.0,1898.4000,730.91000,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13013,23447,IN-2011-77892,2011-04-02,2011-04-08,Standard Class,HL-15040,Hunter Lopez,Consumer,Rajshahi,Rajshahi,...,TEC-PH-10001751,Technology,Phones,"Samsung Smart Phone, Cordless",3195.00,5,0.0,1150.2000,23.95000,Medium
14843,39450,CA-2014-140151,2014-03-24,2014-03-26,First Class,RB-19360,Raymond Buch,Consumer,Seattle,Washington,...,TEC-CO-10004722,Technology,Copiers,Canon imageCLASS 2200 Advanced Copier,13999.96,4,0.0,6719.9808,20.00054,Medium
16062,39785,CA-2013-158841,2013-02-02,2013-02-04,Second Class,SE-20110,Sanjit Engle,Consumer,Arlington,Virginia,...,TEC-MA-10001127,Technology,Machines,HP Designjet T520 Inkjet Large Format Printer ...,8749.95,5,0.0,2799.9840,17.83000,Critical
19307,21850,IN-2011-50060,2011-09-06,2011-09-13,Standard Class,MC-17575,Matt Collins,Consumer,Yancheng,Jiangsu,...,FUR-BO-10000035,Furniture,Bookcases,"Dania Classic Bookcase, Pine",2472.66,6,0.0,1038.4200,13.25000,Medium


# Making Queries on Complete DataFrame

* Learn how to query a complete DataFrame effectively to extract specific data.

In [15]:
df.query('Profit > 1000 & Discount < 0.2')

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
12,45794,SA-2011-1830,2011-12-27,2011-12-29,Second Class,MM-7260,Magdelene Morse,Consumer,Jizan,Jizan,...,TEC-CIS-10001717,Technology,Phones,"Cisco Smart Phone, with Caller ID",2616.960,4,0.0,1151.4000,832.41000,Critical
14,27704,IN-2013-73951,2013-06-06,2013-06-08,Second Class,PF-19120,Peter Fuller,Consumer,Mudanjiang,Heilongjiang,...,OFF-AP-10003500,Office Supplies,Appliances,"KitchenAid Microwave, White",3701.520,12,0.0,1036.0800,804.54000,Critical
17,12069,ES-2014-1651774,2014-09-08,2014-09-14,Standard Class,PJ-18835,Patrick Jones,Corporate,Prato,Tuscany,...,OFF-AP-10004512,Office Supplies,Appliances,"Hoover Stove, Red",7958.580,14,0.0,3979.0800,778.32000,Low
23,13528,ES-2013-2860574,2013-02-27,2013-03-01,Second Class,LB-16795,Laurel Beltran,Home Office,Edinburgh,Scotland,...,OFF-AP-10003590,Office Supplies,Appliances,"KitchenAid Refrigerator, Black",5273.700,10,0.0,1898.4000,730.91000,High
27,11645,ES-2011-4699764,2011-03-14,2011-03-17,Second Class,EB-14110,Eugene Barchas,Consumer,Leipzig,Saxony,...,OFF-AP-10004512,Office Supplies,Appliances,"Hoover Stove, Red",3069.738,6,0.1,1364.2380,725.34000,Critical
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13013,23447,IN-2011-77892,2011-04-02,2011-04-08,Standard Class,HL-15040,Hunter Lopez,Consumer,Rajshahi,Rajshahi,...,TEC-PH-10001751,Technology,Phones,"Samsung Smart Phone, Cordless",3195.000,5,0.0,1150.2000,23.95000,Medium
14843,39450,CA-2014-140151,2014-03-24,2014-03-26,First Class,RB-19360,Raymond Buch,Consumer,Seattle,Washington,...,TEC-CO-10004722,Technology,Copiers,Canon imageCLASS 2200 Advanced Copier,13999.960,4,0.0,6719.9808,20.00054,Medium
16062,39785,CA-2013-158841,2013-02-02,2013-02-04,Second Class,SE-20110,Sanjit Engle,Consumer,Arlington,Virginia,...,TEC-MA-10001127,Technology,Machines,HP Designjet T520 Inkjet Large Format Printer ...,8749.950,5,0.0,2799.9840,17.83000,Critical
19307,21850,IN-2011-50060,2011-09-06,2011-09-13,Standard Class,MC-17575,Matt Collins,Consumer,Yancheng,Jiangsu,...,FUR-BO-10000035,Furniture,Bookcases,"Dania Classic Bookcase, Pine",2472.660,6,0.0,1038.4200,13.25000,Medium


In [44]:
df.query('Profit > 1000 & Discount < 0.2 & Segment == "Consumer"')

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,New_column,Actual Cost,Actual Cost 1,Profit Group
12,45794,SA-2011-1830,2011-12-27,2011-12-29,Second Class,MM-7260,Magdelene Morse,Consumer,Jizan,Jizan,...,2616.960,4,0.0,1151.4000,832.41000,Critical,2302.8,3449.37000,1784.55000,High Profit
14,27704,IN-2013-73951,2013-06-06,2013-06-08,Second Class,PF-19120,Peter Fuller,Consumer,Mudanjiang,Heilongjiang,...,3701.520,12,0.0,1036.0800,804.54000,Critical,2072.16,4506.06000,2896.98000,High Profit
27,11645,ES-2011-4699764,2011-03-14,2011-03-17,Second Class,EB-14110,Eugene Barchas,Consumer,Leipzig,Saxony,...,3069.738,6,0.1,1364.2380,725.34000,Critical,2728.476,3795.07800,2651.37180,High Profit
36,36423,CA-2011-160766,2011-09-14,2011-09-14,Same Day,DM-13015,Darrin Martin,Consumer,New York City,New York,...,2799.960,4,0.0,1371.9804,675.15000,High,2743.9608,3475.11000,2124.81000,High Profit
38,15380,ES-2014-2637201,2014-01-14,2014-01-18,Standard Class,PO-18865,Patrick O'Donnell,Consumer,Stockton-on-Tees,England,...,4141.020,13,0.0,1697.6700,668.96000,High,3395.34,4809.98000,3472.06000,High Profit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13013,23447,IN-2011-77892,2011-04-02,2011-04-08,Standard Class,HL-15040,Hunter Lopez,Consumer,Rajshahi,Rajshahi,...,3195.000,5,0.0,1150.2000,23.95000,Medium,2300.4,3218.95000,3171.05000,High Profit
14843,39450,CA-2014-140151,2014-03-24,2014-03-26,First Class,RB-19360,Raymond Buch,Consumer,Seattle,Washington,...,13999.960,4,0.0,6719.9808,20.00054,Medium,13439.9616,14019.96054,13979.95946,High Profit
16062,39785,CA-2013-158841,2013-02-02,2013-02-04,Second Class,SE-20110,Sanjit Engle,Consumer,Arlington,Virginia,...,8749.950,5,0.0,2799.9840,17.83000,Critical,5599.968,8767.78000,8732.12000,High Profit
19307,21850,IN-2011-50060,2011-09-06,2011-09-13,Standard Class,MC-17575,Matt Collins,Consumer,Yancheng,Jiangsu,...,2472.660,6,0.0,1038.4200,13.25000,Medium,2076.84,2485.91000,2459.41000,High Profit


In [19]:
df.query('Profit > 1000 & Discount < 0.2 & Segment in ["Consumer", "Corporate"]')

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
12,45794,SA-2011-1830,2011-12-27,2011-12-29,Second Class,MM-7260,Magdelene Morse,Consumer,Jizan,Jizan,...,TEC-CIS-10001717,Technology,Phones,"Cisco Smart Phone, with Caller ID",2616.960,4,0.0,1151.4000,832.41000,Critical
14,27704,IN-2013-73951,2013-06-06,2013-06-08,Second Class,PF-19120,Peter Fuller,Consumer,Mudanjiang,Heilongjiang,...,OFF-AP-10003500,Office Supplies,Appliances,"KitchenAid Microwave, White",3701.520,12,0.0,1036.0800,804.54000,Critical
17,12069,ES-2014-1651774,2014-09-08,2014-09-14,Standard Class,PJ-18835,Patrick Jones,Corporate,Prato,Tuscany,...,OFF-AP-10004512,Office Supplies,Appliances,"Hoover Stove, Red",7958.580,14,0.0,3979.0800,778.32000,Low
27,11645,ES-2011-4699764,2011-03-14,2011-03-17,Second Class,EB-14110,Eugene Barchas,Consumer,Leipzig,Saxony,...,OFF-AP-10004512,Office Supplies,Appliances,"Hoover Stove, Red",3069.738,6,0.1,1364.2380,725.34000,Critical
35,47905,CG-2011-8610,2011-09-14,2011-09-15,First Class,AH-30,Aaron Hawkins,Corporate,Kamina,Katanga,...,TEC-APP-10000308,Technology,Phones,"Apple Smart Phone, Full Size",3817.260,6,0.0,1068.6600,678.15000,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13013,23447,IN-2011-77892,2011-04-02,2011-04-08,Standard Class,HL-15040,Hunter Lopez,Consumer,Rajshahi,Rajshahi,...,TEC-PH-10001751,Technology,Phones,"Samsung Smart Phone, Cordless",3195.000,5,0.0,1150.2000,23.95000,Medium
14843,39450,CA-2014-140151,2014-03-24,2014-03-26,First Class,RB-19360,Raymond Buch,Consumer,Seattle,Washington,...,TEC-CO-10004722,Technology,Copiers,Canon imageCLASS 2200 Advanced Copier,13999.960,4,0.0,6719.9808,20.00054,Medium
16062,39785,CA-2013-158841,2013-02-02,2013-02-04,Second Class,SE-20110,Sanjit Engle,Consumer,Arlington,Virginia,...,TEC-MA-10001127,Technology,Machines,HP Designjet T520 Inkjet Large Format Printer ...,8749.950,5,0.0,2799.9840,17.83000,Critical
19307,21850,IN-2011-50060,2011-09-06,2011-09-13,Standard Class,MC-17575,Matt Collins,Consumer,Yancheng,Jiangsu,...,FUR-BO-10000035,Furniture,Bookcases,"Dania Classic Bookcase, Pine",2472.660,6,0.0,1038.4200,13.25000,Medium


In [20]:
df['New_column'] = ''
for i1 in list(range(df.shape[0])):
    df.loc[i1, 'New_column'] = df.loc[i1, 'Profit'] * 2

# Applying a Function on Complete Columns

* Apply custom or built-in functions to entire columns for efficient data processing.

In [22]:
df['Actual Cost'] = df.apply(lambda row: row['Sales'] + row['Shipping Cost'], axis=1)

In [24]:
df['Actual Cost']

0        3243.220
1        4633.025
2        6090.661
3        3802.670
4        3736.000
           ...   
51285      65.110
51286       0.454
51287      22.930
51288      13.443
51289      61.382
Name: Actual Cost, Length: 51290, dtype: float64

In [25]:
df['Actual Cost 1'] = df.apply(lambda row: row['Sales'] - row['Shipping Cost'] + row['Sales'] * row['Discount'], axis=1)

In [26]:
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,New_column,Actual Cost,Actual Cost 1
0,32298,CA-2012-124891,2012-07-31,2012-07-31,Same Day,RH-19495,Rick Hansen,Consumer,New York City,New York,...,Plantronics CS510 - Over-the-Head monaural Wir...,2309.65,7,0.0,762.1845,933.57,Critical,1524.369,3243.22,1376.08
1,26341,IN-2013-77878,2013-02-05,2013-02-07,Second Class,JR-16210,Justin Ritter,Corporate,Wollongong,New South Wales,...,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.765,923.63,Critical,-577.53,4633.025,3156.7045
2,25330,IN-2013-71249,2013-10-17,2013-10-18,First Class,CR-12730,Craig Reiter,Consumer,Brisbane,Queensland,...,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.971,915.49,Medium,1839.942,6090.661,4777.1981
3,13524,ES-2013-1579342,2013-01-28,2013-01-30,First Class,KM-16375,Katherine Murray,Home Office,Berlin,Berlin,...,"Motorola Smart Phone, Cordless",2892.51,5,0.1,-96.54,910.16,Medium,-193.08,3802.67,2271.601
4,47221,SG-2013-4320,2013-11-05,2013-11-06,Same Day,RH-9495,Rick Hansen,Consumer,Dakar,Dakar,...,"Sharp Wireless Fax, High-Speed",2832.96,8,0.0,311.52,903.04,Critical,623.04,3736.0,1929.92


In [27]:
df.tail()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,New_column,Actual Cost,Actual Cost 1
51285,29002,IN-2014-62366,2014-06-19,2014-06-19,Same Day,KE-16420,Katrina Edelman,Corporate,Kure,Hiroshima,...,"Advantus Thumb Tacks, 12 Pack",65.1,5,0.0,4.5,0.01,Medium,9.0,65.11,65.09
51286,35398,US-2014-102288,2014-06-20,2014-06-24,Standard Class,ZC-21910,Zuschuss Carroll,Consumer,Houston,Texas,...,Hoover Replacement Belt for Commercial Guardsm...,0.444,1,0.8,-1.11,0.01,Medium,-2.22,0.454,0.7892
51287,40470,US-2013-155768,2013-12-02,2013-12-02,Same Day,LB-16795,Laurel Beltran,Home Office,Oxnard,California,...,"#10- 4 1/8"" x 9 1/2"" Security-Tint Envelopes",22.92,3,0.0,11.2308,0.01,High,22.4616,22.93,22.91
51288,9596,MX-2012-140767,2012-02-18,2012-02-22,Standard Class,RB-19795,Ross Baird,Home Office,Valinhos,São Paulo,...,"Acco Index Tab, Economy",13.44,2,0.0,2.4,0.003,Medium,4.8,13.443,13.437
51289,6147,MX-2012-134460,2012-05-22,2012-05-26,Second Class,MC-18100,Mick Crebagga,Consumer,Tipitapa,Managua,...,"Eaton Computer Printout Paper, 8.5 x 11",61.38,3,0.0,1.8,0.002,High,3.6,61.382,61.378


In [28]:
df.head(10)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,New_column,Actual Cost,Actual Cost 1
0,32298,CA-2012-124891,2012-07-31,2012-07-31,Same Day,RH-19495,Rick Hansen,Consumer,New York City,New York,...,Plantronics CS510 - Over-the-Head monaural Wir...,2309.65,7,0.0,762.1845,933.57,Critical,1524.369,3243.22,1376.08
1,26341,IN-2013-77878,2013-02-05,2013-02-07,Second Class,JR-16210,Justin Ritter,Corporate,Wollongong,New South Wales,...,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.765,923.63,Critical,-577.53,4633.025,3156.7045
2,25330,IN-2013-71249,2013-10-17,2013-10-18,First Class,CR-12730,Craig Reiter,Consumer,Brisbane,Queensland,...,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.971,915.49,Medium,1839.942,6090.661,4777.1981
3,13524,ES-2013-1579342,2013-01-28,2013-01-30,First Class,KM-16375,Katherine Murray,Home Office,Berlin,Berlin,...,"Motorola Smart Phone, Cordless",2892.51,5,0.1,-96.54,910.16,Medium,-193.08,3802.67,2271.601
4,47221,SG-2013-4320,2013-11-05,2013-11-06,Same Day,RH-9495,Rick Hansen,Consumer,Dakar,Dakar,...,"Sharp Wireless Fax, High-Speed",2832.96,8,0.0,311.52,903.04,Critical,623.04,3736.0,1929.92
5,22732,IN-2013-42360,2013-06-28,2013-07-01,Second Class,JM-15655,Jim Mitchum,Corporate,Sydney,New South Wales,...,"Samsung Smart Phone, with Caller ID",2862.675,5,0.1,763.275,897.35,Critical,1526.55,3760.025,2251.5925
6,30570,IN-2011-81826,2011-11-07,2011-11-09,First Class,TS-21340,Toby Swindell,Consumer,Porirua,Wellington,...,"Novimex Executive Leather Armchair, Adjustable",1822.08,4,0.0,564.84,894.77,Critical,1129.68,2716.85,927.31
7,31192,IN-2012-86369,2012-04-14,2012-04-18,Standard Class,MB-18085,Mick Brown,Consumer,Hamilton,Waikato,...,"Chromcraft Conference Table, Fully Assembled",5244.84,6,0.0,996.48,878.38,High,1992.96,6123.22,4366.46
8,40155,CA-2014-135909,2014-10-14,2014-10-21,Standard Class,JW-15220,Jane Waco,Corporate,Sacramento,California,...,Fellowes PB500 Electric Punch Plastic Comb Bin...,5083.96,5,0.2,1906.485,867.69,Low,3812.97,5951.65,5233.062
9,40936,CA-2012-116638,2012-01-28,2012-01-31,Second Class,JH-15985,Joseph Holt,Consumer,Concord,North Carolina,...,Chromcraft Bull-Nose Wood Oval Conference Tabl...,4297.644,13,0.4,-1862.3124,865.74,Critical,-3724.6248,5163.384,5150.9616


# Grouping Data (`Group By`)

* Explore how to group data using the `groupby` method for insightful aggregation and analysis.

In [31]:
df.groupby('State')['Sales'].sum()

State
'Ajman        209.98800
'Amman       7167.99000
'Asir        1603.50000
Abia         1064.12400
Abruzzi      6723.75000
               ...     
Zinder        362.73000
Zulia        4208.93696
Zürich      11707.74000
Šiauliai       96.56100
Žilina         60.12000
Name: Sales, Length: 1094, dtype: float64

In [32]:
df.groupby('Segment')['Profit'].mean()

Segment
Consumer       28.254008
Corporate      28.596042
Home Office    29.648847
Name: Profit, dtype: float64

# Making Bins

* Understand the process of creating bins to categorize data into intervals.

* `pd.cut` is a powerful function in Pandas that is used to segment and sort data values into discrete bins. It’s commonly used for converting continuous numerical data into categorical data by dividing it into intervals. This is particularly helpful in tasks such as grouping data for analysis or creating histograms.
  - **bins:** The number of bins, or the specific bin edges. It can be an integer or a sequence of scalars.
  - **labels:** Labels for the bins. If `False`, integer indicators of the bins are returned.

In [39]:
df['Profit Group'] = pd.cut(df['Profit'], bins=[0, 500, 1000, 10000], labels=['Loss', 'Low Profit', 'High Profit'])

In [40]:
df['Profit Group']

0        Low Profit
1               NaN
2        Low Profit
3               NaN
4              Loss
            ...    
51285          Loss
51286           NaN
51287          Loss
51288          Loss
51289          Loss
Name: Profit Group, Length: 51290, dtype: category
Categories (3, object): ['Loss' < 'Low Profit' < 'High Profit']

In [45]:
df['Profit Group'].unique()

['Low Profit', NaN, 'Loss', 'High Profit']
Categories (3, object): ['Loss' < 'Low Profit' < 'High Profit']

In [46]:
df['Discount Group'] = pd.cut(df['Discount'], bins=[0, 0.1, 0.2, 0.3], labels=['Low Discount', 'Medium Discount', 'High Discount'])

In [47]:
df['Discount Group'].unique()

[NaN, 'Low Discount', 'Medium Discount', 'High Discount']
Categories (3, object): ['Low Discount' < 'Medium Discount' < 'High Discount']

# Checking Null Values in the Dataset

* Identify and handle missing values in the dataset for cleaner data and accurate analysis.

In [48]:
df.isnull().sum()

Row ID                0
Order ID              0
Order Date            0
Ship Date             0
Ship Mode             0
Customer ID           0
Customer Name         0
Segment               0
City                  0
State                 0
Country               0
Postal Code       41296
Market                0
Region                0
Product ID            0
Category              0
Sub-Category          0
Product Name          0
Sales                 0
Quantity              0
Discount              0
Profit                0
Shipping Cost         0
Order Priority        0
New_column            0
Actual Cost           0
Actual Cost 1         0
Profit Group      13212
Discount Group    39370
dtype: int64