# Pandas EDA

## 1. EDA Demonstration

In [3]:
import pandas as pd

In [5]:
# Load the orders table from CSV into a DataFrame named `orders`
# No `sep` argument is passed, so read_csv parses the file with its default comma separator
orders = pd.read_csv('data/orders.csv')

In [6]:
# Preview the first five rows of the orders table (five is also head's default)
orders.head(n=5)

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,product_id,sales,quantity,discount,profit,postal_code,region_id
0,ES-2019-3581298,2019-10-05,2019-10-10,Standard Class,MW-18220,FUR-BO-10002318,1239.84,3,0.25,0.19,,28804.0
1,CA-2018-2597967,2018-12-06,2018-12-08,Standard Class,TS-21430,FUR-FU-10003096,30.36,4,0.28,0.24,,7530.0
2,ES-2017-410130,2017-04-20,2017-04-21,Second Class,ES-14080,OFF-BI-10004007,14.69,1,0.12,0.1,,69959.0
3,IT-2019-1363430,2019-09-03,2019-09-04,Second Class,RP-19390,OFF-BI-10002040,15.4,1,0.15,0.1,,4715.0
4,ID-2018-682568,2018-10-29,2018-10-29,Standard Class,PH-18790,TEC-CO-10000452,656.99,3,0.23,0.19,6824.0,5636.0


In [7]:
# Turn the per-column dtypes Series into a one-column table labelled "DataType"
# so the column names and their data types can be read side by side
orders.dtypes.to_frame(name="DataType")

Unnamed: 0,DataType
order_id,object
order_date,object
ship_date,object
ship_mode,object
customer_id,object
product_id,object
sales,float64
quantity,int64
discount,float64
profit,float64


In [8]:
# Use the shape attribute to determine the total number of rows and columns
# shape is a (rows, columns) tuple: index 0 is the row count, index 1 the column count
print(f"There are {orders.shape[0]} rows and {orders.shape[1]} columns")

There are 5000 rows and 12 columns


In [10]:
# Use the .columns property to list out the column names
# Use the .index property to show the index itself (here a default RangeIndex)
print(orders.columns)
print(orders.index)

Index(['order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id',
       'product_id', 'sales', 'quantity', 'discount', 'profit', 'postal_code',
       'region_id'],
      dtype='object')
RangeIndex(start=0, stop=5000, step=1)


In [11]:
# Pass a {old_name: new_name} dictionary as `columns` to rename those columns
# inplace=False returns a renamed copy, so `orders` itself keeps its original column names
orders.rename(
    columns={
        "discount": "discount_amount",
        "order_date": "ordered_on_date",
    },
    inplace=False,
)

Unnamed: 0,order_id,ordered_on_date,ship_date,ship_mode,customer_id,product_id,sales,quantity,discount_amount,profit,postal_code,region_id
0,ES-2019-3581298,2019-10-05,2019-10-10,Standard Class,MW-18220,FUR-BO-10002318,1239.84,3,0.25,0.19,,28804.0
1,CA-2018-2597967,2018-12-06,2018-12-08,Standard Class,TS-21430,FUR-FU-10003096,30.36,4,0.28,0.24,,7530.0
2,ES-2017-410130,2017-04-20,2017-04-21,Second Class,ES-14080,OFF-BI-10004007,14.69,1,0.12,0.10,,69959.0
3,IT-2019-1363430,2019-09-03,2019-09-04,Second Class,RP-19390,OFF-BI-10002040,15.40,1,0.15,0.10,,4715.0
4,ID-2018-682568,2018-10-29,2018-10-29,Standard Class,PH-18790,TEC-CO-10000452,656.99,3,0.23,0.19,6824.0,5636.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,IN-2018-63053,2018-12-14,2018-12-16,Same Day,KM-16660,OFF-LA-10003644,54.39,7,0.23,0.19,,1410.0
4996,MX-2019-4981245,2019-11-28,2019-12-01,Standard Class,SC-20725,OFF-AR-10003680,19.74,1,0.15,0.10,78415.0,1488.0
4997,MZ-2019-4195017,2019-06-22,2019-06-26,Same Day,JW-5220,OFF-OIC-10000121,27.72,2,0.05,0.00,80027.0,4553.0
4998,NI-2018-1509129,2018-08-19,2018-08-22,Second Class,MC-7425,TEC-BEL-10003985,621.50,8,0.03,0.00,,19848.0


In [13]:
# Explore columns with column attributes and methods
# value_counts tallies how often each postal_code occurs; missing values (NaN) are left out by default
orders['postal_code'].value_counts()


10035.0    90
90049.0    66
98115.0    55
19143.0    48
94122.0    48
           ..
49423.0     1
85301.0     1
2149.0      1
33433.0     1
20852.0     1
Name: postal_code, Length: 278, dtype: int64

In [14]:
# Count the distinct postal_code values, treating NaN as one value of its own
orders['postal_code'].nunique(dropna=False)


279

In [15]:
# Summary statistics for profit: count, mean, std, min, quartiles and max
orders['profit'].describe()

count    5000.000000
mean        1.978328
std        38.126910
min      -734.530000
25%         0.100000
50%         0.200000
75%         0.290000
max      1537.830000
Name: profit, dtype: float64

## 2. Exploring a new dataset

In [11]:
# Import the products.csv dataset and explore the following questions:
# NOTE(review): this path is under data/datasets/, while orders.csv was read from data/ — confirm both locations are correct
products = pd.read_csv('data/datasets/products.csv')

In [12]:
# A. What are the columns and index: do they suggest any relationship to other tables?
# product_id is also a column of orders, so it presumably links the two tables — verify
# Only the columns are displayed here; products.index would show the index
products.columns

Index(['product_id', 'category', 'sub_category', 'product_name',
       'product_cost_to_consumer'],
      dtype='object')

In [13]:
# B. How many rows of data are there?
# The length of a DataFrame is its number of rows
len(products)

10292

In [14]:
# C. What are the types of each column?
# Turn the per-column dtypes Series into a one-column table labelled "DataType"
products.dtypes.to_frame(name="DataType")

Unnamed: 0,DataType
product_id,object
category,object
sub_category,object
product_name,object
product_cost_to_consumer,float64


## 3. Boolean Filtering

In [16]:
# Let's return to the orders dataset for the rest of the challenges
# Use boolean filtering and DataFrame/Series methods to solve the following challenges:

In [17]:
# A. What is the mean profit of orders where the ship_mode is "Second Class"?
# Select the profit values of the matching rows in a single .loc step, then average them
orders.loc[orders["ship_mode"] == "Second Class", "profit"].mean()

0.7557168894289046

In [25]:
# B. Which product is the most commonly ordered on 2019-10-05?
# Sum the quantity per product_id for that date; the product with the largest total is the answer
(
    orders.loc[orders["order_date"] == "2019-10-05", ["product_id", "quantity"]]
    .groupby("product_id")
    .sum()
)

Unnamed: 0_level_0,quantity
product_id,Unnamed: 1_level_1
FUR-BO-10002318,3
FUR-FU-10003878,2
OFF-BI-10000930,5
OFF-BI-10003806,2
OFF-ELD-10002279,2
OFF-EN-10004560,2
OFF-LA-10002088,1
OFF-PA-10004675,3
TEC-PH-10001817,4


## 4. Sorting and Filtering

In [28]:
# A. What are the three most profitable orders purchased by customer MW-18220?
# NOTE(review): this prompt previously named customer PO-8865, but the filter below selects MW-18220 — confirm which customer is intended
orders[orders['customer_id'] == "MW-18220"].sort_values(by="profit", ascending=False).head(3)

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,product_id,sales,quantity,discount,profit,postal_code,region_id
1266,ES-2018-2955506,2018-12-08,2018-12-08,Second Class,MW-18220,OFF-BI-10001808,393.84,8,0.33,0.29,,6017.0
0,ES-2019-3581298,2019-10-05,2019-10-10,Standard Class,MW-18220,FUR-BO-10002318,1239.84,3,0.25,0.19,,28804.0


In [30]:
# B. What are the five orders of product_id FUR-FU-10003096 with the highest quantity?
# Keep only that product's rows, order them by quantity (largest first), then take the top five
(
    orders.loc[orders['product_id'] == 'FUR-FU-10003096']
    .sort_values(by='quantity', ascending=False)
    .head(5)
)

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,product_id,sales,quantity,discount,profit,postal_code,region_id
1,CA-2018-2597967,2018-12-06,2018-12-08,Standard Class,TS-21430,FUR-FU-10003096,30.36,4,0.28,0.24,,7530.0


## Recap

We covered a lot of ground! It's ok if this takes a while to gel.

```python

# basic DataFrame operations
df.head()
df.tail()
df.shape
df.columns
df.index

# selecting columns
df.column_name
df['column_name']

# renaming columns
df.rename(columns={'old_name':'new_name'}, inplace=True) # without columns=, rename targets the row labels instead
df.columns = ['new_column_a', 'new_column_b']

# notable columns operations
df.describe() # summary statistics: count, mean, std, min, quartiles, max
df['col1'].nunique() # number of unique values (NaN is not counted by default)
df['col1'].value_counts() # number of occurrences of each value in column

# filtering
df[ df['col1'] < 50 ] # keep rows where col1 is less than 50
df[ (df['col1'] == value1) & (df['col2'] > value2) ] # keep rows where col1 equals value1 AND col2 is greater than value2

# sorting
df.sort_values(by='column_name', ascending = False) # sort biggest to smallest

```


It's common to refer back to your own code *all the time.* Don't hesitate to reference this guide! 🐼


