In [1]:
import pandas as pd
from pathlib import Path

# Location of the raw order export (relative path)
p = Path("Resources/order_data.csv")

# Read the CSV into the frame that the following cells inspect and clean
df = pd.read_csv(filepath_or_buffer=p)

In [8]:
# isna() flags each cell that is None, NaN or NaT (isnull() is an alias for it)
#df.isna()

#df.isnull()

# Collapse to one bool per column: True when the column holds at least one missing value
df.isna().any(axis=0)

order_no       False
customer_no     True
order_total     True
order_date     False
dtype: bool

In [12]:
# Same check on a single column (a Series) instead of the whole data frame
df["customer_no"].isna()

#df["customer_no"].isna().any()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
Name: customer_no, dtype: bool

In [13]:
# Opposite is notna(): True for every value that is present.
# The element-wise form is commented out because only the last expression
# in a cell is displayed, so its result was silently discarded.
#df.order_no.notna()

# True when at least one order_no is present
df.order_no.notna().any()

True

In [14]:
# Slicing
# Element-wise check on the rows from position 6 onward. Commented out because
# it is not the cell's last expression, so its result was never displayed;
# .iloc makes the positional intent of the slice explicit.
#df.order_no.iloc[6:].notna()

# Scalar check: is the order_no in the last row present?
pd.notna(df.order_no.iloc[-1])

True

In [15]:
# Strip the "$" character so the column can be cast to float.
# regex=False is passed explicitly: "$" is also the regex end-of-string anchor,
# and the default of `regex` changed from True to False in pandas 2.0, so
# relying on the default makes the result depend on the installed version.
df["order_total"] = df["order_total"].str.replace("$", "", regex=False)

In [16]:
# Replace missing order totals with 0.0 so the column can be cast to float.
# The fill is done through a single column assignment rather than an in-place
# call on a Series selected from the frame: pandas warns that the chained
# in-place form may not update the frame (and it does not under Copy-on-Write).
df["order_total"] = df["order_total"].fillna(0.0)

df

Unnamed: 0,order_no,customer_no,order_total,order_date
0,452517125,CM458565,141.25,01-10-2019
1,45251825,CJ458565,14.0,04/25/2019
2,4465241327,AK45765,1103.36,04-25-2019
3,4465241327,AK45765,1103.36,04-25-2019
4,413853121,CM458565,0.0,4/24/2019
5,45235825,TV4663,65.42,04-29-2019
6,2356363,2124,258936.12,04-29-2019
7,452519232,,141.25,01-10-2019


In [17]:
# Cast to numeric: convert the order_total column to float
df["order_total"] = df["order_total"].astype(float)

df["order_total"]

0       141.25
1        14.00
2      1103.36
3      1103.36
4         0.00
5        65.42
6    258936.12
7       141.25
Name: order_total, dtype: float64

In [18]:
# Convert dates to same format.
# The column mixes dash-separated and slash-separated month/day/year strings, so
# it is parsed once per format (non-matching rows become NaT via errors="coerce")
# and the slash-format result fills the gaps left by the dash-format pass.
df["order_date"] = (
    pd.to_datetime(df["order_date"], format="%m-%d-%Y", errors="coerce")
    .fillna(pd.to_datetime(df["order_date"], format="%m/%d/%Y", errors="coerce"))
)

# Optional: convert to unix time stamp
#df.order_date = df.order_date.apply(lambda x: x.timestamp() if pd.notnull(x) else None)



In [19]:
# Sort data: latest order_date first, ties ordered by largest order_total first.
# A single ascending=False applies the descending order to both sort keys.
df.sort_values(["order_date", "order_total"], ascending=False, inplace=True)
df

Unnamed: 0,order_no,customer_no,order_total,order_date
6,2356363,2124,258936.12,2019-04-29
5,45235825,TV4663,65.42,2019-04-29
2,4465241327,AK45765,1103.36,2019-04-25
3,4465241327,AK45765,1103.36,2019-04-25
1,45251825,CJ458565,14.0,2019-04-25
4,413853121,CM458565,0.0,2019-04-24
0,452517125,CM458565,141.25,2019-01-10
7,452519232,,141.25,2019-01-10


In [20]:
# Calculate metrics

# Row with the largest order_total (idxmax returns an index label, hence .loc)
highest_order = df.loc[df["order_total"].idxmax()]

# Row with the latest order_date
most_recent = df.loc[df["order_date"].idxmax()]

# All orders dated 2019-04-25
on_4_25 = df["order_date"] == pd.to_datetime('2019-04-25')
orders_on_4_25 = df.loc[on_4_25]

# Orders that have no customer number
missing_customer_no = df.loc[df["customer_no"].isna()]

# Sum of order_total over every row
# NOTE(review): the displayed frame lists order_no 4465241327 twice — confirm
# whether duplicate orders are meant to be counted in this total.
total_order_amount = df["order_total"].sum()

print(f"""
Highest order: {highest_order.order_total}
Most recent order: {most_recent.order_date}
Orders on 4/25: {len(orders_on_4_25)}
Orders with missing customer no: {len(missing_customer_no)}
Total order amount: ${total_order_amount}
""")



Highest order: 258936.12
Most recent order: 2019-04-29 00:00:00
Orders on 4/25: 3
Orders with missing customer no: 1
Total order amount: $261504.76

