[Reference](https://betterprogramming.pub/9-pandas-functions-that-will-do-99-of-any-analytics-task-e6b6fb1b16bf)

In [1]:
import pandas as pd
import numpy as np

# Load './Olist-full.xlsx' into a DataFrame
df = pd.read_excel('./Olist-full.xlsx')

# Inspect the original column labels
df.columns

# Give the order and customer identifier columns new labels
df = df.rename(
    {
        'order_id': 'id_order_number',
        'customer_id': 'customer_number',
    },
    axis='columns',
)

# Inspect the labels again after the rename
df.columns

# Print a concise summary of the DataFrame
df.info()

# Summary statistics for the 'price' column
df.loc[:, 'price'].describe()

# Occurrences of each distinct 'payment_type' value
df.loc[:, 'payment_type'].value_counts()

# The same tally expressed as relative frequencies
df.loc[:, 'payment_type'].value_counts(normalize=True)

# For each column, report whether it holds at least one missing value
df.isna().any(axis=0)

# Drop the rows whose product_id is missing
df = df.dropna(subset=['product_id'])

# Parse order_purchase_timestamp into datetime values
df = df.assign(
    order_purchase_timestamp=pd.to_datetime(df['order_purchase_timestamp'])
)

# Add order_month, the month number taken from the purchase timestamp
df = df.assign(order_month=df['order_purchase_timestamp'].dt.month)

# Aggregate per customer_state in 3-month bins of the purchase timestamp:
# distinct order count, price sum/mean/max and freight mean/median.
# NOTE(review): pandas >= 2.2 deprecates the 'M' offset alias in favour of
# 'ME'; the alias is left untouched here so older pandas versions keep working.
buys_3m = df.groupby(
    [pd.Grouper(key='order_purchase_timestamp', freq='3M'), 'customer_state']
).agg({
    'id_order_number': 'nunique',
    'price': ['sum', 'mean', 'max'],
    'freight_value': ['mean', 'median'],
}).reset_index()

# Flatten the two-level column labels into '<column>_<aggregation>'.
# The group keys restored by reset_index() have an empty second level, so
# empty parts are skipped; joining them blindly would name those columns
# 'order_purchase_timestamp_' and 'customer_state_' (trailing underscore).
buys_3m.columns = [
    '_'.join(part for part in col if part) for col in buys_3m.columns
]

# Rows whose seller_state is 'SP' and whose price is greater than or equal to 115
sp_above_mean = df.loc[(df['seller_state'] == 'SP') & (df['price'] >= 115)]

# 1st and 99th percentiles of price, used as bounds to trim outliers
q1, q2 = df['price'].quantile([0.01, 0.99])

# Keep only the rows whose price lies within [q1, q2], bounds included
df_price_outliers = df[df['price'].between(q1, q2)]

# Create a new column with apply: label each row 'UP' when its price is at or
# above the mean price, otherwise 'DOWN'.
# The mean is computed once up front; calling df['price'].mean() inside the
# lambda recomputed it for every single row.
mean_price = df['price'].mean()
df['price_status'] = df['price'].apply(
    lambda x: 'UP' if x >= mean_price else 'DOWN'
)

# Create a new column by passing each payment_type value through map()
# NOTE(review): `credit_cards` is not defined anywhere in the code above, so
# this line raises NameError as written. It must be defined before this
# statement; presumably it is a dict keyed on the payment_type values —
# TODO confirm the intended mapping.
df['seller_by_payment'] = df['payment_type'].map(credit_cards)