In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as pt
%matplotlib inline

import datetime
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [None]:
# Load the raw orders table; the first CSV column is used as the row index.
orders = pd.read_csv('orders.csv', index_col=0)

# A cell only renders its final bare expression, so every inspection is routed
# through display() to keep the preview, the summary statistics and the column
# dtypes all visible, however the cells are split.
display(orders.head(), orders.describe(), orders.dtypes)

### Problem 1: Dataset Import & Cleaning
Check the **"Profit"** and **"Sales"** columns in the dataset and convert both to a numeric type.

In [None]:
########### Strip currency formatting from Profit and Sales, then convert to numeric

def _currency_to_numeric(column):
    """Return `column` as numbers, removing '$' and ',' from text values.

    The '.' is deliberately left in place: deleting it would drop the decimal
    point and inflate every amount (e.g. '$62.15' would become 6215).
    Assumes values are formatted like '$1,234.56' ('.' = decimal point,
    ',' = thousands separator) -- TODO confirm against orders.csv.

    A column that is already numeric is returned unchanged, so re-running the
    cell does not fail on the `.str` accessor.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    # regex=False: '$' and '.' are regex metacharacters, so match literally
    # instead of relying on the pandas-version-dependent default.
    stripped = (
        column.str.replace('$', '', regex=False)
              .str.replace(',', '', regex=False)
    )
    return pd.to_numeric(stripped)


########## Switch Profit and Sales to numeric values
orders['Profit'] = _currency_to_numeric(orders['Profit'])
orders['Sales'] = _currency_to_numeric(orders['Sales'])


### Problem 2: Inventory Management
- Retailers that depend on seasonal shoppers have a particularly challenging job when it comes to inventory management. Your manager is making plans for next year's inventory.
- He wants you to answer the following questions:
    1. Is there any seasonal trend of inventory in the company?
    2. Is the seasonal trend the same for different categories?

- ***Hint:*** Each order has an attribute called `Quantity` that indicates the number of products in the order. If an order contains more than one product, there will be multiple observations of the same order.

In [None]:
######### Create a Month column (1-12) from the order date to stand in for seasons
orders['Month'] = pd.to_datetime(orders['Order.Date']).dt.month

# Shown via display() because a bare expression in the middle of a cell is
# evaluated but never rendered.
display(orders.dtypes)

########## New DataFrame holding only the columns needed for this question
# .copy() makes the subset an independent frame, so later assignments to it
# cannot raise SettingWithCopyWarning or be mistaken for edits to `orders`.
orders_anova = orders[['Quantity', 'Month', 'Category']].copy()
display(orders_anova.describe(), orders_anova.dtypes)



######## One-way ANOVA: does the mean Quantity differ between months?

# One Quantity sample per month, keyed by month number. Grouping on the data
# (rather than looping over a hard-coded 1..12 list) means a month with no
# orders is simply absent instead of being passed to the test as an empty
# sample.
month_dict = {
    month: quantities
    for month, quantities in orders_anova.groupby('Month')['Quantity']
}

# Bind the result to a name and print it: as a bare expression it would only
# be rendered if it happened to be the last statement of its cell.
monthly_anova = stats.f_oneway(*month_dict.values())
print(monthly_anova)

# Recorded output from the original run:
#   F_onewayResult(statistic=3.4131077340608647, pvalue=9.374920484624957e-05)
# The recorded p-value is far below 0.05, i.e. mean Quantity is not the same
# across all months.


######## One-way ANOVA per month: does Quantity differ between Categories?

# NOTE(review): scipy.stats is already imported as `stats` in the first cell;
# this second alias is kept only because later cells may reference `ss`.
import scipy.stats as ss

# groupby yields (month, sub-frame) pairs; unpack them instead of indexing the
# tuple with [0] / [1].
for month, month_orders in orders_anova.groupby('Month'):
    # One Quantity sample per Category within this month.
    samples = [
        quantities
        for _category, quantities in month_orders.groupby('Category')['Quantity']
    ]
    f_val, p_val = ss.f_oneway(*samples)
    # The "Month" label was missing from the format string, so each line
    # started with a bare ": 1, ..." instead of "Month: 1, ...".
    print('Month: {}, F value: {:.3f}, p value: {:.3f}'.format(month, f_val, p_val))

# Recorded output from the original run (every p-value is above 0.05):
#Month: 1, F value: 2.214, p value: 0.109
#Month: 2, F value: 1.911, p value: 0.148
#Month: 3, F value: 2.767, p value: 0.063
#Month: 4, F value: 0.364, p value: 0.695
#Month: 5, F value: 0.122, p value: 0.885
#Month: 6, F value: 0.303, p value: 0.738
#Month: 7, F value: 1.000, p value: 0.368
#Month: 8, F value: 0.173, p value: 0.841
#Month: 9, F value: 0.768, p value: 0.464
#Month: 10, F value: 0.869, p value: 0.420
#Month: 11, F value: 1.109, p value: 0.330
#Month: 12, F value: 0.485, p value: 0.616


### Post hoc: Tukey HSD between Categories, run separately for every month

from statsmodels.stats.multicomp import pairwise_tukeyhsd

for month_number, month_orders in orders_anova.groupby('Month'):
    # Pairwise comparison of Quantity across the Categories of this month.
    tukey_result = pairwise_tukeyhsd(
        month_orders['Quantity'],
        month_orders['Category'],
    )
    print('Month ' + str(month_number), tukey_result)

## Author's conclusion: no difference in Category Quantity sold per month



### Problem 3: Why did customers make returns?
- Your manager required you to give a brief report (**Plots + Interpretations**) on returned orders.

	1. How much profit did we lose due to returns each year?


	2. How many customers returned orders more than once? More than 5 times?


	3. Which regions are more likely to return orders?


	4. Which categories (sub-categories) of products are more likely to be returned?

- ***Hint:*** Merge the **Returns** dataframe with the **Orders** dataframe using `Order.ID`.
