In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import plotly.express as px
from jupyterthemes import jtplot
import os
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)
jtplot.style()
jtplot.style(figsize=(25, 8.5),spines=False, gridlines='--',ticks=True)

In [None]:
orders = pd.read_csv('Orders.csv', index_col=0)
returns = pd.read_csv('Returns.csv', index_col=0)

# Part 1

In [None]:
orders = pd.merge(orders,returns,
                 left_on='Order.ID', 
                 right_on = 'Order ID',
                 how='left')

In [None]:
def tranformrow(r):
    if pd.isna(r["Order ID"]):
        r["Order ID"] = 0
        return r
    else:
        r["Order ID"] = 1
        return r
    
orders = orders.apply(tranformrow,axis=1)

In [None]:
orders['Order.Date'] = pd.to_datetime(orders['Order.Date'])
orders['year'] = orders['Order.Date'].dt.year
orders['month'] = orders['Order.Date'].dt.month
orders['Sales'] = orders['Sales'].replace('[\$,]', '', regex=True).astype(float)
orders['Profit'] = orders['Profit'].replace('[\$,]', '', regex=True).astype(float)

In [None]:
_ = orders.hist(figsize=(20,15))

In [None]:
#1.Is there any seasonal trend of inventory in the company?
orderseason = orders.groupby(['Order.Date'])['Quantity'].sum()
orderseason.plot(figsize=(20,10), linewidth=5, fontsize=20)
plt.xlabel('Year', fontsize=20) #orderseason.index.year);

In [None]:
#2. Is the seasonal trend the same for different categories?
color = {'b', 'g', 'r'}
orderseason = orders.groupby(['Order.Date','Category'])['Quantity'].sum()
orderseason = pd.DataFrame(orderseason)
orderseason.reset_index(inplace=True)
fig = px.line(orderseason, x='Order.Date', y='Quantity', color="Category")
fig.update_traces(mode='lines', marker_line_width=.5, opacity=.5)
fig.show()


In [None]:
#fig = px.line(orderseason, x='Order.Date', y='Quantity')
#fig.show()
ordermonth = orders.groupby(['month','Category'])['Quantity'].sum()
ordermonth = pd.DataFrame(ordermonth)
ordermonth.reset_index(inplace=True)
fig = px.line(ordermonth, x='month', y='Quantity', color="Category")
fig.update_traces(mode='lines', marker_line_width=.5, opacity=.5)
fig.show()


In [None]:
#How much profit did we lose due to returns each year?
#2. How many customer returned more than once? more than 5 times?
#3. Which regions are more likely to return orders?
#4. Which categories (sub-categories) of products are more likely to be returned?

In [None]:
#How much profit did we lose due to returns each year?
orders['year'] = orders['Order.Date'].dt.year
return_yr = orders[orders['Order ID'] == 1]
return_yr.groupby(['year'])['Profit'].sum()


In [None]:
#How many customer returned more than once? more than 5 times?
cust_ret = return_yr.groupby(['Customer.ID'])['Order ID'].sum()

cust_ret2 = cust_ret[cust_ret.values >=2]
cust_ret2
two = cust_ret2.count()

cust_ret5 = cust_ret[cust_ret.values >=5]
cust_ret5
five = cust_ret5.count()

print(two,five)

In [None]:
#3. Which regions are more likely to return orders?
regions = orders.groupby(['Region_x']).agg({'Order ID' : ['sum'], 'Order.ID' : ['count']})
regions.reset_index(inplace=True)


In [None]:
regions['percent_returns'] = regions[('Order ID','sum')]/regions[('Order.ID','count')] 

In [None]:
fig = px.bar(regions,x='Region_x', y='percent_returns', color = 'percent_returns')
fig.show()

In [None]:
#4. Which categories (sub-categories) of products are more likely to be returned?
subcats = orders.groupby(['Category','Sub.Category']).agg({'Order ID' : ['sum'], 'Order.ID' : ['count']})
subcats.reset_index(inplace=True)
subcats['percent_returns'] = subcats[('Order ID','sum')]/subcats[('Order.ID','count')] 

In [None]:
fig = px.bar(subcats,x='Sub.Category', y='percent_returns', color = 'percent_returns')
fig.show()

In [None]:
fig = px.bar(subcats,x='Category', y='percent_returns', color = 'percent_returns')
fig.show()

In [None]:
orders.to_csv('POrders.csv', index=False)

# Part II 

In [2]:
orders = pd.read_csv('POrders.csv')

In [3]:
orders.rename(columns={"Order ID": "Returned"}, inplace=True)

In [4]:
returned = orders[['Returned']]

#### Step 2:
- Your manager believes that **how long it took the order to ship** would affect whether the customer would return it or not. 
- He wants you to generate a feature which can measure how long it takes the company to process each order.
- ***Hint:*** Process.Time = Ship.Date - Order.Date

In [5]:
orders['Ship.Date'] = pd.to_datetime(orders['Ship.Date'])
orders['Order.Date'] = pd.to_datetime(orders['Order.Date'])
orders['ProcessTime'] = [int(i.days) for i in (orders['Ship.Date'] - orders['Order.Date'])]

- If a product has been returned before, it may be returned again. 
- Let us generate a feature indictes how many times the product has been returned before.
- If it never got returned, we just impute using 0.
- ***Hint:*** Group by different Product.ID

In [6]:
returnnum = orders.groupby(['Product.ID']).agg({'Returned' : ['sum']})
returnnum = pd.DataFrame(returnnum)
returnnum.columns = [' '.join(col).strip() for col in returnnum.columns.values]

In [7]:
returnnum.columns

Index(['Returned sum'], dtype='object')

In [8]:
orders = orders.join(returnnum,how= 'left', on='Product.ID')

- You can use any binary classification method you have learned so far.
- Use 80/20 training and test splits to build your model. 
- Double check the column types before you fit the model.
- Only include useful features. i.e all the `ID`s should be excluded from your training set.
- Note that there are only less than 5% of the orders have been returned, so you should consider using the [createDataPartition](https://www.rdocumentation.org/packages/caret/versions/6.0-80/topics/createDataPartition) function from `caret` package and [StratifiedKfold](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn-model-selection-stratifiedkfold) from sklearn when running cross-validation.
- Do forget to `set.seed()` before the spilt to make your result reproducible.
- **Note:** We are not looking for the best tuned model in the lab so don't spend too much time on grid search. Focus on model evaluation and the business use case of each model.

In [9]:
import random 
import math as m
random.seed(0)

In [10]:
orders.drop(columns=['Returned','Order.ID','Customer.Name','Postal.Code', 'Region_y', 'Customer.ID','Product.Name','City','Product.ID','State'],inplace=True)

In [11]:
dates = orders[['Order.Date','Ship.Date']]
orders['weekday'] = orders['Order.Date'].dt.dayofweek
orders['quarter'] = orders['Order.Date'].dt.quarter
#orders.head()

In [12]:
orders.drop(columns=['Order.Date','Ship.Date'],inplace=True)

In [131]:
orders.head()
orders.shape
#(51290, 19)
cols = orders.columns.values.tolist()

In [13]:
ohc_orders = pd.get_dummies(orders,drop_first=True)

In [14]:
ohc_orders.head()
ohc_orders.shape
#(51290, 1328)
#m.sqrt(51290)

(51290, 227)

In [97]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
scores = []
intercepts = []
coefs = []
prob = []
pred = []
losses = []
sdstep = []
classes = []
steps = range(100,10200,2000)
reg = LogisticRegression()
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(ohc_orders, returned)
StratifiedKFold(n_splits=5)


for step in steps:
    for train_index, test_index in skf.split(ohc_orders, returned):
        reg.set_params(solver = 'lbfgs', max_iter = step,  verbose=0)
        X_train, X_test = ohc_orders.iloc[train_index], ohc_orders.iloc[test_index]
        y_train, y_test = returned.iloc[train_index], returned.iloc[test_index]  
        reg.fit(X_train, y_train.values.ravel())
        scores.append(reg.score(X_test,y_test.values.ravel()))
        intercepts.append(reg.intercept_)
        coefs.append(reg.coef_)
        prob.append(reg.predict_log_proba(X_test))
        pred.append(reg.predict(X_test))
        losses.append(log_loss(returned.values.ravel(), reg.predict_proba(ohc_orders)))
        sdstep.append(step)
        classes.append(reg.classes_)


lbfgs failed to converge. Increase the number of iterations.


lbfgs failed to converge. Increase the number of iterations.


lbfgs failed to converge. Increase the number of iterations.


lbfgs failed to converge. Increase the number of iterations.


lbfgs failed to converge. Increase the number of iterations.


lbfgs failed to converge. Increase the number of iterations.


lbfgs failed to converge. Increase the number of iterations.


lbfgs failed to converge. Increase the number of iterations.



In [141]:
len(steps)

6

In [146]:
jtplot.style()
jtplot.style(figsize=(25, 8.5),spines=False, gridlines='--',ticks=True)
coefs2 = coefs
prob2 = prob
#pred
#losses
#sdstep

coefs3 = pd.DataFrame()
for c in coefs2:
    print("CO:",c[0])
    #coefs3 = coefs3.append(pd.DataFrame(c))

#coefs3.columns = cols
#df = pd.DataFrame(coefs2)

coefs4 = coefs3.transpose()
coefs4.head()

CO: [-5.14081365e-04 -1.50472318e-02  5.73902620e-02  1.07483403e-03
  1.43078409e-03 -1.75657431e-03  1.14987639e-02  4.17444522e-02
  2.97373390e-01 -3.12284240e-02  1.87742950e-02  1.37633997e-02
 -6.02358584e-02  7.96001408e-02 -5.14678389e-02 -9.03154267e-02
  7.28232331e-03 -3.71969321e-03 -2.04170910e-03 -1.24569960e-03
 -3.87935758e-04  9.03047827e-03 -5.21231847e-03  2.18452282e-04
 -8.05821337e-05 -5.08673396e-03 -2.64567915e-03  8.93975892e-03
 -1.30339762e-02 -3.27532520e-04  4.69602209e-03 -1.12691554e-04
 -2.07267460e-03 -2.16513734e-03 -7.93789570e-04  1.55407753e-02
  6.93244997e-03 -6.33734625e-04 -3.35233611e-04 -3.13831369e-03
 -8.23549455e-03 -1.05826622e-03 -5.99902119e-04 -2.09288227e-04
  2.68527256e-03  2.12150860e-02 -2.39853315e-02 -6.75068069e-04
 -2.00171202e-03 -3.40750076e-03 -9.12635825e-03 -7.84308144e-04
 -4.45670928e-03 -2.19838729e-02 -4.04324138e-03 -1.12285144e-03
 -2.36174232e-03  1.59094280e-04  1.20123205e-02 -1.03695291e-02
  8.42878867e-03 -2.8

### Problem 6: Evaluating Models
- What is the best metric to evaluate your model. Is accuracy good for this case?
- Now you have multiple models, which one would you pick? 
- Can you get any clue from the confusion matrix? What is the meaning of precision and recall in this case? Which one do you care the most? How will your model help the manager make decisions?
- **Note:** The last question is open-ended. Your answer could be completely different depending on your understanding of this business problem.

### Problem 7: Feature Engineering Revisit
- Is there anything wrong with the new feature we generated? How should we fix it?
- ***Hint***: For the real test set, we do not know it will get returned or not.