In [3]:
# %matplotlib inline
import pandas as pd
from fbprophet import Prophet
import numpy as np
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
plt.style.use('fivethirtyeight')

In [4]:
raw_data = pd.read_csv("Online_Retail.csv")
raw_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01/12/10 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01/12/10 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01/12/10 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01/12/10 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01/12/10 08:26,3.39,17850.0,United Kingdom


In [5]:
sales_data = raw_data.drop(['InvoiceNo','Description','CustomerID','Country','UnitPrice'], axis = 1)
sales_data['InvoiceDate'] = pd.to_datetime(sales_data['InvoiceDate'])
print(sales_data.info())
sales_data.index

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 3 columns):
StockCode      541909 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 12.4+ MB
None


RangeIndex(start=0, stop=541909, step=1)

In [6]:
stock_codes = list(sales_data.StockCode.unique())

In [7]:
def extract_product(product_id,freq):
    #pull out all transactions of one product and drop StockCode
    product = sales_data.loc[sales_data['StockCode'] == product_id].drop('StockCode',axis=1)
    product=product.set_index(['InvoiceDate']) #Convert to timeseries 

    #DownSampling to a day
    day_summary = pd.DataFrame()
    day_summary['Quantity'] = product.Quantity.resample(freq).sum()
    day_summary = day_summary.fillna(0) #replacing NaN with 0
    day_summary = day_summary.clip(0) #replace -ve values with 0

    day_summary = day_summary['2010-12-12':]#trim as almost no data available before this date
    day_summary['Quantity']+=1#ti avoid -inf while taking log transformation 
#     print(day_summary.describe())
    #plot
#     fig, ax1 = plt.subplots()
#     ax1.plot(day_summary.index, day_summary['Quantity'])
#     ax1.set_xlabel('Date')
#     ax1.set_ylabel('Quantity')
#     # ax1.set_ylim(0,100)
#     plt.show()
    return day_summary

In [99]:
def result_analysis(errors):
    errors = [value[1] for value in errors]
    None_count = errors.count(None)
    errors = [x for x in errors if x!=None]
    min_value = min(errors)
    max_value = max(errors)
    avg_value = sum(errors)/len(errors)
    print("min value = "+str(min_value))
    print("max value = "+str(max_value))
    print("avg value = "+str(avg_value))
    print("none count = "+str(None_count))
#     min_value = float('inf')
#     max_value = 0
#     None_count = 0
#     error_sum = 0
#     for pair in errors:
#         if pair[1] != None:
#             error_sum+=pair[1]
#             if pair[1]<min_value:
#                 min_value=pair[1]
#             if pair[1]>max_value:
#                 max_value=pair[1]
# #                 print(pair[0])
#         else:
#             None_count+=1
# #     print(max_value)
#     return ("min_value = "+str(min_value),"max_value = "+str(max_value),"avg value = "+str(error_sum/len(errors)),"number of none = "+str(None_count))

In [84]:
def get_relative_error(y,yhat):
#     error = 0
    relative_errors = [(x-y)/x for x,y in zip(y,yhat)]
#     print(relative_errors)
    root_mean_square_relative_error = np.sqrt(sum([x**2 for x in relative_errors])/len(relative_errors))
    return root_mean_square_relative_error

In [85]:
# get_relative_error(range(1,11),range(2,12))

In [96]:
errors = list()
count =1 
for product in stock_codes[:10]:
    print('\r', count, end='')
    count+=1
    freq = 'D'
    try:
        day_summary = extract_product(product,freq)
        # day_summary.head()
        day_summary.reset_index(inplace=True)
        day_summary = day_summary.rename(columns={'Quantity': 'y','InvoiceDate': 'ds'})
        train,test = day_summary[:int(len(day_summary)*0.9)],day_summary[int(len(day_summary)*0.9):]
#         print((len(train),len(test),len(day_summary)))
        my_model = Prophet(daily_seasonality=True,yearly_seasonality=True,weekly_seasonality=True)
        my_model.fit(train)
        future_dates = my_model.make_future_dataframe(periods=len(test),freq=freq)
        forecast = my_model.predict(future_dates)
        predicted = forecast[['ds', 'yhat']].tail(len(test))
#         temp = get_relative_error(test.y,predicted.yhat)
#         print(temp)
#         errors.append((product,temp))
        errors.append((product,get_relative_error(test.y,predicted.yhat)))
#         errors.append((product,np.sqrt(mean_squared_error(test.y,predicted.yhat))))

    except:
        errors.append((product,None))
#     print(np.sqrt(mean_squared_error(test.y,predicted.yhat)))
#     my_model.plot(forecast,uncertainty=True);
#     my_model.plot_components(forecast);
    
# result_analysis(errors)
with open("result.txt", 'w') as file:
    for item in errors:
        file.write("{}\n".format(item))


 10

In [97]:
errors

[('85123A', 729.09853469184554),
 ('71053', 2.7289113803710943),
 ('84406B', 1.6624599955888035),
 ('84029G', 10.885120913367652),
 ('84029E', 58.224569489278004),
 ('22752', 2.6688672058237937),
 ('21730', 2.655939136213501),
 ('22633', 24.59731118763515),
 ('22632', 12.569819580780807),
 ('84879', 78.99600508730812)]

In [98]:
result_analysis(errors)

('min_value = 1.66245999559',
 'max_value = 729.098534692',
 'avg value = 92.4087538668',
 'number of none = 0')