In [2]:
# for presentation purposes
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# visualize 
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from matplotlib.dates import DateFormatter
import seaborn as sns


# working with dates
from datetime import datetime

# to evaluate performance using rmse
from sklearn.metrics import mean_squared_error
from math import sqrt 

# for tsa 
import statsmodels.api as sm

# holt's linear trend model. 
from statsmodels.tsa.api import Holt

In [3]:
# Load the SaaS invoice records from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='saas.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Month_Invoiced,Customer_Id,Invoice_Id,Subscription_Type,Amount
0,2014-01-31,1000000,5000000,0.0,0.0
1,2014-01-31,1000001,5000001,2.0,10.0
2,2014-01-31,1000002,5000002,0.0,0.0
3,2014-01-31,1000003,5000003,0.0,0.0
4,2014-01-31,1000004,5000004,1.0,5.0


In [None]:
# Normalize every column label to lowercase
df.columns = list(map(str.lower, df.columns))
# Convert the invoice month column to datetime values
df['month_invoiced'] = pd.to_datetime(df['month_invoiced'])

In [None]:
# Cast the subscription type column to integers
df['subscription_type'] = df['subscription_type'].astype(int)

In [None]:
# Cast the invoiced amount column to integers
df['amount'] = df['amount'].astype(int)

In [None]:
# Index the frame by invoice month so time-based operations (e.g. resample) work
df = df.set_index(keys='month_invoiced')

In [None]:
# Count the rows that fall in each calendar month and plot the counts
ax = df.resample('M').size().plot()
ax.set_ylabel('# of Observations')
ax.set_title('How many observations are there each month?')

In [None]:
# Plot the total invoiced amount per calendar month.
ax = df.resample('M').amount.sum().plot()
# Matplotlib passes tick positions to the formatter as floats, so the format
# spec pins zero decimals to render "$50,000" instead of "$50,000.0".
ax.yaxis.set_major_formatter(StrMethodFormatter('${x:,.0f}'))
ax.set(title='Revenue Over time', ylabel='Revenue', xlabel='')

In [None]:
# Collapse the data to one row per month holding the summed amount;
# only the amount column is kept.
df = df[['amount']].resample('M').sum()

In [None]:
# Half of the row count, rounded down
len(df) // 2

In [None]:
# train gets 50% of the rows (truncated to a whole number)
train_size = int(0.5 * len(df))
train_size

In [None]:
# validate gets 30% of the rows (truncated to a whole number)
validate_size = int(0.3 * len(df))
validate_size

In [None]:
# test gets whatever rows are left after train and validate
test_size = len(df) - (train_size + validate_size)
test_size

In [None]:
# Sanity check: the three split sizes account for every row
train_size + validate_size + test_size == len(df)

In [None]:
# Position just past the last validate row (train rows come first)
validate_end_index = validate_size + train_size
validate_end_index

In [None]:
# train: the first train_size rows (positions 0 .. train_size - 1)
train = df.iloc[:train_size]

In [None]:
# validate: positions train_size .. validate_end_index - 1
validate = df.iloc[train_size:validate_end_index]

In [None]:
# test: everything from position validate_end_index to the end
test = df.iloc[validate_end_index:]

In [None]:
# Row count of each split. len() counts rows, whereas DataFrame.size counts
# rows * columns and only matches the row count for a single-column frame.
len(train), len(validate), len(test)

In [None]:
# Do the lengths of train, validate and test add up to the length of the full dataframe?
len(df) == len(train) + len(validate) + len(test)

In [None]:
# Compare the first row of the full frame with the first row of train, element-wise
print(df.iloc[:1] == train.iloc[:1])