In [1]:
import pandas as pd
import numpy as np

In [37]:
def import_data():
    data = pd.read_csv('https://supchains.com/wp-content/uploads/2021/07/norway_new_car_sales_by_make1.csv') #import data
    data['Period'] = data['Year'].astype(str) + '-' + data['Month'].astype(str).str.zfill(2) #create period column and account for padded zeros
    df = pd.pivot_table(data = data, values='Quantity', index = 'Make', columns = 'Period', aggfunc = 'sum', fill_value=0)
    return df


In [38]:
def datasets(df, x_len = 12, y_len = 1, test_loops = 12):
    D = df.values
    rows,periods = df.shape
    
    #training set creation
    loops = periods + 1 - x_len - y_len
    train = []
    for col in range(loops):
        train.append(D[:,col:col+x_len+y_len])
    train = np.vstack(train)
    X_train, Y_train = np.split(train, [-y_len], axis = 1)

    #test set creation
    if test_loops > 0:
        X_train, X_test = np.split(X_train, [-rows*test_loops], axis = 0)
        Y_train, Y_test = np.split(Y_train, [-rows*test_loops], axis = 0)
    else: #no test set: X_test is used to generate the future forecast
        X_test = D[:,-x_len:]
        Y_test = np.full((X_test.shape[0],y_len), np.nan) #dummy value

    #formatting required for scikit-learn
    if y_len == 1:
        Y_train = Y_train.ravel()
        Y_test = Y_test.ravel()

    return(X_train, Y_train, X_test, Y_test)
    

In [51]:
def kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_pred, name=''):
    df = pd.DataFrame(columns = ['MAE', 'RMSE', 'Bias'], index = ['Train', 'Test'])
    df.index.name = name

    df.loc['Train','MAE'] = 100*np.mean(abs(Y_train - Y_train_pred))/np.mean(Y_train)

    df.loc['Train','RMSE'] = 100*np.sqrt(np.mean((Y_train - Y_train_pred)**2))/np.mean(Y_train)

    df.loc['Train','Bias'] = 100*np.mean(abs(Y_train - Y_train_pred))/np.mean(Y_train)

    df.loc['Test','MAE'] = 100*np.mean(abs(Y_test - Y_test_pred))/np.mean(Y_test)

    df.loc['Test','RMSE'] = 100*np.sqrt(np.mean((Y_test - Y_test_pred)**2))/np.mean(Y_test)

    df.loc['Test','Bias'] = 100*np.mean(abs(Y_test - Y_test_pred))/np.mean(Y_test)

    df = df.astype(float).round(1)
    print(df)

In [52]:
df = import_data()
X_train, Y_train, X_test, Y_test = datasets(df, x_len=12, y_len=1, test_loops=12)

In [53]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression() #create linear regression object
reg = reg.fit(X_train, Y_train) #fit it to training data

#create two predictions for the training and test sets
Y_train_pred = reg.predict(X_train)
Y_test_pred = reg.predict(X_test)


In [54]:
kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_pred, name='Regression')

             MAE  RMSE  Bias
Regression                  
Train       17.8  43.9  17.8
Test        17.8  43.7  17.8


In [56]:
X_train, Y_train, X_test, Y_test = datasets(df, x_len=12, y_len=1, test_loops =0)
reg = LinearRegression()
reg = reg.fit(X_train, Y_train)
forecast = pd.DataFrame(data = reg.predict(X_test), index = df.index)
forecast.head() 

Unnamed: 0_level_0,0
Make,Unnamed: 1_level_1
Alfa Romeo,6.187217
Aston Martin,1.032483
Audi,646.568622
BMW,1265.032834
Bentley,1.218092


In [57]:
#hello