In [4]:
# Location of the raw CSV file; this cell only defines the path,
# the file itself is read in a later cell.
src_data = "../data/norway_new_car_sales_by_make.csv"

In [2]:
import pandas as pd

In [5]:
# Read the raw sales records from the CSV path into a DataFrame.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the CSV carries a saved index; passing index_col=0 would absorb it.
# Confirm nothing relies on that column before changing the call.
df_data = pd.read_csv(src_data)

In [6]:
# Preview the freshly loaded frame (pandas truncates the middle rows in the display).
df_data

Unnamed: 0,Year,Month,Make,Quantity,Pct
0,2007,1,Toyota,2884,22.7
1,2007,1,Volkswagen,2521,19.9
2,2007,1,Peugeot,1029,8.1
3,2007,1,Ford,870,6.9
4,2007,1,Volvo,693,5.5
...,...,...,...,...,...
4372,2017,1,Nilsson,3,0.0
4373,2017,1,Maserati,2,0.0
4374,2017,1,Ferrari,1,0.0
4375,2017,1,Smart,1,0.0


In [7]:
# Build a 'Period' text column of the form '<Year>-<Month>' (e.g. '2007-1').
# The month is not zero-padded at this stage.
df_data['Period'] = df_data['Year'].astype(str).str.cat(
    df_data['Month'].astype(str), sep='-'
)

In [8]:
# Preview the frame with the new 'Period' column appended.
df_data

Unnamed: 0,Year,Month,Make,Quantity,Pct,Period
0,2007,1,Toyota,2884,22.7,2007-1
1,2007,1,Volkswagen,2521,19.9,2007-1
2,2007,1,Peugeot,1029,8.1,2007-1
3,2007,1,Ford,870,6.9,2007-1
4,2007,1,Volvo,693,5.5,2007-1
...,...,...,...,...,...,...
4372,2017,1,Nilsson,3,0.0,2017-1
4373,2017,1,Maserati,2,0.0,2017-1
4374,2017,1,Ferrari,1,0.0,2017-1
4375,2017,1,Smart,1,0.0,2017-1


In [9]:
# Normalise 'Period' to a zero-padded 'YYYY-MM' string (e.g. '2007-1' -> '2007-01').
# The values are parsed as dates only as an intermediate step: after .dt.strftime
# the column holds strings again, not datetimes.
# The explicit format avoids per-element format guessing on the non-padded input
# and makes any malformed value fail loudly instead of being parsed loosely.
df_data['Period'] = pd.to_datetime(df_data['Period'], format='%Y-%m').dt.strftime('%Y-%m')

In [10]:
# Preview again to check that 'Period' is now zero-padded (e.g. '2007-01').
df_data

Unnamed: 0,Year,Month,Make,Quantity,Pct,Period
0,2007,1,Toyota,2884,22.7,2007-01
1,2007,1,Volkswagen,2521,19.9,2007-01
2,2007,1,Peugeot,1029,8.1,2007-01
3,2007,1,Ford,870,6.9,2007-01
4,2007,1,Volvo,693,5.5,2007-01
...,...,...,...,...,...,...
4372,2017,1,Nilsson,3,0.0,2017-01
4373,2017,1,Maserati,2,0.0,2017-01
4374,2017,1,Ferrari,1,0.0,2017-01
4375,2017,1,Smart,1,0.0,2017-01


In [11]:
# Reshape to one row per car maker and one column per period.
# Each cell is the summed Quantity for that maker/period pair;
# pairs with no records are filled with 0.
df_cleaned = df_data.pivot_table(
    values="Quantity",
    index="Make",
    columns="Period",
    aggfunc="sum",
    fill_value=0,
)

In [12]:
# Preview the pivoted table (makers on rows, periods on columns).
df_cleaned

Period,2007-01,2007-02,2007-03,2007-04,2007-05,2007-06,2007-07,2007-08,2007-09,2007-10,...,2016-04,2016-05,2016-06,2016-07,2016-08,2016-09,2016-10,2016-11,2016-12,2017-01
Make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alfa Romeo,16,9,21,20,17,21,14,12,15,10,...,3,1,2,1,6,15,3,4,3,6
Aston Martin,0,0,1,0,4,3,3,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Audi,599,498,682,556,630,498,562,590,393,554,...,685,540,551,687,794,688,603,645,827,565
BMW,352,335,365,360,431,477,403,348,271,562,...,1052,832,808,636,1031,1193,1096,1663,866,1540
Bentley,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Think,2,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Toyota,2884,1885,1833,1300,1866,1620,1901,1783,1303,1648,...,1432,1687,1603,1127,1824,1823,1327,1375,1238,1526
Volkswagen,2521,1517,1428,1257,1934,1531,1777,1665,1373,1941,...,3017,2222,2287,2076,2359,2084,2161,2106,2239,1688
Volvo,693,570,656,587,805,662,1064,498,662,1014,...,748,619,766,635,463,763,732,754,1235,1158


In [13]:
# Write the pivoted table to CSV. With the default to_csv arguments the index
# (the 'Make' labels) is written as the first column of the file.
out_file = '../data/cleaned_data.csv'
df_cleaned.to_csv(out_file)

In [None]:
# helper intended to split the data into training and test sets
def get_datasets(df, x_len=12, y_len=1, y_test_len=12):
    """Start of a train/test dataset builder (currently unfinished).

    As written, the function only reads the 2D value array from ``df``,
    takes its number of columns, and derives a loop count from it. It
    builds no datasets and has no ``return`` statement, so it yields ``None``.

    Parameters
    ----------
    df : object exposing ``.values`` as a 2D array
        Presumably the pivoted maker-by-period table; verify against caller.
    x_len : int, default 12
        Presumably the length of each input window; TODO confirm.
    y_len : int, default 1
        Presumably the length of each target window; TODO confirm.
    y_test_len : int, default 12
        Presumably the number of trailing periods held out for testing;
        TODO confirm.
    """
    # pull the underlying 2D array out of the frame
    values = df.values
    n_periods = values.shape[1]

    # number of iterations for building the training set
    loops = (n_periods + 1) - x_len - y_len - y_test_len
    

array([[  16,    9,   21, ...,    4,    3,    6],
       [   0,    0,    1, ...,    0,    0,    0],
       [ 599,  498,  682, ...,  645,  827,  565],
       ...,
       [2521, 1517, 1428, ..., 2106, 2239, 1688],
       [ 693,  570,  656, ...,  754, 1235, 1158],
       [   0,    0,    0, ...,    0,    0,    0]])