Importing the necessary library to build a machine learning model. 
Most of them are from sklearn. We will focus on:
1. Linear Regression
2. Polynomial Features
3. Decision Tree
4. Random Forest

In [1]:
# Import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from datetime import datetime
import time 

# import machine learning model libs
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# list of input files
for dirname, _, filenames in os.walk('input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

input/sampleSubmission.csv
input/merged_data.csv
input/testing_data.csv
input/weather.csv
input/train.csv
input/train_data.csv
input/key.csv


In [2]:
# Import the training data
train_data = pd.read_csv('input/train_data.csv')

Additional Data preprocessing
We create more features by split the value of each columns
From the label fo the train_data we can split them to
1. store number
2. item number
3. date
Then with the date value, we can add 3 more features:
1. day
2. month
3. year

In [3]:
# Add Store number, item number and date features
train_data[['store_nbr', 'item_nbr','date']] = train_data['label'].str.split('_',expand=True)

# Convert the date to format to day, month, year features
train_data[['year', 'month','day']] = train_data['date'].str.split('-',expand=True)

# Convert any non numerical value from the average temperature to numerical values
train_data['tavg'] = train_data['tavg'].replace(['M'], -1)
train_data['tavg'] = train_data['tavg'].replace(['  T'], -2)

In the following step, I created functions to aid in building the models.

In [4]:
# Build dataset
def build_dataset(train_data, features):
    XX = train_data[features]
    YY = train_data["units"]
    X_train, X_test, y_train, y_test = train_test_split(XX, YY, train_size=0.8,test_size=0.2, random_state=42, shuffle=True)
    return X_train, X_test, y_train, y_test

# Build Linear Regression
def build_LR(X_train, X_test, Y_train, Y_test):
  # Create Linear features
    model = LinearRegression()

    # Fit the model
    model.fit(X_train, Y_train)

    # predict the target variable using the trained model and new data
    y_pred = model.predict(X_test)
    
    return y_pred

# Build Random Forest Regressor
def build_PR(X_train, X_test, Y_train, Y_test):
  # Create polynomial features
    poly = PolynomialFeatures(degree=4)
    X_poly = poly.fit_transform(X_train)

    # Create linear regression object
    model = LinearRegression()

    # Fit the model
    model.fit(X_poly, Y_train)

    # predict the target variable using the trained model and new data
    X_new_poly = poly.transform(X_test)
    y_poly_pred = model.predict(X_new_poly)
    
    return y_poly_pred

# Build Linear Regression
def build_DT(X_train, X_test, Y_train, Y_test):
  # Create Decision Tree Classifier
    model = DecisionTreeClassifier()

    # Fit the model
    model.fit(X_train, Y_train)

    # predict the target variable using the trained model and new data
    y_pred = model.predict(X_test)
    
    return y_pred

# Build Random Forest Regressor
def build_RFR(X_train, X_test, Y_train, Y_test):
    # Instantiate the Random Forest Regressor with the desired parameters
    rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=50, random_state=42)

    # Train the Random Forest Regressor on the training data
    rf_regressor.fit(X_train, Y_train)

    # Make predictions on the test data
    y_pred = rf_regressor.predict(X_test)
    
    return y_pred

# Predict accuracy 
def predict_accuracy(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    accuracy = 100 * (1 - mae / np.mean(y_test))
    return accuracy

Here we define features that we would like to explore. After performing some trial and error, we conclude that it is impossible to build models which include the weather codes features. It is due to the lack of computational power. 

In [5]:
performance = {}
performance['Features'] = []
performance['Features'].append('station_nbr,store_nbr,item_nbr')
performance['Features'].append('station_nbr,store_nbr,item_nbr,day,month,year')
performance['Features'].append('station_nbr,store_nbr,item_nbr,day,month,year,dayNum,weekday,weekend')
performance['Features'].append('tavg,day,month,year')
performance['Features'].append('station_nbr,store_nbr,item_nbr,day,month,year,dayNum,weekday,weekend,tavg')

# We cannot use these features because they are computational expensive
# performance['Features'].append('station_nbr,store_nbr,item_nbr,day,month,year,dayNum,weekday,weekend,FC,FC+,FC,TS,GR,RA,DZ,SN,SG,GS,PL,IC,FG+,FG,BR,UP,HZ,FU,VA,DU,DS,PO,SA,SS,PY,SQ,DR,SH,FZ,MI,PR,BC,BL,VC')
# performance['Features'].append('day,month,year,dayNum,weekday,weekend,FC,FC+,FC,TS,GR,RA,DZ,SN,SG,GS,PL,IC,FG+,FG,BR,UP,HZ,FU,VA,DU,DS,PO,SA,SS,PY,SQ,DR,SH,FZ,MI,PR,BC,BL,VC')

performance['Linear Regression'] = []
performance['Linear Regression MSE'] = []
performance['Polynomial Regression'] = []
performance['Polynomial Regression MSE'] = []
performance['Decision Tree'] = []
performance['Decision Tree MSE'] = []
performance['Random Forest Regressor'] = []
performance['Random Forest Regressor MSE'] = []

model = {}
model['Linear Regression'] = build_LR
model['Polynomial Regression'] = build_PR
model['Decision Tree'] = build_DT
model['Random Forest Regressor'] = build_RFR

In this step, we built the each models base on different features.
We evaluated each models by analyse the mean square error. This is a popular metric used for regression tasks. It measures the average squared difference between the predicted values and the true values.

In [6]:
model_no = 4
feature_no = len(performance['Features'])
feature_lists = performance['Features']
for ii in range(feature_no):
    feature_list = feature_lists[ii].split(',')
    features = feature_list
    # features = ['station_nbr','dayNum','weekday','weekend','store_nbr', 'item_nbr','day','month','year']
    print(f"Features: {','.join(features)}")
    X_train, X_test, y_train, y_test = build_dataset(train_data, features)
    keys = list(model.keys())
    for i in range(model_no):
        start = time.time()
        model_name = keys[i]
        y_pred = model[model_name](X_train, X_test, y_train, y_test)
        accuracy_score = predict_accuracy(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        end = time.time()
        performance[model_name].append(round(accuracy_score,2))
        performance[model_name+" MSE"].append(round(mse,2))
        print(f"Accuracy score: {accuracy_score:.2f}% MSE: {mse:.2f} running time: {(end-start):.2f}s")

Features: station_nbr,store_nbr,item_nbr
Accuracy score: 19.60% MSE: 1908.05 running time: 0.05s
Accuracy score: 31.29% MSE: 1365.30 running time: 0.16s
Accuracy score: 62.16% MSE: 730.74 running time: 0.10s
Accuracy score: 65.11% MSE: 545.05 running time: 2.51s
Features: station_nbr,store_nbr,item_nbr,day,month,year
Accuracy score: 20.05% MSE: 1880.41 running time: 0.09s
Accuracy score: 32.40% MSE: 1308.70 running time: 0.87s
Accuracy score: 61.03% MSE: 704.66 running time: 0.85s
Accuracy score: 69.41% MSE: 619.39 running time: 13.53s
Features: station_nbr,store_nbr,item_nbr,day,month,year,dayNum,weekday,weekend
Accuracy score: 20.49% MSE: 1859.31 running time: 0.12s
Accuracy score: 33.14% MSE: 1278.60 running time: 3.74s
Accuracy score: 63.45% MSE: 645.26 running time: 1.04s
Accuracy score: 73.00% MSE: 349.84 running time: 17.47s
Features: tavg,day,month,year
Accuracy score: 15.58% MSE: 2001.52 running time: 0.05s
Accuracy score: 16.71% MSE: 1973.95 running time: 0.31s
Accuracy score

We printed out the score of each models and their measure squared error to further analyze.

In [8]:
perform_df = pd.DataFrame(performance)
perform_df.to_csv('performance.csv', index=False)

In [9]:
# import test data for submission
test_data = pd.read_csv('input/testing_data.csv', low_memory=False)

# Preprocess test data
test_data[['store_nbr', 'item_nbr','date']] = test_data['id'].str.split('_',expand=True)
test_data[['year','month','day']] = test_data['date'].str.split('-',expand=True)

feature_test = "station_nbr,store_nbr,item_nbr,day,month,year,dayNum,weekday,weekend"
feature_list = feature_test.split(',')
X_train = train_data[feature_list]
Y_train = train_data["units"]
X_test = test_data[feature_list]
Y_test = []

y_pred = build_RFR(X_train, X_test, Y_train, Y_test)

d = {'id': test_data['id'].values, 'units': y_pred}
output = pd.DataFrame(d)
output.to_csv('test_pred_RFR.csv', index=False)

It achieved a score of 1.77669 which is lower than before by 50%.