In [1]:
# import important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

In [2]:
# Read the wrangled transactions CSV into a DataFrame and preview the first rows.
transactions_csv = 'data/wrangled_transactions.csv'
df_retail = pd.read_csv(transactions_csv)
df_retail.head()

Unnamed: 0,customer_id,revenue_2019,customer_relation_days,last_purchase_days,number_of_purchases,avg_order_cost,revenue_2020
0,12747,706.27,26,18,2,353.135,3489.74
1,12748,4228.13,30,8,14,302.009286,29491.6
2,12826,155.0,22,22,1,155.0,1319.72
3,12829,85.75,17,17,1,85.75,207.25
4,12838,390.79,30,30,1,390.79,292.34


### Model Building

In [3]:
# Five predictor columns; the target is the 2020 revenue column.
feature_columns = [
    'revenue_2019',
    'customer_relation_days',
    'last_purchase_days',
    'number_of_purchases',
    'avg_order_cost',
]
X = df_retail[feature_columns]
y = df_retail['revenue_2020']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [4]:
# Create a linear regression estimator and fit it on the training split.
# The fit call stays the last expression so the notebook displays the estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [5]:
# Fitted coefficients of the linear model: one value per feature, in the
# column order of the frame passed to model.fit
model.coef_

array([-1.07823552e+00, -9.50680681e+01,  2.11229946e+02,  3.83052255e+03,
        1.69250468e+01])

### Evaluation 1: Prediction for a single customer

In [6]:
# Predict 2020 revenue for one hand-written customer profile.
# The keys are the five feature names the model was fitted on.
customer_profile = {
    'revenue_2019': [5000],
    'customer_relation_days': [200],
    'last_purchase_days': [10],
    'number_of_purchases': [20],
    'avg_order_cost': [1],
}
test_customer = pd.DataFrame(customer_profile)

model.predict(test_customer)

array([45017.79104095])

### Evaluation 2: Actual vs. predicted revenue on the test set

In [7]:
# Predict 2020 revenue for the held-out test rows.
y_pred = model.predict(X_test)

# Pair every actual value with its prediction. `.array` is used so the actual
# values are aligned with y_pred by position rather than by y_test's row labels.
predictions = pd.DataFrame({'y_test': y_test.array, 'y_pred': y_pred})
# Only rows with an actual revenue below 10000 are kept for the chart
# (presumably so a few very large customers do not flatten the plot).
predictions = predictions[predictions['y_test'] < 10000]

# Plot actual and predicted revenue against the row index.
# The index is turned into an explicit 'Index' column and referenced by name,
# so the x-axis title no longer depends on how plotly names an index object
# passed as `x` (the former labels={'x': ...} entry did not reliably match it).
fig = px.scatter(predictions.rename_axis('Index').reset_index(),
                 x='Index', y=['y_test', 'y_pred'],
                 color_discrete_sequence=['blue', 'red'],
                 labels={'value': 'Revenue'},
                 title='Actual vs. Predicted Revenue')
fig.show()

In [8]:
# Correlation matrix between predicted and actual test-set revenue; the
# off-diagonal entries hold the correlation coefficient of the two series.
test_predictions = model.predict(X_test)
np.corrcoef(test_predictions, y_test)

array([[1.        , 0.67411707],
       [0.67411707, 1.        ]])

### Evaluation 3: Root mean squared error

In [9]:
# Root mean squared error of the current model's predictions on the test split.
rmse = root_mean_squared_error(y_test, model.predict(X_test))
print("Root mean squared error: %.2f" % rmse)

Root mean squared error: 6301.33


# Model with positively correlated features

In [10]:
# Refit the regression using only two predictors: 2019 revenue and the
# number of purchases. The target is unchanged.
X = df_retail[['revenue_2019', 'number_of_purchases']]
y = df_retail['revenue_2020']

# Same 80/20 split parameters and seed as the first model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

# Build the linear regression model and train it on the reduced feature set.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict 2020 revenue for the held-out test rows.
y_pred = model.predict(X_test)

# Pair every actual value with its prediction. `.array` is used so the actual
# values are aligned with y_pred by position rather than by y_test's row labels.
predictions = pd.DataFrame({'y_test': y_test.array, 'y_pred': y_pred})
# Only rows with an actual revenue below 10000 are kept for the chart
# (presumably so a few very large customers do not flatten the plot).
predictions = predictions[predictions['y_test'] < 10000]

# Plot actual and predicted revenue against the row index.
# The index is turned into an explicit 'Index' column and referenced by name,
# so the x-axis title no longer depends on how plotly names an index object
# passed as `x` (the former labels={'x': ...} entry did not reliably match it).
fig = px.scatter(predictions.rename_axis('Index').reset_index(),
                 x='Index', y=['y_test', 'y_pred'],
                 color_discrete_sequence=['blue', 'red'],
                 labels={'value': 'Revenue'},
                 title='Actual vs. Predicted Revenue')
fig.show()

In [11]:
# Root mean squared error between the test targets and the stored predictions.
rmse = root_mean_squared_error(y_test, y_pred)
print("Root mean squared error: %.2f" % rmse)

Root mean squared error: 5139.34
