# Task (user story): the user provides a Customer ID and a date, and the program predicts the purchase quantity.

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Pipeline: load the retail transactions, fit a linear regression of Quantity
# on (CustomerID, day_of_week, month), score it on a hold-out split, and
# predict the quantity for one example customer/date.

# Columns fed to the model and the column it learns to predict.
features = ['CustomerID', 'day_of_week', 'month']
target = 'Quantity'


def _add_calendar_features(frame):
    """Return a new frame with day_of_week and month derived from InvoiceDate.

    Used for both training and prediction so the two code paths cannot drift.
    `InvoiceDate` must already be a datetime column.
    """
    invoice_dates = frame['InvoiceDate']
    return frame.assign(
        day_of_week=invoice_dates.dt.dayofweek,
        month=invoice_dates.dt.month,
    )


def predict_quantity(estimator, customer_id, invoice_date):
    """Predict the quantity for one customer on one date.

    Args:
        estimator: A fitted regressor trained on the `features` columns.
        customer_id: Integer customer identifier.
        invoice_date: Anything `pd.to_datetime` accepts, e.g. '2022-08-01'.

    Returns:
        A one-row DataFrame with CustomerID, InvoiceDate, day_of_week, month
        and predicted_quantity.
    """
    request = pd.DataFrame({'CustomerID': [customer_id], 'InvoiceDate': [invoice_date]})
    request['InvoiceDate'] = pd.to_datetime(request['InvoiceDate'])
    request = _add_calendar_features(request)
    return request.assign(predicted_quantity=estimator.predict(request[features]))


# Load the data
# NOTE(review): the path ends in .xlsx but is parsed as CSV. If the file is a
# real Excel workbook this call fails and pd.read_excel is needed instead;
# confirm the actual file format before changing it.
df = pd.read_csv("Online Retail.xlsx")

# Handle missing values: drop any row with a missing value in any column.
df = df.dropna()

# Clean the data: keep only the columns the model needs.
df = df.drop(columns=['InvoiceNo', "StockCode", "Description", "UnitPrice", "Country"])
df['CustomerID'] = df['CustomerID'].astype(int)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Keep quantities in [0, 1000]. The explicit copy makes the result an
# independent frame, so later column assignments do not hit
# SettingWithCopyWarning.
df = df[(df['Quantity'] >= 0) & (df['Quantity'] <= 1000)].copy()

# Feature engineering
df = _add_calendar_features(df)

# Split the data
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Model training
model = LinearRegression()
model.fit(train_data[features], train_data[target])

# Model evaluation: assign() builds a new frame rather than writing into the
# slice returned by train_test_split.
test_data = test_data.assign(predicted_quantity=model.predict(test_data[features]))
mae = mean_absolute_error(test_data[target], test_data['predicted_quantity'])
mse = mean_squared_error(test_data[target], test_data['predicted_quantity'])

# Model deployment: predict for one example customer and date.
new_data = predict_quantity(model, 12345, '2022-08-01')
print(new_data)

   CustomerID InvoiceDate  day_of_week  month  predicted_quantity
0       12345  2022-08-01            0      8           15.684307
