In [67]:
import pandas as pd


## IMPORTING THE DATA

In [68]:
# Using a raw string
data = pd.read_csv("C:\\Users\\Dell G3\\Documents\\PYTHON Projects\\Projects\\retailData\\file_out2.csv", index_col=0)

#5 first rows
print(data.head())

   InvoiceID        Date  ProductID  TotalSales    Discount  CustomerID  \
0        328  2019-12-27       1684  796.610169  143.389831         185   
1        329  2019-12-27        524  355.932203   64.067797         185   
2        330  2019-12-27        192  901.694915  162.305085         230   
3        330  2019-12-27        218  182.754237   32.895763         230   
4        330  2019-12-27        247  780.101695  140.418305         230   

   Quantity  
0         4  
1         2  
2         4  
3         1  
4         4  


## PREPROCESSING THE DATA

In [69]:
#Get info of the data
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 29103 entries, 0 to 29102
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   InvoiceID   29103 non-null  int64  
 1   Date        29103 non-null  object 
 2   ProductID   29103 non-null  int64  
 3   TotalSales  29103 non-null  float64
 4   Discount    29103 non-null  float64
 5   CustomerID  29103 non-null  int64  
 6   Quantity    29103 non-null  int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 1.8+ MB
None


In [70]:
#Describing the data
print(data.describe())

          InvoiceID     ProductID     TotalSales      Discount    CustomerID  \
count  29103.000000  29103.000000   29103.000000  29103.000000  29103.000000   
mean    7221.321445    869.953819    2552.679147    451.325294    247.290279   
std     3443.397539    583.414204    4568.256400    650.359457    138.701207   
min        0.000000      0.000000       0.000000     -0.004694      0.000000   
25%     4919.000000    379.000000     779.661017    137.288136    134.000000   
50%     7588.000000    660.000000    1366.101695    244.067797    230.000000   
75%     9536.000000   1456.000000    2847.457627    508.658644    349.000000   
max    14078.000000   1939.000000  332574.460000  14110.169492    506.000000   

           Quantity  
count  29103.000000  
mean       5.440367  
std        6.804637  
min        0.000000  
25%        2.000000  
50%        4.000000  
75%        4.000000  
max      250.000000  


In [71]:
#checking if there is missing values
print(data.isnull().sum())

InvoiceID     0
Date          0
ProductID     0
TotalSales    0
Discount      0
CustomerID    0
Quantity      0
dtype: int64


In [72]:
#Converting to Date format
data["Date"] = pd.to_datetime(data["Date"], errors = "coerce") 
data["Day"] = data["Date"].dt.day
data["Month"] = data["Date"].dt.month
data["Year"] = data["Date"].dt.year

print(data["Date"].dtype)


datetime64[ns]


In [73]:
#Most taken product
product_counts = data.value_counts("ProductID")
most_taken_product = product_counts.idxmax()
print(most_taken_product)

192


In [74]:
from sklearn.preprocessing import MinMaxScaler

#Selecting relevent features
features = data[["Day", "Month", "Year", "ProductID", "Discount", "CustomerID", "Quantity"]]
target = data["TotalSales"]

#Scaling features
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)
scaled_target = scaler.fit_transform(target.values.reshape(-1, 1)).flatten()


## TIME SERIES FORMATTING

In [75]:
print(data.sort_values("Date"))

       InvoiceID       Date  ProductID   TotalSales     Discount  CustomerID  \
3605        2661 2019-01-02        901   621.533898   111.876102         189   
3624        2673 2019-01-02       1128   796.610169   143.389831         499   
3623        2673 2019-01-02        883   830.508475   149.491525         499   
3622        2673 2019-01-02        885   762.711864   137.288136         499   
3621        2672 2019-01-02        823   610.169492   109.830508          66   
...          ...        ...        ...          ...          ...         ...   
28678      14025 2023-03-25       1609  2542.372881   457.627119         290   
28677      14024 2023-03-25       1510  5374.271186   967.368814          21   
28676      14023 2023-03-25        627  3073.728814   553.271186         230   
28696      11573 2023-03-25       1443  2906.779661   523.220339         429   
28686      11563 2023-03-25        210  6561.016949  1180.983051         250   

       Quantity  Day  Month  Year  
360

In [76]:
import numpy as np
from sklearn.model_selection import train_test_split

#Create sequences for LSTM
sequence_length = 10
sequences = []
targets = []

for i in range(len(scaled_features) - sequence_length):
    sequences.append(scaled_features[i:i+sequence_length])
    targets.append(scaled_target[i + sequence_length])
    
X = np.array(sequences)
y = np.array(targets)  

#Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

## MODEL ARCHITECTURE

In [77]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Build LSTM model
model = Sequential()
model.add(LSTM(units=50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=1))  # Assuming you want to predict a single value (total sales)

model.compile(optimizer='adam', loss='mse')  # Use Mean Squared Error for regression tasks

## TRAINING THE MODEL

In [78]:
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Make predictions
predicted_sales = model.predict(X_test)

# Inverse transform predictions to original scale
predicted_sales_original_scale = scaler.inverse_transform(np.reshape(predicted_sales, (predicted_sales.shape[0], 1)))

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## EVALUATING THE MODEL

In [79]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, predicted_sales)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.00010724634449639545
