### Importing libraries

In [1]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter, DayLocator

### Read the Dataset 

In [2]:
df_ = pd.read_csv("file_out2.csv")
df = df_.copy()
df.head()
#df.shape

Unnamed: 0.1,Unnamed: 0,InvoiceID,Date,ProductID,TotalSales,Discount,CustomerID,Quantity
0,0,328,2019-12-27,1684,796.610169,143.389831,185,4
1,1,329,2019-12-27,524,355.932203,64.067797,185,2
2,2,330,2019-12-27,192,901.694915,162.305085,230,4
3,3,330,2019-12-27,218,182.754237,32.895763,230,1
4,4,330,2019-12-27,247,780.101695,140.418305,230,4


### Feature Engineering

In [3]:
df.drop(["Unnamed: 0","TotalSales","Discount","CustomerID"], axis=1, inplace = True)
##set axis= 1 to remove columns , inplace means edit the original df

In [4]:
df.head()

Unnamed: 0,InvoiceID,Date,ProductID,Quantity
0,328,2019-12-27,1684,4
1,329,2019-12-27,524,2
2,330,2019-12-27,192,4
3,330,2019-12-27,218,1
4,330,2019-12-27,247,4


In [5]:
df.describe()

Unnamed: 0,InvoiceID,ProductID,Quantity
count,29103.0,29103.0,29103.0
mean,7221.321445,869.953819,5.440367
std,3443.397539,583.414204,6.804637
min,0.0,0.0,0.0
25%,4919.0,379.0,2.0
50%,7588.0,660.0,4.0
75%,9536.0,1456.0,4.0
max,14078.0,1939.0,250.0


In [6]:

df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)


### Data Cleaning

In [8]:
df.isnull().sum()

df['Date']=pd.to_datetime(df['Date'])
df.info()

KeyError: 'Date'

In [None]:
df.sort_values("Quantity") # here we found zero values for some transactions
df1=df
#delete each transaction with Quantity =0
df=df.drop( df1.query(" `Quantity`==0 ").index)
df.sort_values("Quantity")



In [None]:
def generate_product_features(PRODUCT_ID):
    specific_product_data = df.loc[df['ProductID']==PRODUCT_ID]

    specific_product_data['Day_of_Week'] = specific_product_data['Date'].dt.dayofweek
    specific_product_data['Month'] = specific_product_data['Date'].dt.month
    specific_product_data['Year'] = specific_product_data['Date'].dt.year
    specific_product_data['Day_of_Month'] = specific_product_data['Date'].dt.day
    specific_product_data['Week_Number'] = specific_product_data['Date'].dt.isocalendar().week
    # Assuming you want to predict daily orders/sales for the next month for the specific product
    specific_product_data['Next_Month'] = specific_product_data['Date'] + pd.DateOffset(months=1)  # Next month's date
    return specific_product_data


specific_product_data=generate_product_features(192)    


In [None]:
import pandas as pd

# Assuming 'df' is your DataFrame containing columns: InvoiceID, Date, ProductID, Quantity

# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Feature Engineering
df['Day_of_Week'] = df['Date'].dt.dayofweek
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Day_of_Month'] = df['Date'].dt.day
df['Week_Number'] = df['Date'].dt.isocalendar().week

# Assuming you want to predict item sales for the next month
df['Next_Month'] = df['Date'] + pd.DateOffset(months=1)  # Next month's date

# Adding lag features for daily sales
df['Prev_Day_Sales'] = df.groupby('ProductID')['Quantity'].shift(1)
df['Prev_Week_Sales'] = df.groupby('ProductID')['Quantity'].shift(7)
# Add other lag features as needed

# Calculate rolling statistics (e.g., 7-day rolling mean)
df['Rolling_7_Day_Avg'] = df.groupby('ProductID')['Quantity'].transform(lambda x: x.rolling(window=7).mean())

df.head(30)

### Data Splitting:

In [None]:
from sklearn.model_selection import train_test_split

# Assuming 'specific_product_data' contains the engineered features for the specific product
train_data, test_data = train_test_split(specific_product_data, test_size=0.2, random_state=42)


### Model Selection and Training:

In [None]:
from xgboost import XGBRegressor

# Define features and target variable
features = ['Day_of_Week', 'Month', 'Year', 'Day_of_Month', 'Week_Number']  # Adjust with your features
target = 'Next_Month'  # The column representing the target (e.g., sales/orders for next month)

# Train the model
model = XGBRegressor()
model.fit(train_data[features], train_data[target])


### Model Evaluation:

In [None]:
from sklearn.metrics import mean_squared_error

# Convert the target column (dates) to numeric format (timestamps) for comparison
test_data_timestamps = test_data[target].apply(lambda x: x.timestamp())

# Calculate evaluation metric (e.g., RMSE) using numeric values
rmse = mean_squared_error(test_data_timestamps, predictions, squared=False)
print(f"Root Mean Squared Error (RMSE): {rmse}")



### Top 10 frequently sold products

In [None]:
def ditribution_plot(x,y,name=None,xaxis=None,yaxis=None):
    fig = go.Figure([
        go.Bar(x=x, y=y)
    ])

    fig.update_layout(
        title_text=name,
        xaxis_title=xaxis,
        yaxis_title=yaxis
    )
    fig.show()

In [None]:
x = df['ProductID'].astype(str).value_counts()
x = x.sort_values(ascending=False) 
x = x[:10]

ditribution_plot(x=x.index, y=x.values, yaxis="Count", xaxis="ProductsID")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29103 entries, 0 to 29102
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  29103 non-null  int64  
 1   InvoiceID   29103 non-null  int64  
 2   Date        29103 non-null  object 
 3   ProductID   29103 non-null  int64  
 4   TotalSales  29103 non-null  float64
 5   Discount    29103 non-null  float64
 6   CustomerID  29103 non-null  int64  
 7   Quantity    29103 non-null  int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 1.8+ MB


In [None]:
df.loc[df['ProductID']==192]

#### Timeline of orders

In [None]:
def timeline_orders_for_aProduct(ProductId):
   productData=df.loc[df['ProductID']==ProductId]
   timeline_orders=productData.groupby('Date')['Quantity'].sum().reset_index()
   return timeline_orders
timeline_orders_for_aProduct(192)

Unnamed: 0,Date,Quantity
0,2019-01-02,12
1,2019-01-04,4
2,2019-01-12,4
3,2019-01-17,2
4,2019-01-23,1
...,...,...
465,2022-05-21,50
466,2022-05-23,12
467,2022-05-25,2
468,2022-05-27,20


In [None]:
def plot_product_timeline(timeline_orders):
    timeline_orders['Date'] = pd.to_datetime(timeline_orders['Date'])
    timeline_orders = timeline_orders.sort_values(by='Date')
    
    fig = px.line(timeline_orders, x='Date', y='Quantity', 
              labels={'Date': 'Date', 'Quantity': 'Total Orders'},
              title='Timeline of Orders')
    fig.update_xaxes(tickangle=45) 
    fig.show()


In [None]:
timeline_orders_for_192 =timeline_orders_for_aProduct(192)
plot_product_timeline(df)

In [None]:
timeline_orders = df.groupby('Date')['Quantity'].sum().reset_index()
plot_product_timeline(timeline_orders)

#### Timeline of weekly orders

In [None]:
df['Week'] = df['Date'].dt.strftime('%Y-%U')

# Group data by 'Week' and sum 'Quantity' for each week
weekly_orders = df.groupby('Week')['Quantity'].sum()

# Plotting the timeline for weekly orders
plt.figure(figsize=(12, 6))
weekly_orders.plot(kind='line', marker='o', color='b')
plt.title('Timeline of Weekly Orders')
plt.xlabel('Week')
plt.ylabel('Total Orders')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#### Train / Test split

### Transactions
###### Note: if a customer bought multiple products on same day, We will consider it one transaction

In [None]:

basket = (df 
          .groupby(['InvoiceID', 'ProductID'])['Quantity'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('InvoiceID'))
basket


### Apriori Algorithm

In [None]:
frq_items = apriori(basket.astype(bool), min_support=0.001, use_colnames=True)
frq_items.sort_values("support",ascending=False)

### Association rules

In [None]:
# Generate association rules
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
rules

### FPGrowth

In [None]:
import pandas as pd
# Load your dataset into a DataFrame (assuming it's stored in a CSV file)
df = pd.read_csv('file_out2.csv')

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth


# Group products by InvoiceID
transactions = df.groupby('InvoiceID')['ProductID'].apply(list).tolist()

# Initialize TransactionEncoder
encoder = TransactionEncoder()

# Transform the transaction data into a binary encoded DataFrame
transaction_matrix = encoder.fit(transactions).transform(transactions)

# Create a new DataFrame with binary values
binary_df = pd.DataFrame(transaction_matrix, columns=encoder.columns_)

# Set the minimum support threshold
min_support = 0.001  # Adjust this based on your dataset and requirements

# Generate frequent itemsets using FP-growth
frequent_itemsets = fpgrowth(binary_df, min_support=min_support, use_colnames=True)

# Print frequent itemsets
#print(frequent_itemsets)
from mlxtend.frequent_patterns import association_rules
# Set minimum confidence threshold for the association rules
min_confidence = 0.1  # You can adjust this based on your requirements

# Generate association rules from frequent itemsets
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
# Print the association rules
frequent_itemsets
rules


#