### Predictive Analytics for Inventory Management in E-commerce: A Big Data Approach

##### Load the Dataset

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame; the path is relative to the working directory.
file_path = "online_retail_II.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Show the first ten rows as the cell output.
data.head(n=10)


##### Data Cleaning

In [None]:
# Report how many missing values each column contains.
print(data.isna().sum())


In [None]:

# Discard every row that has no Customer ID (such rows are assumed not to be
# useful for this analysis); missing values in other columns are not checked here.
data = data.dropna(axis="index", subset=["Customer ID"])


In [None]:
# Drop rows that repeat an earlier row in every column, keeping the first occurrence.
data = data.drop_duplicates(subset=None, keep="first")


In [None]:
# Parse InvoiceDate into pandas datetimes so that the .dt accessor can be used
# on the column later in the notebook.
data["InvoiceDate"] = pd.to_datetime(arg=data["InvoiceDate"])


In [None]:
# Keep only the rows where both Quantity and Price are strictly positive.
data = data[data['Quantity'].gt(0) & data['Price'].gt(0)]

# Print the column dtypes and non-null counts of the cleaned frame.
data.info()

##### Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Summary statistics of the cleaned data; the quartiles listed here are the
# pandas defaults, spelled out explicitly.
data.describe(percentiles=[0.25, 0.5, 0.75])


In [None]:
# Histogram of Quantity (50 bins) with a kernel density estimate overlaid.
# histplot returns the Axes it drew on, so the title is set directly on it.
plt.figure(figsize=(12, 6))
sns.histplot(data['Quantity'], bins=50, kde=True).set_title('Quantity Distribution')
plt.show()

In [None]:
# Histogram of Price (50 bins) with a kernel density estimate overlaid.
plt.figure(figsize=(12, 6))
sns.histplot(data['Price'], kde=True, bins=50)
plt.gca().set_title('Price Distribution')
plt.show()

In [None]:
# Total Quantity per country, largest first, drawn as a bar chart.
sales_by_country = (
    data.groupby('Country')['Quantity']
    .sum()
    .sort_values(ascending=False)
)
plt.figure(figsize=(12, 8))
sales_by_country.plot.bar()
plt.title('Total Quantity Sold by Country')
plt.show()

In [None]:
# Sum of the Price column per country, largest first, drawn as a bar chart.
# NOTE(review): this adds up the Price values themselves, not Quantity * Price,
# so it is not a revenue figure - confirm that this is the intended metric.
sales_by_country = (
    data.groupby('Country')['Price']
    .sum()
    .sort_values(ascending=False)
)
plt.figure(figsize=(12, 8))
sales_by_country.plot.bar()
plt.title('Total Price by Country')
plt.show()

##### Feature Engineering

In [None]:
# Amount of each row: Quantity multiplied by Price.
data['TotalAmount'] = data['Quantity'].mul(data['Price'])

# Split the invoice timestamp into one column per calendar/time component.
for _column, _component in (
    ('InvoiceYear', 'year'),
    ('InvoiceMonth', 'month'),
    ('InvoiceDay', 'day'),
    ('InvoiceHour', 'hour'),
):
    data[_column] = getattr(data['InvoiceDate'].dt, _component)

data.head()


##### Split the Data

In [None]:
from sklearn.model_selection import train_test_split

# Predictors: the Quantity and Price columns plus the engineered invoice-date parts.
# NOTE(review): TotalAmount is computed as Quantity * Price in the feature
# engineering cell, so the target is an exact function of two of these
# predictors - confirm this is intended before reading much into model scores.
X = data[[
    'Quantity',
    'Price',
    'InvoiceYear',
    'InvoiceMonth',
    'InvoiceDay',
    'InvoiceHour',
]]
y = data['TotalAmount']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

X_train.shape, X_test.shape


##### Model Building

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression on the training split (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')


##### Model Building (Multiple Models)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Candidate regressors keyed by display name; every seeded estimator uses
# random_state=42 and both ensembles use 100 estimators.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}


##### Train and Evaluate Models

In [None]:
# Fit every candidate on the training split, score it on the test split, and
# collect the metrics per model name. The loop rebinds the notebook-level
# names `model`, `y_pred`, `mse` and `r2` on each iteration.
results = {}

for model_name, model in models.items():
    # fit() returns the estimator, so training and prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {'MSE': mse, 'R2': r2}

    # One report per model, closed by a 40-character rule.
    print(
        f'{model_name}:',
        f'  Mean Squared Error: {mse}',
        f'  R-squared: {r2}',
        '-' * 40,
        sep='\n',
    )


##### Cross-Validation for Model Selection

In [None]:
# 5-fold cross-validation of every candidate on the full X / y.
# The scorer is 'neg_mean_squared_error', so cv_results stores the mean of the
# *negated* MSE values; the sign is flipped only in the printed message.
cv_results = {}

for model_name, model in models.items():
    cv_scores = cross_val_score(
        estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=5
    )
    cv_results[model_name] = cv_scores.mean()
    print(f'{model_name} CV Mean Squared Error: {-cv_results[model_name]}')


### Basic Structure of the Tkinter App

In [None]:
import tkinter as tk
from tkinter import ttk
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load Data
def load_data():
    """Return a small hard-coded sample DataFrame (no file is read).

    Returns:
        A five-row DataFrame with the columns 'Quantity', 'Price' and
        'TotalAmount', where each TotalAmount equals Quantity * Price.
    """
    quantities = [10, 20, 30, 40, 50]
    prices = [5, 15, 25, 35, 45]
    return pd.DataFrame({
        'Quantity': quantities,
        'Price': prices,
        'TotalAmount': [q * p for q, p in zip(quantities, prices)],
    })

# Train Model
def train_model(data):
    """Fit a LinearRegression that maps Quantity and Price to TotalAmount.

    Args:
        data: DataFrame providing the 'Quantity', 'Price' and 'TotalAmount'
            columns.

    Returns:
        The fitted LinearRegression estimator.
    """
    features = data[['Quantity', 'Price']]
    target = data['TotalAmount']
    # fit() returns the estimator itself, so it can be returned directly.
    return LinearRegression().fit(features, target)

# Predict Function
def predict():
    """Button callback: predict the total amount for the values typed in the form.

    Reads the module-level `quantity_var` and `price_var`, runs the
    module-level `model` on them and writes the outcome to `result_label`.
    If either field cannot be parsed, an error message is shown in
    `result_label` instead of letting the ValueError escape the callback.
    """
    try:
        quantity = int(quantity_var.get())
        price = float(price_var.get())
    except ValueError:
        # Empty or non-numeric input: tell the user rather than failing silently.
        result_label.config(
            text="Please enter a whole number for Quantity and a number for Price."
        )
        return

    # The model is fitted on a DataFrame with these two column names, so the
    # input is passed with the same names to avoid scikit-learn's
    # "X does not have valid feature names" warning.
    features = pd.DataFrame([[quantity, price]], columns=['Quantity', 'Price'])
    prediction = model.predict(features)
    result_label.config(text=f"Predicted Total Amount: ${prediction[0]:.2f}")

# Create the main application window and set its title.
root = tk.Tk()
root.wm_title("Predictive Model App")

# Build the sample frame and fit the regression that predict() reads as `model`.
# NOTE: this rebinds the module-level names `data` and `model`.
data = load_data()
model = train_model(data)

# Create Input Fields
# grid() returns None, so each Entry is created first and placed in a separate
# statement; that way quantity_entry / price_entry hold the actual widgets.
ttk.Label(root, text="Quantity").grid(row=0, column=0)
quantity_var = tk.StringVar()
quantity_entry = ttk.Entry(root, textvariable=quantity_var)
quantity_entry.grid(row=0, column=1)

ttk.Label(root, text="Price").grid(row=1, column=0)
price_var = tk.StringVar()
price_entry = ttk.Entry(root, textvariable=price_var)
price_entry.grid(row=1, column=1)

# Button that runs predict() when clicked; it spans both grid columns on row 2.
predict_button = ttk.Button(master=root, text="Predict", command=predict)
predict_button.grid(column=0, row=2, columnspan=2)

# Label that predict() updates with its output; it spans both columns on row 3.
result_label = ttk.Label(master=root, text="Predicted Total Amount: ")
result_label.grid(column=0, row=3, columnspan=2)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()
