<a href="https://colab.research.google.com/github/Esther10203/python/blob/main/predict_customer's_orde.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Model to predict customer's order depending on previous Order**

**Import needed Libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, classification_report, log_loss
from datetime import datetime
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

**Data Processing Function**

In [2]:
# 1. Data Preprocessing
def preprocess_data(df):
    # Function to safely convert date strings to datetime
    def safe_date_parse(date_string):
        try:
            return pd.to_datetime(date_string)
        except:
            return None

    # Convert date_created to datetime, setting invalid dates to None
    df['date_created'] = df['date_created'].apply(safe_date_parse)

    # Sort the dataframe by date_created
    df = df.sort_values('date_created')

    # Forward fill the None values with the next valid date
    df['date_created'] = df['date_created'].fillna(method='ffill')

    # If there are still None values at the beginning, back fill them
    df['date_created'] = df['date_created'].fillna(method='bfill')

    # Extract features from date
    df['day_of_week'] = df['date_created'].dt.dayofweek
    df['month'] = df['date_created'].dt.month
    df['hour'] = df['date_created'].dt.hour

    # Handle missing values for other columns
    df = df.fillna({'order_notes': 'No notes', 'cooking_ref': 'Standard', 'addon': 'None', 'ingredients': 'Standard'})

    return df

**Removing missing variables**

In [3]:
# 2. Feature Engineering
def engineer_features(df):
    # Encode categorical variables
    cat_columns = ['client_id', 'item_id', 'merchant_id', 'size', 'cooking_ref', 'addon', 'ingredients']
    le = LabelEncoder()
    for col in cat_columns:
        df[col] = le.fit_transform(df[col].astype(str))

    # Create binary features from order_notes
    df['has_notes'] = (df['order_notes'] != 'No notes').astype(int)

    # Create price difference feature
    df['price_diff'] = df['normal_price'] - df['discounted_price']

    return df

**Read data from file**

In [4]:
# Load the data
df = pd.read_csv('data.csv')

# Preprocess and engineer features
df = preprocess_data(df)
df = engineer_features(df)

df

Unnamed: 0,id,order_id,client_id,item_id,merchant_id,date_created,item_name,order_notes,normal_price,discounted_price,...,qty,cooking_ref,addon,ingredients,non_taxable,day_of_week,month,hour,has_notes,price_diff
2,327976,146417,537,2457,120,2020-09-01,Tally Sandwich packet (10 pcs),No notes,1000.0,1000.0,...,1,0,0,0,1,1,9,0,0,0.0
1028,329002,146882,727,171,26,2020-09-01,Chilli red 1Kg,No notes,2400.0,2400.0,...,1,0,0,0,1,1,9,0,0,0.0
1027,329001,146882,727,650,26,2020-09-01,Garlic peeled 500mg,No notes,1800.0,1800.0,...,1,0,0,0,1,1,9,0,0,0.0
1026,329000,146882,727,150,26,2020-09-01,Spring onion(bunch),No notes,480.0,480.0,...,1,0,0,0,1,1,9,0,0,0.0
1025,328999,146882,727,577,26,2020-09-01,Dry Chilli 500mg,No notes,2400.0,2400.0,...,1,0,0,0,1,1,9,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9978,337952,150763,1814,948,65,2020-09-08,Eggplant (Piece),No notes,200.0,200.0,...,4,0,0,0,1,1,9,0,0,0.0
9971,337945,150761,1390,2644,2,2020-09-08,Coca -cola Zero,No notes,600.0,600.0,...,8,0,0,0,1,1,9,0,0,0.0
9999,337973,150769,1814,1455,84,2020-09-08,Iced arnold palmer,No notes,2700.0,2700.0,...,1,0,0,0,1,1,9,0,0,0.0
0,1095,622,1701,2212,0,2020-09-08,Fish and Chips,No notes,5500.0,5500.0,...,1,0,0,0,1,1,9,0,0,0.0


Specify feature and target values

In [5]:
# Prepare features and target
features = ['client_id', 'item_id', 'merchant_id', 'day_of_week', 'month', 'hour',
            'normal_price', 'discounted_price', 'size', 'qty', 'cooking_ref', 'addon',
            'ingredients', 'non_taxable', 'has_notes', 'price_diff']

X = df[features]
y = df['item_name']

# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Add scaler

In [6]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Random Forest model

In [None]:
# Model Selection and Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Accuracy: 0.74

**Decision Tree model**

It is very fast

In [8]:
# Model Selection and Training
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Accuracy: 0.78

**Gradient Boost model**

It takes a lot of memory compared to Random Forest

In [7]:
# Model Selection and Training
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train_scaled, y_train)

KeyboardInterrupt: 

**Model evaluation**

In [9]:
# Model Evaluation
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.78

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      0.50      0.67         2
           4       1.00      1.00      1.00         1
           6       0.33      1.00      0.50         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         1
          10       1.00      0.50      0.67         2
          11       0.50      0.50      0.50         2
          12       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          15       1.00      1.00      1.00         3
          17       0.50      1.00      0.67         1
          21       1.00      1.00      1.00         4
          22       1.00      1.00      1.00         1
          23       0.00      0.00      0.00         0
          24       0.86      1.00      0.9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Calculate MSE
# First, we need to one-hot encode our target variable
# onehot = OneHotEncoder(sparse=False)
# y_test_onehot = onehot.fit_transform(y_test.values.reshape(-1, 1))
# y_pred_proba = model.predict_proba(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
print(f"\nMean Squared Error: {mse:.4f}")




Mean Squared Error: 196003.8400


In [10]:
# Function to get the most frequent item for a client
def get_most_frequent_item(client_id, df):
    client_orders = df[df['client_id'] == client_id]['item_name']
    if client_orders.empty:
        return "No orders found for this client"
    item_counts = Counter(client_orders)
    most_common_item = item_counts.most_common(1)[0][0]
    return most_common_item


In [11]:

# Function to predict next order for a given client
def predict_next_order(client_id, model, scaler):
    # Get the latest order for the client
    latest_order = df[df['client_id'] == client_id].sort_values('date_created').iloc[-1]

    # Prepare the features for prediction
    features = ['client_id', 'item_id', 'merchant_id', 'day_of_week', 'month', 'hour',
                'normal_price', 'discounted_price', 'size', 'qty', 'cooking_ref', 'addon',
                'ingredients', 'non_taxable', 'has_notes', 'price_diff']

    input_data = latest_order[features].values.reshape(1, -1)
    input_data_scaled = scaler.transform(input_data)

    # Make prediction
    predicted_item = model.predict(input_data_scaled)[0]

    return predicted_item



In [12]:

# Updated function to predict next order and get most frequent item for a given client
def analyze_client_orders(client_id, model, scaler, df):
    # Get the latest order for the client
    client_df = df[df['client_id'] == client_id]
    if client_df.empty:
        return "No orders found for this client", None

    latest_order = client_df.sort_values('date_created').iloc[-1]

    # Prepare the features for prediction
    features = ['client_id', 'item_id', 'merchant_id', 'day_of_week', 'month', 'hour',
                'normal_price', 'discounted_price', 'size', 'qty', 'cooking_ref', 'addon',
                'ingredients', 'non_taxable', 'has_notes', 'price_diff']

    input_data = latest_order[features].values.reshape(1, -1)
    input_data_scaled = scaler.transform(input_data)

    # Make prediction
    predicted_item = model.predict(input_data_scaled)[0]

    # Get most frequent item
    most_frequent_item = get_most_frequent_item(client_id, df)

    return predicted_item, most_frequent_item


In [15]:

# Example usage
client_id_to_analyze = 10367
predicted_item, most_frequent_item = analyze_client_orders(client_id_to_analyze, model, scaler, df)
print(f"Analysis for client {client_id_to_analyze}:")
print(f"Predicted next order: {predicted_item}")
print(f"Most frequently ordered item: {most_frequent_item}")

# # Example usage
# client_id_to_predict = 1857
# predicted_item = predict_next_order(client_id_to_predict, model, scaler)
# print(f"Predicted next order for client {client_id_to_predict}: {predicted_item}")

Analysis for client 10367:
Predicted next order: No orders found for this client
Most frequently ordered item: None
