In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


Memory Reduction

In [3]:
def reduce_memo(df):
    for col in df.columns:
        if df[col].dtype =="float64":
            df[col]=df[col].astype('float32')

        elif df[col].dtype=='int64':
            df[col]=df[col].astype('int8')

        elif df[col].dtype=='object':
            df[col]=df[col].astype('category')
    return df

LOAD THE DATA

In [4]:
departments=pd.read_csv("/content/drive/MyDrive/Dataset/departments.csv")
orders=pd.read_csv("/content/drive/MyDrive/Dataset/orders.csv")
order_product=pd.read_csv("/content/drive/MyDrive/Dataset/order_products__prior.csv")
product=pd.read_csv("/content/drive/MyDrive/Dataset/products.csv")
aisle=pd.read_csv("/content/drive/MyDrive/Dataset/aisles.csv")

In [5]:
orders= reduce_memo(orders)
departments=reduce_memo(departments)
order_product=reduce_memo(order_product)
product=reduce_memo(product)
aisle=reduce_memo(aisle)

Look at the Data

In [6]:
aisle

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation
...,...,...
129,-126,hot cereal pancake mixes
130,-125,dry pasta
131,-124,beauty
132,-123,muscles joints pain relief


In [7]:
order_product.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,96,1,1
1,2,57,2,1
2,2,111,3,0
3,2,94,4,1
4,2,83,5,0


We will join our data into one df

In [None]:
complete_df=pd.merge(order_product,product,how='inner',on='product_id')
complete_df=pd.merge(complete_df,departments,how='inner',on='department_id')
complete_df=pd.merge(complete_df,aisle,how='inner',on='aisle_id')
complete_df=pd.merge(complete_df,orders,how='inner',on='order_id')
complete_df.head()
sample_df=complete_df.sample(200000,random_state=42)

EDA


Look at data

In [None]:
complete_df.info()

Summary of Data

In [None]:
complete_df.describe()

Distrubution of days since prior order

In [None]:
plt.figure(figsize=(10,5))
sns.histplot(complete_df['days_since_prior_order'],bins=30,kde=False)
plt.title("Distrubution of days Since Prior Order")
plt.xlabel("Days Since Prior Order")
plt.ylabel("Count of Orders")
plt.show()

Top 30 Product Orderd

In [None]:
most_ordered=complete_df['product_name'].value_counts().head(30)
plt.figure(figsize=(12,6))
sns.barplot(x=most_ordered,y=most_ordered.index)
plt.xlabel("count of orders")
plt.title("Top 30 Product Ordred")

Day Of Week Plot

In [None]:
sns.histplot(complete_df['order_dow'],bins=7,kde=False)

Order Per department

In [None]:
top_departments=complete_df['department'].value_counts()
plt.figure(figsize=(12,6))
sns.barplot(x=top_departments,y=top_departments.index)
plt.xlabel("count of orders")
plt.title("ORDER PER DEPARTMENT")
plt.show()

Number of orders per users

In [None]:
user_order=complete_df.groupby('user_id')['order_number'].max()
plt.figure(figsize=(10,5))
sns.histplot(user_order,bins=30)
plt.title("Number of Orders Per User")
plt.show()


Reorderd Vs orderd


In [None]:
plt.figure(figsize=(6,6))
sns.countplot(x=complete_df['reordered'])
plt.title("Reorderd vs Not Reorderd")
plt.show()

The Top 20 aisle

In [None]:
top_aisle=complete_df['aisle'].value_counts().head(20)
plt.figure(figsize=(12,6))
sns.barplot(x=top_aisle,y=top_aisle.index)
plt.title("Top 20 aisle")
plt.xlabel("count (Number of products)")
plt.ylabel("aisle")
plt.show()

REorder VS Order number

In [None]:
order_ordernum=complete_df.groupby('order_number')['reordered'].mean().reset_index()
plt.figure(figsize=(10,5))
sns.histplot(data=order_ordernum,x='order_number',y='reordered')
plt.title("Reorder Rate VS Order Number")
plt.show()

Handle Numrical Features

In [None]:
num_cols=complete_df.select_dtypes(include=['int32','float32','int64','float64']).columns
num_cols

Handle Categorical Features

In [None]:
cat_cols=complete_df.select_dtypes(include=['object']).columns
cat_cols

Correlation For Numrical Featuers ("Heatmap")

In [None]:
plt.figure(figsize=(10,5))
correlation_matrix=complete_df[num_cols].corr()
sns.heatmap(correlation_matrix,annot=True,cmap='coolwarm')
plt.title("Numerical Feature Correlation")
plt.show()

Correlation For Numrical Featuers ("pairwise scatter")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(8,9))
sample_df=complete_df.sample(1000,random_state=42)
sns.pairplot(sample_df[num_cols])
plt.title("Pairplot of Numerical Features")
plt.show()

Time Of Day plot

In [None]:
whole_day=complete_df['order_hour_of_day'].value_counts().sort_index()
plt.figure(figsize=(12,6))
sns.barplot(x=whole_day.index,y=whole_day.values)
plt.title("Orders by Hour of Day")
plt.show()


Orders in the Whole Week

In [None]:
whole_week=complete_df['order_dow'].value_counts().sort_index()
plt.figure(figsize=(10,5))
sns.barplot(x=whole_week.index,y=whole_week.values)
plt.title("Orders by Day of Week")
plt.show()

In [None]:
# --- IGNORE ---

HANDLE MISSING VALUE

In [None]:
missing_value= complete_df.isnull().sum()
missing_value

In [None]:
missing_count=complete_df.isnull().sum()
missing_count=missing_count[missing_count>0].sort_values(ascending=False)
plt.figure(figsize=(12,4))
sns.barplot(x=missing_count.index,y=missing_count.values)
plt.xticks(rotation=90)
plt.title("Missing Value Per Column")
plt.show()

In [None]:
print ("..")

3. Cleaning & Imputation

Median Imputation

In [None]:
from sklearn.impute import SimpleImputer
median_df=complete_df.copy()
median_imputer=SimpleImputer(strategy='median')
median_df['days_since_prior_order']=median_imputer.fit_transform(median_df[['days_since_prior_order']])
median_df.isnull().sum()

 Median Visualize

In [None]:
median_df.isnull().sum()
plt.figure(figsize=(10,5))
sns.histplot(median_df['days_since_prior_order'], bins=30, kde=False)
plt.title("Histogram of Days Since Prior Order(Median Imputation)")
plt.xlabel("Days")
plt.ylabel("Frequency")
plt.show()

Most Frequent (Mode) Imputation

In [None]:
from sklearn.impute import SimpleImputer
freq_df=complete_df.copy()
freq_imputer = SimpleImputer(strategy='most_frequent')
freq_df['days_since_prior_order'] = freq_imputer.fit_transform(freq_df[['days_since_prior_order']])
freq_df.isnull().sum()

Most Frequent (Mode) Visualize

In [None]:
plt.figure(figsize=(10,5))
sns.histplot(freq_df['days_since_prior_order'], bins=30, kde=False)
plt.title("Histogram of Days Since Prior Order (Mode Imputation)")
plt.xlabel("Days")
plt.ylabel("Frequency")
plt.show()

Sentinel Imputation

In [None]:
from sklearn.impute import SimpleImputer
sentinel_df=complete_df.copy()
sentinel_vl=0
sentinel_df['days_since_prior_order']=sentinel_df['days_since_prior_order'].fillna(sentinel_vl)
sentinel_df.isnull().sum()

Sentinel Visualize

In [None]:
plt.figure(figsize=(10,5))
sns.histplot(sentinel_df['days_since_prior_order'], bins=30, kde=False)
plt.title("Histogram of Days Since Prior Order (Sentinel Value Imputation)")
plt.xlabel("Days")
plt.ylabel("Frequency")
plt.show()

Model-Based Imputation (KNN Imputer)

In [None]:
from sklearn.impute import KNNImputer
sample_knn = complete_df.sample(100000, random_state=42)
knn_imputer = KNNImputer(n_neighbors=5)
knn_sample_df = sample_knn.copy()
knn_sample_df[['days_since_prior_order']] = knn_imputer.fit_transform(knn_sample_df[['days_since_prior_order']])
knn_sample_df.isnull().sum()

Model Based Visualize

In [None]:
plt.figure(figsize=(10,5))
sns.histplot(knn_sample_df['days_since_prior_order'], bins=30, kde=False)
plt.title("Histogram of Days Since Prior Order (KNN Imputation)")
plt.xlabel("Days")
plt.ylabel("Frequency")
plt.show()

In [None]:
print("hkjkjhkj")

3-Outlier Detection & Treatment (Z-Score Method)

In [None]:
outlier_columns = []
for col in complete_df.select_dtypes(include=['int', 'float']):
    Q1 = complete_df[col].quantile(0.25)
    Q3 = complete_df[col].quantile(0.75)
    IQR = Q3 - Q1

    if ((complete_df[col] < (Q1 - 1.5 * IQR)) | (complete_df[col] > (Q3 + 1.5 * IQR))).any():
        outlier_columns.append(col)

print("Outlier columns:", outlier_columns)


In [None]:
import numpy as np
mean_val =complete_df[outlier_columns].mean()
std_val  = complete_df[outlier_columns].std()
z_score = (complete_df[outlier_columns]- mean_val) / std_val
z_score



Extracting Outliers

In [None]:
outlier_z = z_score[(z_score > 3) | (z_score < -3)]
print(outlier_z.shape)
len(outlier_z)

Boxplot Before Removing Outliers (Z-score) Visualize

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10,4))
sample_df=complete_df.sample(20000,random_state=42)
sns.boxplot(data=sample_df[outlier_columns])
plt.title("Boxplot Before Removing Outliers (Z-score)")
plt.show()


Histogram Before Removing Outliers (Z-score) Visualize

In [None]:
plt.figure(figsize=(10,4))
sns.histplot(data=sample_df[outlier_columns], bins=30, kde=False,palette="coolwarm")
plt.title("Histogram Before Removing Outliers (Z-score)")
plt.xlabel("Days")
plt.ylabel("Frequency")
plt.show()

Removing Outliers

In [None]:
clean_z_df =sample_df[(z_score <= 3) & (z_score >= -3)]
clean_z_df.shape

Boxplot After Removing Outliers

In [None]:
plt.figure(figsize=(10,4))
sns.boxplot(data=clean_z_df[outlier_columns])
plt.title("Boxplot After Removing Outliers (Z-score)")
plt.show()


Histogram After Removing Outliers

In [None]:
plt.figure(figsize=(10,5))
sns.histplot(data=clean_z_df[outlier_columns], bins=30, kde=False)
plt.title("Histogram After Removing Outliers (Z-score)")
plt.xlabel("Days")
plt.ylabel("Frequency")
plt.show()

4-Encoding Categorical Variables

One-Hot Encoding (for low-cardinality categories)

In [None]:
model_df=complete_df.copy()
model_df.info()


In [None]:
low_card= ['department', 'aisle','eval_set']
one_hot_df = pd.get_dummies(model_df,columns=low_card,drop_first=True)
one_hot_df = one_hot_df.replace({True: 1, False: 0})
one_hot_df.head()

Target / Mean Encoding (with smoothing + CV leakage protection) for high cardinality (product_id, user_id)

In [None]:
from sklearn.model_selection import KFold
df= model_df.copy()
target='reordered'#اللي بدنا نحسب متوسطه
global_mean=df[target].mean()
global_mean


In [None]:

from sklearn.model_selection import KFold

def target_simple_encod(df, target, cat_column, n_splits=5, alpha=10):
    encoded_col = pd.Series(index=df.index, dtype=float)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    global_mean = df[target].mean()

    for train_index, val_index in kf.split(df):
        train_data = df.iloc[train_index]
        val_data = df.iloc[val_index]

        category_means = train_data.groupby(cat_column)[target].mean()
        category_counts = train_data.groupby(cat_column)[target].count()

        smooth_means = (category_means * category_counts + global_mean * alpha) / (category_counts + alpha)

        encoded_col.iloc[val_index] = val_data[cat_column].map(smooth_means).fillna(global_mean)

    df[cat_column + "_encoded_smooth"] = encoded_col
    return df


target_columns = ["product_id", "user_id"]

for col in target_columns:
    model_df = target_simple_encod(model_df, target, col, n_splits=5, alpha=10)

model_df.head()


Frequency Encoding

In [None]:
for columns in target_columns:
    freq_encod= model_df[columns].value_counts()/len(model_df)
    model_df[columns + '_freq_encod']= model_df[columns].map(freq_encod)
model_df.head()


5. Feature Scaling

Feature Scaling("StandardScaler")

In [None]:
from sklearn.preprocessing import StandardScaler
standard_df=model_df.copy()
scaler=StandardScaler()
standard_df[num_cols]=scaler.fit_transform(model_df[num_cols])
standard_df.head()


Feature Scaling("MinMaxScaler")

In [None]:
from sklearn.preprocessing import MinMaxScaler
min_max_df=model_df.copy()
MMS=MinMaxScaler()
min_max_df[num_cols]=MMS.fit_transform(model_df[num_cols])
min_max_df.head()

In [None]:
model_df.head()

6- Feature Enginnering

-User Level Features

In [None]:
model_df.head()

In [None]:
user_orders = model_df.groupby("user_id")["order_id"].nunique() #جإجمالي عدد الطلبات الفريده
basket_size = model_df.groupby(["user_id", "order_id"]).size().groupby("user_id").mean() #حساب لمتوسط حجم السلة لكل مستخدم
user_reorder_ratio = model_df.groupby("user_id")["reordered"].mean() #ححساب نسبه اعادة طلب لهاد اليوزر
mean_days = model_df.groupby("user_id")["days_since_prior_order"].mean()#حسب متوسط الفاصل الزمني بين طلبات المستخدم
last_order = model_df.groupby("user_id")["order_number"].max() #جاب آخر رقم طلب لكل مستخدم

model_df["user_total_orders"] = model_df["user_id"].map(user_orders) #رإضافة عمود جديد لعدد الطلبات الفريدة لكل مستخدم
model_df["user_avg_basket"] = model_df["user_id"].map(basket_size) # لإضافة عمود جديد لحجم السلة (عدد العناصر في كل طلب)
model_df["user_reorder_ratio"] = model_df["user_id"].map(user_reorder_ratio) #حسب نسبة المنتجات المعاد طلبها لكل مستخدم
model_df["user_mean_days"] = model_df["user_id"].map(mean_days) #متوسط عدد الأيام بين طلبات المستخدم الواحد و رجّع المتوسط الخاص بكل صف
model_df["user_last_order_recency"] = model_df["user_id"].map(last_order)
model_df.head()


- Product-level features

In [None]:
product_reorder_rate = model_df.groupby("product_id")["reordered"].mean() # نسبة المرات اللي تم فيها إعادة شراء هذا المنتج من إجمالي مرات شرائه
avg_cart_position = model_df.groupby("product_id")["add_to_cart_order"].mean() #المتوسط العام لترتيب إضافة المنتج داخل السلة
product_popularity = model_df.groupby("product_id")["order_id"].count() #عدد المرات التي ظهر فيها المنتج في الطلبات.
#ذا يعطي عدد المرات التي تم فيها شراء المنتج من قبل أي مستخدم

model_df["product_reorder_rate"] = model_df["product_id"].map(product_reorder_rate) # نضيف المعدل لكل صف حتى يستخدمه المودل.
model_df["product_avg_cart_pos"] = model_df["product_id"].map(avg_cart_position) #نربط هذا السلوك بكل ظهور للمنتج
model_df["product_popularity"] = model_df["product_id"].map(product_popularity) #نضيف الشعبية لكل صف.



User×Product interaction features

In [None]:
user_prod_purchase_count = model_df.groupby(["user_id", "product_id"])['order_id'].count()
last_purchase_day = model_df.groupby(['user_id', 'product_id'])['days_since_prior_order'].max()
user_prod_reoder=model_df.groupby(['user_id', 'product_id'])['reordered'].mean()

model_df['user_prod_purchase_count'] = model_df.groupby(['user_id', 'product_id'])['order_id'].transform('count')
model_df['user_prod_last_purchase_day'] = model_df.groupby(['user_id', 'product_id'])['days_since_prior_order'].transform('max')
model_df['user_prod_reorder_rate'] = model_df.groupby(['user_id', 'product_id'])['reordered'].transform('mean')
model_df.head()

Temporal features

In [None]:
model_df['order_hour']=model_df['order_hour_of_day']#Hour

model_df['order_day']=model_df['order_dow']#Day

start_year=2024
days_per_year=365
model_df['order_year'] = start_year +model_df['days_since_prior_order']//days_per_year #Year
model_df['order_month'] = ((model_df['days_since_prior_order'] % days_per_year) // 30) + 1 #Month

def get_season(month):
    if month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Fall'
    else:
        return 'Winter'
model_df['order_season'] = model_df['order_month'].apply(get_season)#Season



#Holiday

In [None]:
model_df.columns

In [None]:
scale_cols = [
    'user_total_orders',
    'user_avg_basket',
    'user_reorder_ratio',
    'user_mean_days',
    'user_last_order_recency',
    'product_reorder_rate',
    'product_avg_cart_pos',
    'product_popularity',
    'user_prod_purchase_count',
    'user_prod_last_purchase_day',
    'user_prod_reorder_rate',
    'order_hour',
    'order_day',
    'order_month',

]


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

model_df.loc[:, scale_cols] = scaler.fit_transform(
    model_df[scale_cols]
)


In [None]:
model_df

Dimensionality & collinearity

In [None]:
num_cols = model_df.select_dtypes(include=['number']).columns
num_cols



In [None]:
corr_cols=num_cols = [
    'user_total_orders',
    'user_avg_basket',
    'user_reorder_ratio',
    'user_mean_days',
    'user_last_order_recency',
    'product_reorder_rate',
    'product_avg_cart_pos',
    'product_popularity',
    'user_prod_purchase_count',
    'user_prod_last_purchase_day',
    'user_prod_reorder_rate',
    'order_hour',
    'order_day',
    'order_month'
]
sample=model_df[corr_cols].sample(50000,random_state=42)
correlation=sample.corr()
correlation

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(correlation,annot=True,cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

VIF

In [None]:
!pip install statsmodels

In [None]:
model_df.isna().sum()

In [None]:
reduce_memo(model_df)

In [None]:
null_columns = model_df.columns[model_df.isna().any()].tolist()
null_columns


In [None]:
model_df.info()

In [None]:
from sklearn.impute import SimpleImputer

median_imputer = SimpleImputer(strategy='median')

model_df[null_columns] = median_imputer.fit_transform(
    model_df[null_columns]
)


In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_cols = [
    'user_total_orders',
    'user_avg_basket',
    'user_reorder_ratio',
    'user_mean_days',
    'user_last_order_recency',
    'product_reorder_rate',
    'product_avg_cart_pos',
    'product_popularity',
    'user_prod_purchase_count',
    'user_prod_last_purchase_day',
    'user_prod_reorder_rate',
    'order_hour',
    'order_day',
    'order_month'
]

X = model_df[vif_cols]

vif_df = pd.DataFrame()
vif_df['feature'] = X.columns
vif_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif_df
