In [1]:
# import pkgs
import pandas as pd
import glob
import locale
import numpy as np
from datetime import date



In [2]:
import locale
locale.setlocale(locale.LC_ALL, 'fa_IR.UTF-8')


'fa_IR.UTF-8'

In [2]:
# Functions

def quantile_score(vec, score):
    scorevec = np.zeros(len(vec))
    qu = np.quantile(vec, np.linspace(0, 1, score + 1))
    scorevec[(vec <= qu[1]) & (vec >= qu[0])] = 1
    for i in range(1, score - 1):
        scorevec[(vec <= qu[i + 1]) & (vec > qu[i])] = i + 1
    scorevec[vec > qu[score]] = score
    return scorevec

def generate_dates(years, months, days30, days31):
    dates = []
    for month in months:
        if month in Months:
            dates.append(f"{years}{month}{days31}")
        else:
            dates.append(f"{years}{month}{days30}")
    return dates


In [3]:
# Parameters

start_date_jalali = 14030601
target_date_jalali = 14031130
start_date = date(2024, 8, 22)  # Start train
target_date = date(2025, 2, 18)  # End train
moduleSelected = "Onlineshopping"


In [4]:
# Read Data
# Folder containing Parquet files
folder_path = "Data/data/*.parquet"

# List all parquet files
parquet_files = glob.glob(folder_path)

# Read and concatenate all files
df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)

DimDate = pd.read_csv("Data/data/DimDate.csv")


In [5]:
# Date Manipulations and Preperations
DimDateS = DimDate[["Miladi3", "Jalali_1"]]
# Convert Miladi column to string
DimDateS["Miladi3"] = DimDateS["Miladi3"].astype(str)
DimDateS.rename(columns={"Miladi3": "date_CHR"}, inplace=True)


df['date_CHR'] = df['date'].astype(str)

# Left join on Miladi_Num
df = df.merge(DimDateS, on="date_CHR", how="left")

# Create Shamsi_Date and convert to numeric
df["Shamsi_Date_Num"] = (df["Jalali_1"].str[:4] + df["Jalali_1"].str[5:7] + df["Jalali_1"].str[8:10]).astype(int)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DimDateS["Miladi3"] = DimDateS["Miladi3"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DimDateS.rename(columns={"Miladi3": "date_CHR"}, inplace=True)


In [6]:
# Filter & Select Trains
# Assuming df is already a pandas DataFrame
TrainRFM = df[df["module"] == moduleSelected]

TrainRFM = TrainRFM[TrainRFM["payment_status"] == "payed"]
TrainRFM = TrainRFM[TrainRFM["status"] == "finished"]

print(TrainRFM.columns)
print(TrainRFM.shape)

# select data for modeling
TrainRFM = TrainRFM[['user_id', '_id', 'initial_total', 'module','Shamsi_Date_Num', 'date']]

# print(TrainRFM.head(5))

Index(['_id', 'user_id', 'date', 'client_type', 'status', 'payment_status',
       'type', 'module', 'product_name', 'initial_total', 'product_price',
       'converted_value', 'qty', 'module_unit_price', 'Amount_in_dollars',
       'first_purchased', 'date_CHR', 'Jalali_1', 'Shamsi_Date_Num'],
      dtype='object')
(503726, 19)


In [7]:
# make R, F, M, L

# !pip install jdatetime
import jdatetime
import datetime
from datetime import date
# from datetime import datetime

# Group by user_id and calculate min and max dates
TrainRFM_RB = TrainRFM.groupby("user_id", as_index=False).agg(
    minDate=("Shamsi_Date_Num", "min"),
    maxDate=("Shamsi_Date_Num", "max")
)



def jalali_to_miladi(jalali_date):
    # Extract year, month, day from the integer Jalali date
    year = jalali_date // 10000
    month = (jalali_date % 10000) // 100
    day = jalali_date % 100

    # Convert to Gregorian using jdatetime
    gregorian_date = jdatetime.date(year, month, day).togregorian()

    # Return formatted Gregorian date
    return gregorian_date




TrainRFM_RB['minDate_Miladi'] = TrainRFM_RB['minDate'].apply(jalali_to_miladi)
TrainRFM_RB['maxDate_Miladi'] = TrainRFM_RB['maxDate'].apply(jalali_to_miladi)

# TrainRFM_RB['maxDate_Miladi'] = TrainRFM_RB['maxDate_Miladi'].apply(
#     lambda x: datetime.strptime(x, '%Y-%m-%d').date()
# )

TrainRFM_RB = TrainRFM_RB[
    (TrainRFM_RB['maxDate'] >= start_date_jalali) & (TrainRFM_RB['maxDate'] <= target_date_jalali)]

TrainRFM_RB['R'] = TrainRFM_RB['maxDate_Miladi'].apply(lambda d: (target_date - d).days)


In [8]:
# calculate F
TrainRFM_FB = TrainRFM.groupby('user_id').size().reset_index(name='F')

TrainRFM_RFB = TrainRFM_RB.merge(TrainRFM_FB, how='left', on='user_id')


In [9]:
# Calculate M

TrainRFM_MB = TrainRFM.groupby('user_id').agg(M=('initial_total', 'sum')).reset_index()

TrainRFMRFMB = TrainRFM_RFB.merge(TrainRFM_MB, on='user_id', how='left')

TrainRFMRFMB['M'] = TrainRFMRFMB['M'].astype(float).astype(int)


In [10]:
#Calculate L

TrainRFMRFMBL = TrainRFMRFMB

TrainRFMRFMBL['L'] = TrainRFMRFMBL.apply(lambda row: (row['maxDate_Miladi'] - row['minDate_Miladi']).days, axis=1) + 1


In [11]:
# Score R, F, M

# R
TrainRFMRFMBL['R_Norm'] = (TrainRFMRFMBL['R'] - TrainRFMRFMBL['R'].min()) / (TrainRFMRFMBL['R'].max() - TrainRFMRFMBL['R'].min())

TrainRFMRFMBL['RNormScore'] = quantile_score(TrainRFMRFMBL['R_Norm'], 5)

TrainRFMRFMBL['RNormScore'] = 6 - TrainRFMRFMBL['RNormScore']





In [12]:
# Keep F Original
TrainRFMRFMBL['F_Org'] = TrainRFMRFMBL['F'].copy()

# set F more than 200 equals 200

TrainRFMRFMBL.loc[TrainRFMRFMBL['F'] > 200, 'F'] = 200

TrainRFMRFMBL['F_Norm'] = (TrainRFMRFMBL['F'] - TrainRFMRFMBL['F'].min()) / \
                          (TrainRFMRFMBL['F'].max() - TrainRFMRFMBL['F'].min())

# Assuming quantileScore is a custom function that needs to be defined
TrainRFMRFMBL['FNormScore'] = quantile_score(TrainRFMRFMBL['F_Norm'], 5)

In [14]:
# Keep original M
TrainRFMRFMBL['M_Org'] = TrainRFMRFMBL['M'].copy()

# set M more than 4,000,000,000 equals to 4,000,000,000
TrainRFMRFMBL['M'] = np.where(TrainRFMRFMBL['M'] >= 4000000000, 4000000000, TrainRFMRFMBL['M'])


# M
TrainRFMRFMBL['M_Norm'] = (TrainRFMRFMBL['M'] - TrainRFMRFMBL['M'].min()) / \
                           (TrainRFMRFMBL['M'].max() - TrainRFMRFMBL['M'].min())

# Assuming quantileScore is a custom function defined elsewhere
TrainRFMRFMBL['MNormScore'] = quantile_score(TrainRFMRFMBL['M_Norm'], 5)