In [10]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Load the dataset from a CSV file
df = pd.read_csv("/content/Online Retail Dataset.csv")

print(df)


       InvoiceNo StockCode  ... CustomerID         Country
0         536365    85123A  ...    17850.0  United Kingdom
1         536365     71053  ...    17850.0  United Kingdom
2         536365    84406B  ...    17850.0  United Kingdom
3         536365    84029G  ...    17850.0  United Kingdom
4         536365    84029E  ...    17850.0  United Kingdom
...          ...       ...  ...        ...             ...
334154    566221     21933  ...        NaN  United Kingdom
334155   C566222     23118  ...    13458.0  United Kingdom
334156   C566223    84625C  ...    16145.0  United Kingdom
334157    566224     21928  ...        NaN  United Kingdom
334158    566224    84968A  ...        NaN             NaN

[334159 rows x 8 columns]


In [11]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01/12/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01/12/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01/12/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01/12/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01/12/2010 8:26,3.39,17850.0,United Kingdom


In [13]:
# Convert InvoiceDate to datetime format
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%Y %H:%M', errors='coerce')

In [14]:
df = df.dropna(subset=['CustomerID'])


In [15]:
print(df['InvoiceDate'])

0        2010-12-01 08:26:00
1        2010-12-01 08:26:00
2        2010-12-01 08:26:00
3        2010-12-01 08:26:00
4        2010-12-01 08:26:00
                 ...        
333965   2011-09-09 15:53:00
333966   2011-09-09 15:53:00
333967   2011-09-09 15:53:00
334155   2011-09-09 16:34:00
334156   2011-09-09 16:36:00
Name: InvoiceDate, Length: 242232, dtype: datetime64[ns]


In [17]:
# Remove negative values (returns or cancellations)
df = df[df['Quantity'] > 0]
df = df[df['UnitPrice'] > 0]

In [18]:
df['CustomerID'] = df['CustomerID'].astype(int)

In [19]:
# Calculate the total price for each transaction
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Get the latest date in the dataset as a reference point
snapshot_date = df['InvoiceDate'].max()

# Compute RFM (Recency, Frequency, Monetary) features for each customer
rfm_df = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,  # Recency: days since last purchase
    'InvoiceNo': 'count',  # Frequency: number of purchases
    'TotalPrice': 'sum'  # Monetary Value: total amount spent
})

# Rename columns for better understanding
rfm_df.rename(columns={
    'InvoiceDate': 'Recency',
    'InvoiceNo': 'Frequency',
    'TotalPrice': 'MonetaryValue'
}, inplace=True)

# Define independent variables (features) and dependent variable (target)
X = rfm_df[['Recency', 'Frequency', 'MonetaryValue']]
y = rfm_df['MonetaryValue']  # Predicting future monetary value

# Split dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance using Mean Absolute Error (MAE) and R² Score
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display results
print("Predicted CLV for test data:", y_pred[:5])
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

# Predict CLV for a new customer with given Recency, Frequency, and Monetary Value
new_customer = pd.DataFrame([[30, 5, 200]], columns=['Recency', 'Frequency', 'MonetaryValue'])
predicted_clv = model.predict(new_customer)
print("Predicted CLV for new customer:", predicted_clv)

Predicted CLV for test data: [ 811.45  101.55  253.31  160.54 3578.54]
Mean Absolute Error: 1.4556275736365054e-12
R² Score: 1.0
Predicted CLV for new customer: [200.]
