In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Load the data
data = pd.read_csv('customer_acquisition_data.csv')



In [51]:
data.head()

Unnamed: 0,customer_id,channel,cost,conversion_rate,revenue
0,1,referral,8.320327,0.123145,4199
1,2,paid advertising,30.450327,0.016341,3410
2,3,email marketing,5.246263,0.043822,3164
3,4,social media,9.546326,0.167592,1520
4,5,referral,8.320327,0.123145,2419


In [52]:
data.isna().sum()

Unnamed: 0,0
customer_id,0
channel,0
cost,0
conversion_rate,0
revenue,0


In [53]:
# Step 1: Data Preprocessing
# Handle missing values
data = data.dropna()

In [54]:
# Compute CLV
# CLV = Revenue - Cost
data['CLV'] = data['revenue'] - data['cost']



In [55]:
data.head()

Unnamed: 0,customer_id,channel,cost,conversion_rate,revenue,CLV
0,1,referral,8.320327,0.123145,4199,4190.679673
1,2,paid advertising,30.450327,0.016341,3410,3379.549673
2,3,email marketing,5.246263,0.043822,3164,3158.753737
3,4,social media,9.546326,0.167592,1520,1510.453674
4,5,referral,8.320327,0.123145,2419,2410.679673


In [56]:
# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False)
channel_encoded = encoder.fit_transform(data[['channel']])
channel_encoded_df = pd.DataFrame(channel_encoded, columns=encoder.get_feature_names_out(['channel']))

In [57]:
# Concatenate encoded columns with the original data
data = pd.concat([data.reset_index(drop=True), channel_encoded_df], axis=1)
data = data.drop(columns=['channel', 'customer_id'])


In [58]:
data.head()

Unnamed: 0,cost,conversion_rate,revenue,CLV,channel_email marketing,channel_paid advertising,channel_referral,channel_social media
0,8.320327,0.123145,4199,4190.679673,0.0,0.0,1.0,0.0
1,30.450327,0.016341,3410,3379.549673,0.0,1.0,0.0,0.0
2,5.246263,0.043822,3164,3158.753737,1.0,0.0,0.0,0.0
3,9.546326,0.167592,1520,1510.453674,0.0,0.0,0.0,1.0
4,8.320327,0.123145,2419,2410.679673,0.0,0.0,1.0,0.0


In [59]:
# Step 2: Split data into training and testing sets
X = data.drop(columns=['CLV'])
y = data['CLV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [60]:
# Step 3: Train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)


In [61]:
# Step 4: Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)



In [62]:
# Print evaluation metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Mean Absolute Error (MAE): 8.90
Mean Squared Error (MSE): 124.61
Root Mean Squared Error (RMSE): 11.16
R-squared (R2): 1.00


In [64]:
# use linear regression

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Load the data
data = pd.read_csv('customer_acquisition_data.csv')

# Step 1: Data Preprocessing
# Handle missing values
data = data.dropna()

# Compute CLV
# CLV = Revenue - Cost
data['CLV'] = data['revenue'] - data['cost']

# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False) # Replace 'sparse' with 'sparse_output'
channel_encoded = encoder.fit_transform(data[['channel']])
channel_encoded_df = pd.DataFrame(channel_encoded, columns=encoder.get_feature_names_out(['channel']))

# Concatenate encoded columns with the original data
data = pd.concat([data.reset_index(drop=True), channel_encoded_df], axis=1)
data = data.drop(columns=['channel', 'customer_id'])

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['cost', 'conversion_rate', 'revenue']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Step 2: Split data into training and testing sets
X = data.drop(columns=['CLV'])
y = data['CLV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the Ridge Regression model with cross-validation
model_lr = Ridge(alpha=1.0)
cross_val_scores = cross_val_score(model_lr, X_train, y_train, cv=5, scoring='r2')
model_lr.fit(X_train, y_train)

# Step 4: Evaluate the model
y_pred = model_lr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")
print(f"Cross-Validation R2 Scores: {cross_val_scores}")
print(f"Mean Cross-Validation R2: {np.mean(cross_val_scores):.2f}")

# Step 5: Show predictions on sample data
sample_data = X_test.head(5)
sample_predictions = model.predict(sample_data)

# Display sample predictions
print("Sample Data:")
print(sample_data)
print("\nPredicted CLV:")
print(sample_predictions)


Mean Absolute Error (MAE): 1.76
Mean Squared Error (MSE): 4.25
Root Mean Squared Error (RMSE): 2.06
R-squared (R2): 1.00
Cross-Validation R2 Scores: [0.99999595 0.9999962  0.999996   0.99999603 0.99999594]
Mean Cross-Validation R2: 1.00
Sample Data:
         cost  conversion_rate   revenue  channel_email marketing  \
696 -0.486856         0.618402  0.069790                      0.0   
667 -0.363219         1.364497 -1.268038                      0.0   
63  -0.363219         1.364497  1.081900                      0.0   
533 -0.486856         0.618402 -0.236067                      0.0   
66   1.744861        -1.174407 -0.190785                      0.0   

     channel_paid advertising  channel_referral  channel_social media  
696                       0.0               1.0                   0.0  
667                       0.0               0.0                   1.0  
63                        0.0               0.0                   1.0  
533                       0.0               1.0