In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the digital-wallet LTV CSV from the Kaggle input directory into `data`.
csv_path = '/kaggle/input/fintech-customer-life-time-value-ltv-dataset/digital_wallet_ltv_dataset.csv'
data = pd.read_csv(csv_path)

# Last expression of the cell: the notebook renders the first rows as a table.
data.head()

In [None]:
# Print a concise summary of the frame: column names, dtypes, non-null
# counts and memory usage.
data.info()

In [None]:
# Count the missing values in each column (displayed as the cell's output).
data.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression


# --- Features / target and train-test split ---------------------------------
# LTV is the regression target. Customer_ID is dropped from the features as
# well (presumably a row identifier with no predictive value - confirm).
X = data.drop(['Customer_ID', 'LTV'], axis=1)
y = data['LTV']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Preprocessing ----------------------------------------------------------
# Columns are listed explicitly. Any column of X that appears in neither list
# is dropped by ColumnTransformer (its default is remainder='drop').
numeric_features = ['Age', 'Total_Transactions', 'Avg_Transaction_Value', 'Max_Transaction_Value',
                    'Min_Transaction_Value', 'Total_Spent', 'Active_Days', 'Last_Transaction_Days_Ago',
                    'Loyalty_Points_Earned', 'Referral_Count', 'Cashback_Received',
                    'Support_Tickets_Raised', 'Issue_Resolution_Time', 'Customer_Satisfaction_Score']
categorical_features = ['Location', 'Income_Level', 'App_Usage_Frequency', 'Preferred_Payment_Method']

# Standardise numeric columns and one-hot encode categorical ones.
# handle_unknown='ignore' lets categories that were not seen during fit be
# encoded as all zeros at predict time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# --- Candidate models -------------------------------------------------------
# Display name -> unfitted regressor. The tree ensembles are seeded so their
# results are reproducible.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=200, random_state=42),
    'Extra Trees': ExtraTreesRegressor(n_estimators=200, random_state=42),
    'Linear Regression': LinearRegression(),
}

# --- Fit, evaluate and plot each model --------------------------------------
# End points of the "perfect prediction" diagonal. They depend only on y_test,
# so they are computed once rather than on every iteration.
ltv_bounds = [y_test.min(), y_test.max()]

# One dict of metrics per model, turned into a comparison table after the loop.
score_rows = []

for name, model in models.items():
    # The same `preprocessor` object is placed in every pipeline and is
    # re-fitted on X_train by each pipe.fit call, so all models see the same
    # transformation.
    pipe = Pipeline([('preprocessor', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Compute each metric once; reuse it for the printout and the summary.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    score_rows.append({'Model': name, 'MAE': mae, 'MSE': mse, 'RMSE': mse ** 0.5, 'R2': r2})

    print(f"{name}:")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared Score: {r2}")

    # Actual vs predicted scatter. Points on the dashed red diagonal are
    # perfect predictions. The model name is part of the title so each figure
    # can be identified on its own.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred, alpha=0.5)
    ax.plot(ltv_bounds, ltv_bounds, 'r--', lw=2)
    ax.set_xlabel("Actual LTV")
    ax.set_ylabel("Predicted LTV")
    ax.set_title(f"Actual vs Predicted LTV - {name}")
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure explicitly; figures are created in a loop
    print("--------------------------------------")

# Side-by-side comparison of all models on the held-out set, best MAE first.
model_scores = pd.DataFrame(score_rows).set_index('Model').sort_values('MAE')
model_scores