In [1]:
# Import libraries

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.linear_model import LinearRegression
import random

In [2]:
# Load data
#
# NOTE: multiplying purchases by 20 is NOT a proper way to compute customer
# lifetime value; it is a deliberately simple stand-in so the notebook has a
# target column to work with.
df = pd.read_csv('clv_data.csv').assign(
    lifetime_value=lambda frame: frame['purchases'] * 20
)

df.head(3)

Unnamed: 0.1,Unnamed: 0,id,age,gender,income,days_on_platform,city,purchases,lifetime_value
0,0,0,,Male,126895,14.0,San Francisco,0,0
1,1,1,,Male,161474,14.0,Tokyo,0,0
2,2,2,24.0,Male,104723,34.0,London,1,20


In [3]:
# Count of missing values in each column

df.isna().sum()

Unnamed: 0             0
id                     0
age                 2446
gender                 0
income                 0
days_on_platform     141
city                   0
purchases              0
lifetime_value         0
dtype: int64

In [4]:
# Total number of rows in the data set

df.shape[0]

5000

In [5]:
# Method 1: Drop every row that contains at least one null value

# dropna() returns a new frame, so the original `df` is left untouched.
drop_df = df.dropna()
drop_df.isna().sum()

Unnamed: 0          0
id                  0
age                 0
gender              0
income              0
days_on_platform    0
city                0
purchases           0
lifetime_value      0
dtype: int64

In [6]:
# Method 2: Mean/Median/Mode imputation
#
# Only 'age' is imputed in this example; other columns are left as they are.

imp_mean_df = df.copy()
# Series.mean() skips NaN by default, so the mean is computed over the
# observed ages only and then used to fill the missing ones.
imp_mean_df['age'] = imp_mean_df['age'].fillna(imp_mean_df['age'].mean())
imp_mean_df.isnull().sum()

# Alternatives: imp_mean_df['age'].median() or imp_mean_df['age'].mode()[0].
# Use the pandas methods here: NumPy has no np.mode, and np.median returns
# NaN when the data contain NaN (np.nanmedian is the NaN-aware variant).

Unnamed: 0            0
id                    0
age                   0
gender                0
income                0
days_on_platform    141
city                  0
purchases             0
lifetime_value        0
dtype: int64

In [7]:
# Method 3: Iterative imputation using regression

reg_df = df.copy()
y_reg = reg_df['lifetime_value']
x_reg = reg_df[['age', 'days_on_platform', 'income']]

# Each feature with missing values is modelled from the other features using
# a linear regression. add_indicator=True makes the imputer's transform output
# carry extra binary indicator columns marking which entries were missing.
imp_reg = IterativeImputer(
    estimator=LinearRegression(),
    add_indicator=True,
    random_state=0,
)
# This cell only fits the imputer; no imputed values are produced here.
imp_reg.fit(x_reg)

IterativeImputer(add_indicator=True, estimator=LinearRegression(),
                 random_state=0)

In [8]:
# Method 4: Nearest-neighbour imputation

nn_df = df.copy()
y_nn = nn_df['lifetime_value']
x_nn = nn_df[['age', 'days_on_platform', 'income']]

# n_neighbors=5     -> each missing value is filled from the 5 nearest rows.
# weights='uniform' -> those neighbours contribute equally; 'distance' would
#                      weight closer neighbours more heavily instead.
imp_nn = KNNImputer(weights='uniform', n_neighbors=5)
# This cell only fits the imputer; no imputed values are produced here.
imp_nn.fit(x_nn)

KNNImputer()