In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

pd.set_option('display.max_columns', None)

In [3]:
# Load the pre-analyzed dataset from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer='Analyzed_DataFrame.csv')

In [4]:
# Preview the first three rows to sanity-check the loaded columns.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,total_sessions_day0,total_sessions_day1,total_sessions_day3,total_sessions_day7,chapters_finished_day0,chapters_finished_day1,chapters_finished_day3,chapters_finished_day7,chapters_opened_day0,chapters_opened_day1,chapters_opened_day3,chapters_opened_day7,chapters_closed_day0,chapters_closed_day1,chapters_closed_day3,chapters_closed_day7,diamonds_received_day0,diamonds_received_day1,diamonds_received_day3,diamonds_received_day7,diamonds_spent_day0,diamonds_spent_day1,diamonds_spent_day3,diamonds_spent_day7,tickets_spent_day0,tickets_spent_day1,tickets_spent_day3,tickets_spent_day7,retained_day1,retained_day3,retained_day7,chapters_finished_session1,chapters_finished_session3,chapters_finished_session9,chapters_opened_session1,chapters_opened_session3,chapters_opened_session9,chapters_closed_session1,chapters_closed_session3,chapters_closed_session9,diamonds_spent_session1,diamonds_spent_session3,diamonds_spent_session9,tickets_spent_session1,tickets_spent_session3,tickets_spent_session9,app_sub_ltv_day0,app_sub_ltv_day1,app_sub_ltv_day3,app_iap_ltv_day0,app_iap_ltv_day1,app_iap_ltv_day3,media_source,install_date,country_code,ad_ltv_day0,ad_ltv_day1,ad_ltv_day3,platform,target_sub_ltv_day30,target_iap_ltv_day30,target_ad_ltv_day30,target_full_ltv_day30
0,0,1.0,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,SOURCE_23,2021-12-03,COUNTRY_135,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0
1,1,1.0,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,SOURCE_2,2021-12-03,COUNTRY_141,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
2,2,1.0,2.0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,SOURCE_17,2021-12-03,COUNTRY_141,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0


# Modeling

In [5]:
# Day-30 LTV columns are prediction targets and must never be used as features.
target_cols = ['target_sub_ltv_day30', 'target_iap_ltv_day30',
               'target_ad_ltv_day30', 'target_full_ltv_day30']

# The "Unnamed: *" columns appear to be row indices written out by earlier
# DataFrame.to_csv calls (they hold 0, 1, 2, ... in the preview). They are
# numeric, so select_dtypes would otherwise pass them to the models as features.
index_cols = [col for col in df.columns if str(col).startswith('Unnamed')]

# Features: every remaining float64/int64 column. Chaining drop() instead of
# inplace=True keeps this cell idempotent and avoids mutating a derived frame.
X = (
    df.select_dtypes(include=['float64', 'int64'])
      .drop(columns=target_cols + index_cols)
)
# Target: the full 30-day LTV.
y = df['target_full_ltv_day30']

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Creating a model

In [None]:
models = [LinearRegression(), Ridge(), DecisionTreeRegressor(), ExtraTreesRegressor(), 
          RandomForestRegressor(), Lasso()]
rows = []
for clf in models:
    start = datetime.now()
    clf.fit(X_train, y_train) 
    print(f'\n {clf}')
    time = %time clf.fit(X_train, y_train) 
    y_pred = clf.predict(X_test)    
    rows.append([clf, round(mean_absolute_error(y_pred, y_test), 4), round(np.sqrt(mean_squared_error(y_pred, y_test)), 4), 
         round(mean_absolute_percentage_error(y_pred, y_test), 4), round(clf.score(X_test,y_test), 4)])

    
model_df = pd.DataFrame(rows, columns=['Model name','Test MAE', 'Test RMSE', 'Test MAPE', 'Model test_score']) 


 LinearRegression()
CPU times: user 7.53 s, sys: 674 ms, total: 8.2 s
Wall time: 2.71 s

 Ridge()
CPU times: user 1.06 s, sys: 1.11 s, total: 2.18 s
Wall time: 424 ms

 DecisionTreeRegressor()
CPU times: user 25.7 s, sys: 322 ms, total: 26 s
Wall time: 26.2 s

 ExtraTreesRegressor()
CPU times: user 13min 34s, sys: 1.89 s, total: 13min 36s
Wall time: 13min 36s

 RandomForestRegressor()


In [None]:
# Display the per-model comparison table (MAE, RMSE, MAPE and test score).
model_df

##### LinearRegression() and Ridge() showed the best results. Let's use Ridge going forward, because it trains faster.

In [None]:
# Tune Ridge's alpha with 5-fold cross-validation over 100, 300, ..., 4900.
parameters = dict(alpha=range(100, 5000, 200))
model = Ridge()

Best_model = GridSearchCV(estimator=model, param_grid=parameters, cv=5)
Best_model.fit(X_train, y_train)

# Report the refitted estimator that achieved the best cross-validated score.
print(f'Best parameters: {Best_model.best_estimator_}')

In [None]:
# Refit Ridge on the training split and report hold-out metrics.
# NOTE(review): alpha=3300 is presumably the value selected by the grid search
# in the previous cell — confirm it still matches after any data change.
model = Ridge(alpha=3300)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). MAPE divides by |y_true|, so
# passing the predictions first would report a different (wrong) value.
print(f'-----------Ridge-----------\n'
      f'MAE: {mean_absolute_error(y_test, y_pred)}\n'
      f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}\n'
      f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}\n'
      f'Model test_score: {model.score(X_test, y_test)}\n')

In [None]:
# Same Ridge model, but with features standardised first; the scaler is fitted
# inside the pipeline so it only ever sees the training split.
pipeline = make_pipeline(StandardScaler(), Ridge(alpha=3300))
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). MAPE divides by |y_true|, so
# passing the predictions first would report a different (wrong) value.
# Train and test scores are printed side by side to make over/under-fitting visible.
print(f'-----------Ridge-----------\n'
      f'MAE: {mean_absolute_error(y_test, y_pred)}\n'
      f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}\n'
      f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}\n'
      f'Train score: {pipeline.score(X_train, y_train)}\n'
      f'Test score: {pipeline.score(X_test, y_test)}\n')

In [None]:
import pickle

# Persist the fitted pipeline (scaler + Ridge) to disk.
with open('finally_model', mode='wb') as model_file:
    model_file.write(pickle.dumps(pipeline))

# Read it straight back to check that the saved artifact can be restored.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open('finally_model', mode='rb') as model_file:
    finally_model = pickle.loads(model_file.read())