In [1]:
# Libraries

import datetime
import getpass
import json
import os

import pandas as pd
import snowflake.connector

#----------CONNECTION TO SNOWFLAKE-------------
# Connection settings are read from the environment so that no secret is stored
# in the notebook. Non-secret settings fall back to the values this notebook has
# always used; the password has no fallback and is prompted for when unset.
# NOTE(review): a password was previously committed in this cell in plain text;
# it should be rotated.
SF_ACCOUNT = os.environ.get('SF_ACCOUNT', 'gfa04036.us-east-1')
SF_WH = os.environ.get('SF_WH', 'TRANSFORMING')
SF_USERNAME = os.environ.get('SF_USERNAME', 'DBT_USER')
SF_PASSWORD = os.environ.get('SF_PASSWORD') or getpass.getpass('Snowflake password: ')

# Connecting to Snowflake using the default authenticator
ctx = snowflake.connector.connect(
  user=SF_USERNAME,
  password=SF_PASSWORD,
  account=SF_ACCOUNT,
  warehouse=SF_WH,
  database='ANALYTICS',
  schema='PROD_STAGING'
)

# The cursor and connection are left open because later cells may reuse them;
# call cur.close() and ctx.close() once the notebook no longer needs them.
cur=ctx.cursor()

# One row per client/segment-user pair with a known LTV. Boolean flags are
# turned into 0/1 in SQL, and several free-text fields are reduced to their
# first space-separated token.
sql = '''
      SELECT
          clients.ltv_usd,
          CASE WHEN clients.auto_registered THEN 1 ELSE 0 END AS auto_registered,
          CASE WHEN clients.first_hunter_id IS NULL THEN 0 ELSE 1 END AS had_hunter,
          COALESCE(SPLIT(TRIM(segment_users.network_carrier,' '),' ')[0], 'not have') AS network_carrier,
          segment_users.os_version,
          SPLIT(TRIM(segment_users.device_manufacturer,' '),' ')[0] AS device_manufacturer,
          SPLIT(TRIM(segment_users.device_type,' '),' ')[0] AS device_type,
          segment_users.screen_resolution::TEXT AS screen_resolution,
          COALESCE(segment_users.email_domain,'not have') AS email_domain,
          COALESCE(segment_users.hardware_store_type,'not have') AS hardware_store_type,
          CASE WHEN segment_users.have_credit_card THEN 1 ELSE 0 END AS have_credit_card,
          COALESCE(segment_users.role,'not have') AS role,
          segment_users.specialty,
          segment_users.categories
      FROM "ANALYTICS"."PROD_MODELED"."CLIENTS" AS clients
      INNER JOIN "ANALYTICS"."PROD_MODELED"."SEGMENT_USERS" AS segment_users
          ON clients.id = segment_users.client_id
          AND clients.source_country = segment_users.source_country
          AND clients.source_client = 'Client'
      WHERE
        clients.ltv_usd IS NOT NULL
      '''
cur.execute(sql)

# Fetch the result set from the cursor and deliver it as the Pandas DataFrame.
# reset_index() is assigned rather than applied in place, so re-running the cell
# always starts from a freshly fetched frame; as before, the row index is also
# materialised as a regular column.
clients = cur.fetch_pandas_all().reset_index()

print(len(clients))

6718


In [2]:
# Encoding helpers: each call encodes a single column of the DataFrame.
def listTypeEncoder(original_df, feature_to_encode):
    """Multi-hot encode a column holding comma-separated lists of labels.

    Each distinct non-empty token found in ``feature_to_encode`` becomes a 0/1
    indicator column; the original column is removed from the result.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Name of a column whose values are strings such as ``'a,b,c'``.
        Missing values are treated as an empty list.

    Returns
    -------
    pandas.DataFrame
        ``original_df`` without ``feature_to_encode``, followed by one integer
        indicator column per token, in sorted token order.

    Notes
    -----
    Tokens are matched exactly as split on ``','``; surrounding whitespace is
    not stripped.
    """
    # One set of tokens per row; NaN/None rows become the empty set.
    row_tokens = [
        {token for token in cell.split(',') if token != ''}
        for cell in original_df[feature_to_encode].fillna('')
    ]

    # Sort so that the column order is reproducible from run to run
    # (iterating a set of strings is not).
    categories = sorted(set().union(*row_tokens))

    # Build the indicators on the caller's index so that the concat below lines
    # rows up even when the frame has been filtered or re-indexed.
    indicators = pd.DataFrame(
        {cat: [int(cat in tokens) for tokens in row_tokens] for cat in categories},
        index=original_df.index,
    )

    remaining = original_df.drop(columns=[feature_to_encode])
    return pd.concat([remaining, indicators], axis=1)

def encode_and_bind(original_dataframe, feature_to_encode):
    """One-hot encode a single categorical column and append the dummies.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Name of the categorical column.

    Returns
    -------
    pandas.DataFrame
        ``original_dataframe`` without ``feature_to_encode``, followed by its
        dummy columns named ``<feature>_<level>``. The first level is dropped
        (``drop_first=True``) so the dummies are not collinear with an
        intercept.
    """
    # Pin the dtype: pandas 1.x produced uint8 dummies, pandas 2.x defaults to
    # bool, and a bool/int mix turns into an object array that numeric
    # consumers such as statsmodels reject.
    dummies = pd.get_dummies(
        original_dataframe[[feature_to_encode]],
        drop_first=True,
        dtype='uint8',
    )
    remaining = original_dataframe.drop(columns=[feature_to_encode])
    return pd.concat([remaining, dummies], axis=1)

In [3]:
# Preprocessing - One Hot Encoding
import math
import numpy as np

# Regression target: natural logarithm of each client's LTV in USD.
Y = np.log(clients['LTV_USD'])

# Categorical features that still need one-hot encoding.
categorical_features = [
    'NETWORK_CARRIER',
    'OS_VERSION',
    'DEVICE_MANUFACTURER',
    'DEVICE_TYPE',
    'SCREEN_RESOLUTION',
    'EMAIL_DOMAIN',
    'HARDWARE_STORE_TYPE',
    'ROLE',
]
Xca = clients[categorical_features]

# Features that are continuous or already encoded as 0/1.
Xco = clients[['AUTO_REGISTERED', 'HAD_HUNTER', 'HAVE_CREDIT_CARD']]

# List-type features are currently left out of the model; to include them:
#     Xli = clients[['SPECIALTY', 'CATEGORIES']]
#     for feature in list(Xli.columns):
#         Xli = listTypeEncoder(Xli, feature)

# Replace each categorical column, one at a time, with its dummy columns.
for feature in categorical_features:
    Xca = encode_and_bind(Xca, feature)

# Final design matrix: the already-numeric columns followed by the dummies.
X = pd.concat([Xco, Xca], axis='columns')
print(X.head())

   AUTO_REGISTERED  HAD_HUNTER  HAVE_CREDIT_CARD  \
0                0           1                 0   
1                0           1                 0   
2                0           1                 0   
3                0           1                 0   
4                0           1                 0   

   NETWORK_CARRIER_#movistartedamas  NETWORK_CARRIER_#quedateencasa  \
0                                 0                               0   
1                                 0                               0   
2                                 0                               0   
3                                 0                               0   
4                                 0                               0   

   NETWORK_CARRIER_732123  NETWORK_CARRIER_akimovil  NETWORK_CARRIER_at&t  \
0                       0                         0                     0   
1                       0                         0                     0   
2                       0    

In [4]:
# Model design
import numpy as np
import statsmodels.api as sm

# Ordinary least squares of Y on the encoded features plus an intercept term.
x = sm.add_constant(data=X)
model = sm.OLS(endog=Y, exog=x)
results = model.fit()

# Plain-text regression table (coefficients, standard errors, fit statistics).
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                LTV_USD   R-squared:                       0.203
Model:                            OLS   Adj. R-squared:                  0.170
Method:                 Least Squares   F-statistic:                     6.144
Date:                Tue, 04 Jan 2022   Prob (F-statistic):          1.89e-169
Time:                        08:46:59   Log-Likelihood:                -10907.
No. Observations:                6718   AIC:                         2.235e+04
Df Residuals:                    6449   BIC:                         2.419e+04
Df Model:                         268                                         
Covariance Type:            nonrobust                                         
                                                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------