In [1]:
import os
import random

import numpy as np
import pandas as pd
import shimoku_api_python as shimoku

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer



In [2]:
# Load the pre-processed, merged dataset used for modelling
merge_df = pd.read_csv(
    "../Data/Processed/merge_processed.csv",
)

In [3]:
# Separate the "Status" target from the predictor columns
y = merge_df["Status"]
X = merge_df.drop(columns=["Status"])

# Hold out 20% of the rows for evaluation (fixed seed keeps the split stable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# "Id" is removed from the training features only; X_test keeps it so the
# predictions can be reported per lead later on
X_train = X_train.drop(columns=["Id"])

In [4]:
# Random forest classifier. The seed is fixed so that the fitted model, the
# cross-validation scores and every figure derived from the predictions are
# reproducible across "Restart & Run All".
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold cross-validation on the training split.
# cross_val_score fits clones of the estimator, so `clf` itself stays unfitted here.
scores = cross_val_score(clf, X_train, y_train, cv=5)

# Per-fold scores followed by their mean
print("Cross-validation scores: ", scores)
print("Average cross-validation score: ", scores.mean())

# Fit the final model on the whole training split; later cells use it for
# predict_proba and feature importances
clf.fit(X_train, y_train)

Cross-validation scores:  [0.75252525 0.76767677 0.72895623 0.75589226 0.72390572]
Average cross-validation score:  0.7457912457912459


In [5]:
# Predicted probability of the class stored at index 1 of clf.classes_ for
# every held-out row; "Id" is dropped because the model was trained without it
test_prediction = clf.predict_proba(X_test.drop(columns=["Id"]))[:, 1]

In [6]:
# Re-attach the true labels to a copy of the held-out features
df_test = X_test.copy()
df_test["Status"] = y_test

# One row per held-out lead: id, probability in percent, and a scoring tier
# (High above 0.75, Medium above 0.5, Low otherwise)
binary_prediction_table = pd.DataFrame({
    'Lead ID': df_test['Id'].values,
    'Probability': [round(100 * p, 2) for p in test_prediction],
    'Lead Scoring': [
        'High' if v > 0.75 else 'Medium' if v > 0.5 else 'Low'
        for v in test_prediction
    ],
})

# Number of leads in each tier (a tier with no leads counts as 0)
scoring_counts = binary_prediction_table['Lead Scoring'].value_counts()
total_occurrences = len(binary_prediction_table)
high_conversion_occurrences = int(scoring_counts.get('High', 0))
moderate_conversion_occurrences = int(scoring_counts.get('Medium', 0))
low_conversion_occurrences = int(scoring_counts.get('Low', 0))

# Share of all scored leads that falls into each tier
high_conversion = high_conversion_occurrences / total_occurrences
moderate_conversion = moderate_conversion_occurrences / total_occurrences
low_conversion = low_conversion_occurrences / total_occurrences

In [7]:
# Training features with the target re-attached, kept as record dicts
df_train = X_train.copy()
df_train["Status"] = y_train

train_dic = df_train.to_dict(orient='records')

# Fit the vectorizer on the predictor columns only. Fitting it on `train_dic`
# would add the target "Status" to dv.feature_names_, which is later shown
# as the list of model features.
dv = DictVectorizer(sparse=False)
dv.fit(X_train.to_dict(orient='records'))

### Define Auxiliary Functions

In [8]:
def get_label_columns(table_data: pd.DataFrame):
    """Build the label/colour mapping passed as ``label_columns`` to the tables.

    The colour ranges for the "Probability" column are derived from the data:
    each tier's upper bound is the largest probability found in that tier plus
    a tiny epsilon, so the maximum itself still falls inside the tier's range.

    Args:
        table_data: frame with a numeric "Probability" column and a
            "Lead Scoring" column holding "Low" / "Medium" / "High".

    Returns:
        dict mapping column names (or ``(column, variant)`` tuples) to a colour
        or to a ``{value-or-range: colour}`` dict.
    """

    def _tier_upper_bound(tier: str, fallback: float) -> float:
        # .max() of an empty selection is NaN, which would produce NaN range
        # bounds; fall back so that an absent tier gets an empty range instead.
        tier_max = table_data.loc[table_data["Lead Scoring"] == tier, "Probability"].max()
        if pd.isna(tier_max):
            return fallback
        return tier_max + 1e-10

    low_threshold = _tier_upper_bound("Low", 0.0)
    mid_threshold = _tier_upper_bound("Medium", low_threshold)
    return {
        ('Positive Impact Factors', 'outlined'): '#20C69E',
        ('Negative Impact Factors', 'outlined'): '#ED5627',
        'Lead Scoring': {
            'Low': '#F86C7D',
            'High': '#001E50',
            'Medium': '#F2BB67',
        },
        'Probability': {
            (0, low_threshold): '#F86C7D',
            (low_threshold, mid_threshold): '#F2BB67',
            (mid_threshold, np.inf): '#001E50',
        },
    }

### Client Initialization

In [9]:
# Credentials are read from the environment so they are never stored in the
# notebook. A missing variable raises KeyError naming the variable.
# NOTE(review): the token previously hard-coded in this cell should be treated
# as leaked and rotated.
api_key: str = os.environ["SHIMOKU_TOKEN"]
universe_id: str = os.environ["UNIVERSE_ID"]
workspace_id: str = os.environ["WORKSPACE_ID"]


# With async_execution=True the plotting calls below are queued in a task
# pool and only executed when s.run() is called at the end of the notebook
s = shimoku.Client(
    access_token=api_key,
    universe_id=universe_id,
    async_execution=True,
    verbosity='INFO',
)
s.set_workspace(workspace_id)
s.set_menu_path('Lead Scoring')

2023-12-14 21:29 | INFO | Starting execution: [4mset_workspace[0m
2023-12-14 21:29 | INFO | Finished execution: [4mset_workspace[0m, elapsed time: 1223.62 ms
2023-12-14 21:29 | INFO | Starting execution: [4mset_menu_path[0m
2023-12-14 21:29 | INFO | Retrieved menu path Lead Scoring with id a5126f90-6ecd-4fa4-adbb-9893a64289dc
2023-12-14 21:29 | INFO | Retrieved board Default Name with id bfebf856-afc6-4e02-bac1-c09e615f127c
2023-12-14 21:29 | INFO | Finished execution: [4mset_menu_path[0m, elapsed time: 4306.19 ms


### Clear Menu Path

In [10]:
# Clear the current menu path ('Lead Scoring') before re-creating its content;
# the captured log shows this deleting the existing components and datasets
s.plt.clear_menu_path()

2023-12-14 21:29 | INFO | Starting execution: [4mclear_menu_path[0m
2023-12-14 21:29 | INFO | Deleted 12 components
2023-12-14 21:29 | INFO | Deleted 4 unused datasets from the menu path Lead Scoring
2023-12-14 21:29 | INFO | Finished execution: [4mclear_menu_path[0m, elapsed time: 9475.37 ms


### Page Header

In [11]:
# Page banner: inline CSS in <head>, followed by the title block markup
prediction_header = "".join([
    "<head>",
    # Title banner style
    "<style>"
    ".component-title{height:auto; width:100%; "
    "border-radius:16px; padding:16px;"
    "display:flex; align-items:center;"
    "background-color:var(--chart-C1); color:var(--color-white);}"
    "</style>",
    # Icon style
    "<style>.big-icon-banner"
    "{width:48px; height: 48px; display: flex;"
    "margin-right: 16px;"
    "justify-content: center;"
    "align-items: center;"
    "background-size: contain;"
    "background-position: center;"
    "background-repeat: no-repeat;"
    "background-image: url('https://uploads-ssl.webflow.com/619f9fe98661d321dc3beec7/63594ccf3f311a98d72faff7_suite-customer-b.svg');}"
    "</style>",
    # Subtitle style
    "<style>.base-white{color:var(--color-white);}</style>",
    "</head>",
    # Banner body: icon on the left, title and subtitle on the right
    "<div class='component-title'>"
    "<div class='big-icon-banner'></div>"
    "<div class='text-block'>"
    "<h1>Predictions</h1>"
    "<p class='base-white'>"
    "Lead scoring prediction</p>"
    "</div>"
    "</div>",
])
s.plt.html(html=prediction_header, order=0)

2023-12-14 21:29 | INFO | html added to the task pool


### General Indicators

In [12]:
# Two indicators per scoring tier: the number of leads in the tier and the
# expected conversion for that tier.
# 'value' uses the tier counts directly: rebuilding them as
# int(total * (count / total)) can truncate to count - 1 through float rounding.
# The link key is spelled 'targetPath' everywhere (it was misspelled 'taregPath').
prediction_indicators = [
    {
        'description': f"{100 * high_conversion:.2f}% of total {total_occurrences}",
        'title': 'High conversion (#)',
        'value': high_conversion_occurrences,
        'color': 'success',
        'targetPath': 'www.shimoku.com',
    },
    {
        'description': '% of leads that will purchase within 120 days. Time saved: 80%',
        'title': 'High conversion expected',
        'value': '85%',
        'color': 'success',
        'variant': 'contained',
        'targetPath': 'www.shimoku.com',
    },
    {
        'description': f"{100 * moderate_conversion:.2f}% of total {total_occurrences}",
        'title': 'Moderate conversion (#)',
        'value': moderate_conversion_occurrences,
        'color': 'warning',
        'targetPath': 'www.shimoku.com',
    },
    {
        'description': '% of leads that will purchase within 120 days. Time saved: 70%',
        'title': 'Medium conversion expected',
        'value': '40%',
        'color': 'warning',
        'variant': 'contained',
        'targetPath': 'www.shimoku.com',
    },
    {
        'description': f"{100 * low_conversion:.2f}% of total {total_occurrences}",
        'title': 'Low conversion (#)',
        'value': low_conversion_occurrences,
        'color': 'error',
    },
    {
        'description': '% of leads that will purchase within 120 days.',
        'title': 'Low conversion expected',
        'value': '5%',
        'color': 'error',
        'variant': 'contained',
    },
]

In [13]:
# One indicator row per scoring tier: each row shows two consecutive entries
# of prediction_indicators and is placed at orders 1, 3 and 5
for row_index in range(3):
    first = 2 * row_index
    s.plt.indicator(
        data=prediction_indicators[first:first + 2],
        order=first + 1,
        rows_size=1,
        cols_size=12,
    )

2023-12-14 21:29 | INFO | Starting execution: [4mindicator[0m
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | Finished execution: [4mindicator[0m, elapsed time: 7.82 ms
2023-12-14 21:29 | INFO | Starting execution: [4mindicator[0m
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | Finished execution: [4mindicator[0m, elapsed time: 8.05 ms
2023-12-14 21:29 | INFO | Starting execution: [4mindicator[0m
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | Finished execution: [4mindicator[0m, elapsed time: 5.09 ms


### Prediction Table

In [14]:
# Section title for the prediction table (fixes the "predicton" typo shown to users)
prediction_table_header = (
    '<div style="width:100%; height:90px; "><h4>Lead prediction & factors</h4>'
    '<p>Affectation values for each lead</p></div>'
)
s.plt.html(html=prediction_table_header, order=7)

# Colours for the scoring labels and probability ranges
label_columns = get_label_columns(binary_prediction_table)

# Only the first 200 rows are sent to the dashboard table
s.plt.table(
    order=8, data=binary_prediction_table[:200],
    label_columns=label_columns, categorical_columns=['Lead Scoring'],
    columns_options={
        'Lead ID': {'width': 100},
        'Lead Scoring': {'width': 120},
        'Probability': {'width': 120},
        'Positive Impact Factors': {'width': 590},
        'Negative Impact Factors': {'width': 590}
    }
)

# Explanatory banner under the table; the whole banner is a link
# (the text fixes "effect" -> "affect")
table_explanaiton = (
    "<head>"
    "<style>.banner"
    "{height:100%; width:100%; border-radius:var(--border-radius-m); padding:24px;"
    "background-size: cover;"
    "background-image: url('https://ajgutierrezcommx.files.wordpress.com/2022/12/bg-info-predictions.png');"
    "color:var(--color-white);}"
    "</style>"
    "</head>"
    "<a href='https://shimoku.webflow.io/product/churn-prediction' target='_blank'>"  # link
    "<div class='banner'>"
    "<p class='base-white'>"
    "This table shows the impact values that affect each prediction of each policy. "
    "With it you can make the best decisions. <br>"
    "By filtering the data, by the values that interest you the most or by the probability of "
    "conversion that you want to improve, you will be able to take the necessary actions "
    "to obtain the maximum benefit or reduce the losses to a minimum."
    "</p>"
    "<div class='button'>Know more</div>"  # Text button
    "</div>"
    "</a>"
)
s.plt.html(html=table_explanaiton, order=9)

2023-12-14 21:29 | INFO | html added to the task pool
2023-12-14 21:29 | INFO | table added to the task pool
2023-12-14 21:29 | INFO | html added to the task pool


### Distribution 

In [15]:
# Section title for the distribution charts (fixes the "porcentage" typo shown to users)
distribution_header_html = (
    '<div style="width:100%; height:90px; "><h4>Lead distribution according to % scoring prediction</h4>'
    '<p>Total and disaggregated distribution and percentage</p></div>'
)
s.plt.html(html=distribution_header_html, order=10)

2023-12-14 21:29 | INFO | html added to the task pool


In [16]:
# Raw ECharts options for a doughnut (pie with inner radius) showing how many
# leads fall in each scoring tier. This is an f-string: doubled braces are
# literal braces, single braces interpolate the three tier counts.
# NOTE(review): the series name 'Access From' looks like a leftover from an
# ECharts sample and is probably what the tooltip shows — confirm and rename.
doughnut_chart_data = f"""
    {{
        tooltip: {{
        trigger: 'item'
    }},
    legend: {{
        top: '5%',
        left: 'center'
    }},
    series: [
    {{
        name: 'Access From',
        type: 'pie',
        radius: ['40%', '70%'],
        avoidLabelOverlap: false,
        itemStyle: {{
            borderRadius: 0,
            borderColor: '#fff',
            borderWidth: 0
        }},
        label: {{
            show: false,
            position: 'center'
        }},
        emphasis: {{
            label: {{
            show: false,
            fontSize: '40',
            fontWeight: 'bold'
        }}
      }},
      labelLine: {{
        show: false
      }},
      data: [
        {{ value: {high_conversion_occurrences}, name: 'High > 75%' }},
        {{ value: {moderate_conversion_occurrences}, name: 'Medium [50% - 75%]' }},
        {{ value: {low_conversion_occurrences}, name: 'Low < 50%' }}
      ]
    }}
  ]
}};      
"""

In [17]:
# Queue the doughnut chart built from the raw ECharts options above
s.plt.free_echarts(
    order=11,
    rows_size=2,
    cols_size=5,
    raw_options=doughnut_chart_data,
)

2023-12-14 21:29 | INFO | free_echarts added to the task pool


### Feature Importance

In [18]:
# One row per feature the forest was trained on.
# - Names come from the fitted model (feature_names_in_), so they line up
#   one-to-one with feature_importances_ and cannot include the target.
# - The whole importance array is used; indexing it with [0] would give every
#   feature the first feature's importance.
# - feature_importances_ sums to 1, so it is scaled by 100 to match the
#   "Importance (%)" label.
feature_importance = pd.DataFrame({
    'Feature': list(clf.feature_names_in_),
    'Importance (%)': 100 * clf.feature_importances_,
})

In [19]:
# Bar chart of the ten features with the highest importance
s.plt.bar(
    data=feature_importance.sort_values('Importance (%)', ascending=False).head(10),
    x='Feature',
    y=['Importance (%)'],
    order=12,
    rows_size=2,
    cols_size=7,
)

2023-12-14 21:29 | INFO | bar_chart added to the task pool


### Next Best Product

In [20]:
# Section title for the next-best-product block
next_best_product_header_html = "".join([
    '<div style="width:100%; height:90px; ">',
    '<h4>Next best product prediction</h4>',
    '<p>Products with a high probability of conversion for each lead</p>',
    '</div>',
])
s.plt.html(html=next_best_product_header_html, order=13)

2023-12-14 21:29 | INFO | html added to the task pool


In [21]:
# One outlined indicator per product. The value is a fixed share of the
# scored leads (40% / 30% / 20%), truncated to an integer.
product_recommendation_indicators = [
    {
        "color": "warning",
        "backgroundImage": image_url,
        "variant": "outlined",
        "description": "",
        "title": f"{product} (# prospects)",
        "align": "left",
        "value": int(len(test_prediction) * share),
    }
    for product, share, image_url in (
        (
            "Autos", 0.4,
            "https://img.rawpixel.com/s3fs-private/rawpixel_images/website_content/freerangepexels00938-image-kwvx0t9j.jpg?w=800&dpr=1&fit=default&crop=default&q=65&vib=3&con=3&usm=15&bg=F4F4F3&ixlib=js-2.2.1&s=c1efe8c454d88010fe6ff98c8746397d",
        ),
        (
            "New life", 0.3,
            "https://cotizator.com/wp-content/uploads/2020/06/imagen.png",
        ),
        (
            "Health", 0.2,
            "https://i.ibb.co/Vw4f63j/SALUD-P-BLICA-Y-GESTI-N-SANITARIA-2.jpg",
        ),
    )
]

In [22]:
# Queue the product indicators; each keyword names the dict key that holds
# the corresponding indicator field
s.plt.indicator(
    data=product_recommendation_indicators,
    order=14,
    header='title',
    value='value',
    color='color',
    variant='variant',
    align='align',
    background_image='backgroundImage',
)

2023-12-14 21:29 | INFO | Starting execution: [4mindicator[0m
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | create indicator added to the task pool
2023-12-14 21:29 | INFO | Finished execution: [4mindicator[0m, elapsed time: 9.57 ms


17

In [23]:
product_recommendation_table = binary_prediction_table[['Lead ID', 'Probability', 'Lead Scoring']].copy(deep=True)

# Mock "next best product" column: a product drawn with fixed weights plus a
# random percentage in 1..100. A seeded generator keeps the table identical
# across re-runs (the unseeded np.random / random calls changed it every time).
rng = np.random.default_rng(42)
n_leads = len(product_recommendation_table)
drawn_products = rng.choice(
    ['Autos', 'New life', 'Health', 'House'], size=n_leads, p=[0.4, 0.3, 0.2, 0.1]
)
drawn_percentages = rng.integers(1, 101, size=n_leads)  # upper bound is exclusive -> 1..100

product_recommendation_table['Next Best Product'] = [
    f"{product} ({percentage}%)"
    for product, percentage in zip(drawn_products, drawn_percentages)
]


In [24]:
# Colours for the scoring labels and probability ranges of this table
label_columns = get_label_columns(product_recommendation_table)

# First 200 rows only; every column gets the same width
s.plt.table(
    order=17,
    data=product_recommendation_table[:200],
    label_columns=label_columns,
    categorical_columns=['Lead Scoring'],
    columns_options={
        column: {'width': 360}
        for column in ('Lead ID', 'Lead Scoring', 'Probability', 'Next Best Product')
    },
)

2023-12-14 21:29 | INFO | table added to the task pool


### Execute all tasks

In [25]:
# The client was created with async_execution=True, so the calls above only
# queued tasks; this executes the whole task pool
s.run()

2023-12-14 21:29 | INFO | Executing task pool
2023-12-14 21:29 | INFO | Starting execution: [4mhtml[0m
2023-12-14 21:29 | INFO | Starting execution: [4mcreate indicator[0m
2023-12-14 21:29 | INFO | Starting execution: [4mcreate indicator[0m
2023-12-14 21:29 | INFO | Starting execution: [4mcreate indicator[0m
2023-12-14 21:29 | INFO | Starting execution: [4mcreate indicator[0m
2023-12-14 21:29 | INFO | Starting execution: [4mcreate indicator[0m
2023-12-14 21:29 | INFO | Starting execution: [4mcreate indicator[0m
2023-12-14 21:29 | INFO | Starting execution: [4mhtml[0m
2023-12-14 21:29 | INFO | Starting execution: [4mtable[0m
2023-12-14 21:29 | INFO | Starting execution: [4mhtml[0m
2023-12-14 21:29 | INFO | Starting execution: [4mhtml[0m
2023-12-14 21:29 | INFO | Starting execution: [4mfree_echarts[0m
2023-12-14 21:29 | INFO | Starting execution: [4mbar_chart[0m
2023-12-14 21:29 | INFO | Starting execution: [4mhtml[0m
2023-12-14 21:29 | INFO | Starting executi