In [213]:
import os
import random

import numpy as np
import pandas as pd
import shimoku_api_python as shimoku
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [214]:
# Raw inputs, exactly as delivered.
raw_leads = pd.read_csv(filepath_or_buffer="../Data/Raw/leads.csv")
raw_offers = pd.read_csv(filepath_or_buffer="../Data/Raw/offers.csv")

# Derived files: the merged frame before and after processing.
merge_df = pd.read_csv(filepath_or_buffer="../Data/Processed/merged_raw.csv")
merge_processed = pd.read_csv(filepath_or_buffer="../Data/Processed/merge_processed.csv")

In [215]:
# Credentials are read from the environment so that no secret is stored in the
# notebook (or in its version history). A missing variable raises KeyError
# naming the variable, which is preferable to sending an empty token.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked and re-issued.
api_key: str = os.environ["SHIMOKU_TOKEN"]
universe_id: str = os.environ["UNIVERSE_ID"]
workspace_id: str = os.environ["WORKSPACE_ID"]

# async_execution=True: plotting calls are queued ("added to the task pool")
# and only sent when s.run() is called at the end of the notebook.
s = shimoku.Client(
    access_token=api_key,
    universe_id=universe_id,
    async_execution=True,
    verbosity='INFO',
)
s.set_workspace(workspace_id)
s.set_menu_path("catalog", "Test")

2023-12-15 09:59 | INFO | Starting execution: [4mset_workspace[0m
2023-12-15 09:59 | INFO | Finished execution: [4mset_workspace[0m, elapsed time: 2030.91 ms
2023-12-15 09:59 | INFO | Starting execution: [4mset_menu_path[0m
2023-12-15 09:59 | INFO | Retrieved menu path catalog with id 228ba832-394c-47e4-9a75-93431aea96c2
2023-12-15 09:59 | INFO | Retrieved board Default Name with id bfebf856-afc6-4e02-bac1-c09e615f127c
2023-12-15 09:59 | INFO | Finished execution: [4mset_menu_path[0m, elapsed time: 4709.83 ms


In [216]:
# Delete the components already published under the current menu path
# (presumably so that re-running the notebook does not stack duplicates).
s.plt.clear_menu_path()

2023-12-15 09:59 | INFO | Starting execution: [4mclear_menu_path[0m
2023-12-15 09:59 | INFO | Deleted 6 components
2023-12-15 09:59 | INFO | Deleted 0 unused datasets from the menu path catalog
2023-12-15 09:59 | INFO | Finished execution: [4mclear_menu_path[0m, elapsed time: 8424.65 ms


## Dataframes to Plot

In [217]:
def null_value_summary(frame: pd.DataFrame) -> pd.DataFrame:
    """Count null and non-null cells for every column of ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Dataset to summarise.

    Returns
    -------
    pd.DataFrame
        One row per column of ``frame`` with the columns ``"Cols"`` (column
        name), ``"Null Values"`` and ``"Non-null Values"``.
    """
    # Compute the null counts once and derive the non-null counts from them.
    null_counts = frame.isna().sum()
    return pd.DataFrame(
        {
            "Cols": list(null_counts.index),
            "Null Values": null_counts.to_numpy(),
            "Non-null Values": len(frame) - null_counts.to_numpy(),
        }
    )


# One summary per dataset; each feeds one of the stacked bar charts below.
data = null_value_summary(raw_offers)
data2 = null_value_summary(raw_leads)
merge_data = null_value_summary(merge_df)

## Header

In [220]:
# The banner is assembled from four pieces: three <style> blocks and the markup
# that uses them. The concatenated string is sent to the board as raw HTML.

# Style of the rounded, violet container that holds the whole banner.
banner_container_css = (
    "<style>"
    ".component-title{height:auto; width:100%; "
    "border-radius:16px; padding:16px;"
    "display:flex; align-items:center;"
    "background-color:var(--complementary-violet); color:var(--color-white);}"
    "</style>"
)

# Style of the 48x48 icon placed to the left of the title.
banner_icon_css = (
    "<style>.big-icon-banner"
    "{width:48px; height: 48px; display: flex;"
    "margin-right: 16px;"
    "justify-content: center;"
    "align-items: center;"
    "background-size: contain;"
    "background-position: center;"
    "background-repeat: no-repeat;"
    "background-image: url('https://uploads-ssl.webflow.com/619f9fe98661d321dc3beec7/63594ccf3f311a98d72faff7_suite-customer-b.svg');}"
    "</style>"
)

# White text for the subtitle paragraph.
banner_subtitle_css = "<style>.base-white{color:var(--color-white);}</style>"

# Markup: icon on the left, title and subtitle on the right.
banner_markup = (
    "<div class='component-title'>"
    "<div class='big-icon-banner'></div>"
    "<div class='text-block'>"
    "<h1>Thinking Process</h1>"
    "<p class='base-white'>"
    "And some considerations on the data and the problem by Alejandro Tovar</p>"
    "</div>"
    "</div>"
)

prediction_header = "".join(
    [
        "<head>",
        banner_container_css,
        banner_icon_css,
        banner_subtitle_css,
        "</head>",
        banner_markup,
    ]
)
s.plt.html(html=prediction_header, order=0)

2023-12-15 09:59 | INFO | html added to the task pool


## Missing Values

In [221]:
# Introductory text for the missing-values section of the board.
# Spelling and grammar of the published text were corrected; the statement
# itself is unchanged.
# NOTE(review): the parenthetical talks about *rows* without an ID while the
# sentence says *columns* were removed -- confirm which one is meant.
distribution_header_html = (
    '<div style="width:100%; height:90px; "><h3>Amount of Null Data for Each Dataset</h3>'
    "<p>Since some columns have a high ratio of null data, I removed some columns "
    "(which didn't have an ID, since those rows would be impossible to match between datasets) "
    "and filled the values of other columns with a categorical or a numerical value, "
    "depending on each variable.</p></div>"
)
s.plt.html(html=distribution_header_html, order=1)

2023-12-15 09:59 | INFO | html added to the task pool


In [222]:
# Null vs. non-null counts per column of the offers dataset.
# Occupies half of the row width (cols_size=6).
offers_null_chart = dict(
    title='Number of Null Values for Offers',
    data=data,
    x="Cols",
    order=2,
    rows_size=3,
    cols_size=6,
    # Two series colours; presumably applied in column order
    # ("Null Values", then "Non-null Values") -- confirm against the rendered chart.
    option_modifications={"color": ["var(--color-error)", "var(--color-success-light)"]},
)
s.plt.stacked_horizontal_bar(**offers_null_chart)

2023-12-15 09:59 | INFO | stacked_horizontal_bar_chart added to the task pool


In [223]:
# Null vs. non-null counts per column of the leads dataset.
# Occupies half of the row width (cols_size=6).
leads_null_chart = dict(
    title='Number of Null Values for Leads',
    data=data2,
    x="Cols",
    order=3,
    rows_size=3,
    cols_size=6,
    # Two series colours; presumably applied in column order
    # ("Null Values", then "Non-null Values") -- confirm against the rendered chart.
    option_modifications={"color": ["var(--color-error)", "var(--color-success-light)"]},
)
s.plt.stacked_horizontal_bar(**leads_null_chart)

2023-12-15 09:59 | INFO | stacked_horizontal_bar_chart added to the task pool


In [224]:
# Null vs. non-null counts per column of the merged dataframe.
# Full row width (cols_size=12); no rows_size is passed, so the library
# default height applies.
merged_null_chart = dict(
    title='Number of Null values for the Merged Dataframe',
    data=merge_data,
    x="Cols",
    order=4,
    cols_size=12,
    # Two series colours; presumably applied in column order
    # ("Null Values", then "Non-null Values") -- confirm against the rendered chart.
    option_modifications={"color": ["var(--color-error)", "var(--color-success-light)"]},
)
s.plt.stacked_horizontal_bar(**merged_null_chart)

2023-12-15 09:59 | INFO | stacked_horizontal_bar_chart added to the task pool


## Unbalanced Data

In [225]:
# Introductory text for the data-balance section of the board.
# Grammar of the published text was corrected (comma splice, "handle with this");
# the statement itself is unchanged.
distribution_header_html = (
    '<div style="width:100%; height:90px; "><h3>Data Balance</h3>'
    "<p>The data is unbalanced for some specific features. Knowing this, I will be able "
    "to use a technique such as resampling or SMOTE to handle it later.</p></div>"
)
s.plt.html(html=distribution_header_html, order=5)

2023-12-15 09:59 | INFO | html added to the task pool


In [226]:
# Frequency of each "Use Case_y" value, reshaped into the label/value layout
# consumed by the pie chart.
Use_case_data = merge_df["Use Case_y"].value_counts()
Use_case_df = pd.DataFrame(
    {
        "label": Use_case_data.index,
        "value": Use_case_data.values,
    }
)

In [227]:
# Frequency of each "Pain" value, reshaped into the label/value layout
# consumed by the pie chart.
Pain_data = merge_df["Pain"].value_counts()
Pain_df = pd.DataFrame(
    {
        "label": Pain_data.index,
        "value": Pain_data.values,
    }
)

In [228]:
# Pie chart of the "Use Case_y" frequencies; takes the left half of the row.
use_case_pie = {
    "title": "Distribution of Use Case Data",
    "data": Use_case_df,
    "names": "label",
    "values": "value",
    "order": 6,
    "rows_size": 2,
    "cols_size": 6,
}
s.plt.pie(**use_case_pie)

2023-12-15 09:59 | INFO | pie_chart added to the task pool


In [229]:
# Pie chart of the "Pain" frequencies; takes the right half of the row.
pain_pie = {
    "title": "Distribution of Pain Data",
    "data": Pain_df,
    "names": "label",
    "values": "value",
    "order": 7,
    "rows_size": 2,
    "cols_size": 6,
}
s.plt.pie(**pain_pie)

2023-12-15 09:59 | INFO | pie_chart added to the task pool


## Data Enrichment

In [242]:
# Introductory text for the data-enrichment section of the board.
# Spelling and grammar of the published text were corrected ("unnecessary and
# data", "where transformed", "remover"); the statements themselves are unchanged.
distribution_header_html = (
    '<div style="width:100%; height:90px; "><h3>Data Enrichment</h3>'
    "<p>In this step some correlations were checked and the data was encoded: "
    "one-hot encoding was applied to features with few categories and binary encoding "
    "to features with several categories.</p>"
    "<p>Also, some features were added and some unnecessary data and low-importance "
    "features were removed. As an example, the date columns were transformed into the "
    "number of days it takes to close the deal, which is more relevant data. "
    "A couple of columns with a very high correlation with the target variable "
    "were also removed.</p></div>"
)
s.plt.html(html=distribution_header_html, order=8, rows_size=6)

2023-12-15 10:03 | INFO | html added to the task pool


In [237]:
# Number of columns per dtype in the processed frame (shown as the cell output).
merge_processed.dtypes.value_counts()

bool      16
int64     15
object     2
Name: count, dtype: int64

In [238]:
def dtype_breakdown(frame, dtype_names):
    """Return the number of columns of each listed dtype, joined with ' | '.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame whose column dtypes are counted.
    dtype_names : sequence of str
        Dtype names (as printed by ``frame.dtypes``) in display order.

    Returns
    -------
    str
        Counts in the order of ``dtype_names``, e.g. ``"2 | 15 | 16"``.
        A dtype that does not occur in ``frame`` is reported as 0; dtypes
        that are not listed are not reported at all.
    """
    counts = frame.dtypes.astype(str).value_counts()
    return " | ".join(str(counts.get(name, 0)) for name in dtype_names)


# Dtypes displayed for each stage, in display order. The counts used to be
# hard-coded strings, which silently go stale when the input files change;
# they are now derived from the frames themselves.
RAW_DTYPES = ("object", "int64", "float64")
PROCESSED_DTYPES = ("object", "int64", "bool")

# "Before" row: merged frame prior to feature engineering.
s.plt.indicator(
    order=9, cols_size=9,
    padding="0,0,0,2",
    data=[
         {
             "description": "Feature Engineering",
             "title": "",
             "value": "Before",
             "align": "center",
             "color": "default",
             "variant": "contained"
        },
        {
             "description": "",
             "title": "Features",
             "value": merge_df.shape[1],
             "align": "center",
             "color": "default"
        },
        {
            "description": " | ".join(RAW_DTYPES),
            "title": "data types",
            "value": dtype_breakdown(merge_df, RAW_DTYPES),
            "color": "default",

        },
    ],
)
# "After" row: processed frame. The first row's three indicators take
# orders 9-11, so this row starts at order 12.
s.plt.indicator(
    order=12, cols_size=9,
    padding="0,0,0,2",
    data=[
         {
             "description": " Feature Engineering",
             "title": "",
             "value": "After",
             "align": "center",
             "color": "success",
             "variant": "contained"
        },
        {
             "description": "",
             "title": "Features",
             "value": merge_processed.shape[1],
             "align": "center",
             "color": "success"
        },
        {
            "description": " | ".join(PROCESSED_DTYPES),
            "title": "data types",
            "value": dtype_breakdown(merge_processed, PROCESSED_DTYPES),
            "color": "success",
        },
    ],
)

2023-12-15 10:01 | INFO | Starting execution: [4mindicator[0m
2023-12-15 10:01 | INFO | create indicator added to the task pool
2023-12-15 10:01 | INFO | create indicator added to the task pool
2023-12-15 10:01 | INFO | create indicator added to the task pool
2023-12-15 10:01 | INFO | Finished execution: [4mindicator[0m, elapsed time: 10.56 ms
2023-12-15 10:01 | INFO | Starting execution: [4mindicator[0m
2023-12-15 10:01 | INFO | create indicator added to the task pool
2023-12-15 10:01 | INFO | create indicator added to the task pool
2023-12-15 10:01 | INFO | create indicator added to the task pool
2023-12-15 10:01 | INFO | Finished execution: [4mindicator[0m, elapsed time: 9.89 ms


15

In [243]:
# Execute the queued task pool: the plotting calls above were only queued
# ("added to the task pool" in the logs) and are sent to the board here.
s.run()

2023-12-15 10:03 | INFO | Executing task pool
2023-12-15 10:03 | INFO | Starting execution: [4mhtml[0m
2023-12-15 10:03 | INFO | Updated HTML at Test_8
2023-12-15 10:03 | INFO | Finished execution: [4mhtml[0m, elapsed time: 2426.69 ms
