In [273]:
import os
import random

import numpy as np
import pandas as pd
import shimoku_api_python as shimoku
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [274]:
# Read the processed merged table plus the raw leads and offers tables.
# The order of the paths matches the order of the names being unpacked.
merge_df, raw_leads, raw_offers = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/Processed/merged_raw.csv",
        "../Data/Raw/leads.csv",
        "../Data/Raw/offers.csv",
    )
)

In [275]:
# Credentials and identifiers are read from the environment so that no secret
# is stored in the notebook. Export SHIMOKU_TOKEN, UNIVERSE_ID and
# WORKSPACE_ID before starting the kernel; a missing variable raises KeyError
# here instead of failing later inside the client.
# NOTE(review): the token that used to be hardcoded in this cell should be
# treated as exposed and rotated.
api_key: str = os.environ["SHIMOKU_TOKEN"]
universe_id: str = os.environ["UNIVERSE_ID"]
workspace_id: str = os.environ["WORKSPACE_ID"]


s = shimoku.Client(
    access_token=api_key,
    universe_id=universe_id,
    # With async execution the s.plt.* calls below are only added to a task
    # pool; they are executed when s.run() is called.
    async_execution=True,
    verbosity='INFO',
)
# Select the workspace and the menu path ("catalog" / "Test") that every
# following plotting call publishes to.
s.set_workspace(workspace_id)
s.set_menu_path("catalog", "Test")

2023-12-14 22:47 | INFO | Starting execution: [4mset_workspace[0m
2023-12-14 22:47 | INFO | Finished execution: [4mset_workspace[0m, elapsed time: 4783.75 ms
2023-12-14 22:47 | INFO | Starting execution: [4mset_menu_path[0m
2023-12-14 22:47 | INFO | Retrieved menu path catalog with id 228ba832-394c-47e4-9a75-93431aea96c2
2023-12-14 22:47 | INFO | Retrieved board Default Name with id bfebf856-afc6-4e02-bac1-c09e615f127c
2023-12-14 22:47 | INFO | Finished execution: [4mset_menu_path[0m, elapsed time: 7812.41 ms


In [276]:
# Wipe what was previously published under the current menu path (the log
# below reports the deleted components and unused datasets), presumably so the
# following cells rebuild the page from a clean state.
s.plt.clear_menu_path()

2023-12-14 22:47 | INFO | Starting execution: [4mclear_menu_path[0m
2023-12-14 22:47 | INFO | Deleted 6 components
2023-12-14 22:47 | INFO | Deleted 3 unused datasets from the menu path catalog
2023-12-14 22:47 | INFO | Finished execution: [4mclear_menu_path[0m, elapsed time: 14944.58 ms


## Dataframes to Plot

In [277]:
# Per-column null / non-null counts for the raw offers table, laid out as one
# row per source column for the stacked bar chart.
offers_null_counts = raw_offers.isna().sum()
offers_row_total = raw_offers.shape[0]
data_list = [
    {
        "Cols": list(offers_null_counts.index),
        "Null Values": list(offers_null_counts.values),
        "Non-null Values": [offers_row_total - n_null for n_null in offers_null_counts.values],
    }
]
data = pd.concat([pd.DataFrame(summary) for summary in data_list], ignore_index=True)

In [278]:
# Per-column null / non-null counts for the raw leads table, laid out as one
# row per source column for the stacked bar chart.
leads_null_counts = raw_leads.isna().sum()
leads_row_total = raw_leads.shape[0]
data2_list = [
    {
        "Cols": list(leads_null_counts.index),
        "Null Values": list(leads_null_counts.values),
        "Non-null Values": [leads_row_total - n_null for n_null in leads_null_counts.values],
    }
]
data2 = pd.concat([pd.DataFrame(summary) for summary in data2_list], ignore_index=True)

In [290]:
# Per-column null / non-null counts for the merged table, laid out as one row
# per source column for the stacked bar chart.
merged_null_counts = merge_df.isna().sum()
merged_row_total = merge_df.shape[0]
merge_list = [
    {
        "Cols": list(merged_null_counts.index),
        "Null Values": list(merged_null_counts.values),
        "Non-null Values": [merged_row_total - n_null for n_null in merged_null_counts.values],
    }
]
merge_data = pd.concat([pd.DataFrame(summary) for summary in merge_list], ignore_index=True)

## Header

In [291]:
# Page banner published in the first slot (order=0). The HTML is assembled
# from named fragments: three <style> rules wrapped in <head>, followed by the
# visible markup.

# Container rule: rounded box with violet background and white text.
banner_container_css = (
    "<style>"
    ".component-title{height:auto; width:100%; "
    "border-radius:16px; padding:16px;"
    "display:flex; align-items:center;"
    "background-color:var(--complementary-violet); color:var(--color-white);}"
    "</style>"
)

# Icon rule: 48x48 box that shows the remote SVG as a background image.
banner_icon_css = (
    "<style>.big-icon-banner"
    "{width:48px; height: 48px; display: flex;"
    "margin-right: 16px;"
    "justify-content: center;"
    "align-items: center;"
    "background-size: contain;"
    "background-position: center;"
    "background-repeat: no-repeat;"
    "background-image: url('https://uploads-ssl.webflow.com/619f9fe98661d321dc3beec7/63594ccf3f311a98d72faff7_suite-customer-b.svg');}"
    "</style>"
)

# Subtitle rule: white text for the paragraph under the title.
banner_subtitle_css = "<style>.base-white{color:var(--color-white);}</style>"

# Visible part: icon on the left, title and subtitle in a text block.
banner_markup = (
    "<div class='component-title'>"
    "<div class='big-icon-banner'></div>"
    "<div class='text-block'>"
    "<h1>Thinking Process</h1>"
    "<p class='base-white'>"
    "And some considerations on the data and the problem by Alejandro Tovar</p>"
    "</div>"
    "</div>"
)

prediction_header = "".join(
    (
        "<head>",
        banner_container_css,
        banner_icon_css,
        banner_subtitle_css,
        "</head>",
        banner_markup,
    )
)
s.plt.html(html=prediction_header, order=0)

2023-12-14 22:49 | INFO | html added to the task pool


## Missing Values

In [308]:
# Intro paragraph for the missing-values section (slot order=1). The text is
# shown to dashboard readers, so it is kept free of spelling mistakes.
distribution_header_html = (
    '<div style="width:100%; height:90px; "><h3>Amount of Null Data for Each Dataset</h3>'
    "<p>Since there are some columns that have a high ratio of null data, I removed some columns "
    "(which didn't have an ID, since those rows would be impossible to match between datasets), "
    "and filled the values for other columns with a categorical variable or a numerical one, "
    "depending on each variable</p></div>"
)
s.plt.html(html=distribution_header_html, order=1)

2023-12-14 22:53 | INFO | html added to the task pool


In [309]:
# Stacked bar chart of the offers null summary (`data`), one bar per entry of
# the "Cols" column, placed in slot order=2.
# Colours are passed in order; presumably the first applies to "Null Values"
# and the second to "Non-null Values" — TODO confirm against the SDK.
offers_bar_colors = ["var(--color-error)", "var(--color-success-light)"]
s.plt.stacked_horizontal_bar(
    data=data,
    x="Cols",
    title='Number of Null Values for Offers',
    order=2,
    cols_size=6,
    rows_size=3,
    option_modifications={"color": offers_bar_colors},
)

2023-12-14 22:53 | INFO | stacked_horizontal_bar_chart added to the task pool


In [310]:
# Stacked bar chart of the leads null summary (`data2`), one bar per entry of
# the "Cols" column, placed in slot order=3.
# Colours are passed in order; presumably the first applies to "Null Values"
# and the second to "Non-null Values" — TODO confirm against the SDK.
leads_bar_colors = ["var(--color-error)", "var(--color-success-light)"]
s.plt.stacked_horizontal_bar(
    data=data2,
    x="Cols",
    title='Number of Null Values for Leads',
    order=3,
    cols_size=6,
    rows_size=3,
    option_modifications={"color": leads_bar_colors},
)

2023-12-14 22:53 | INFO | stacked_horizontal_bar_chart added to the task pool


In [311]:
# Stacked bar chart of the merged-table null summary (`merge_data`), placed in
# slot order=4 with cols_size=12. No rows_size is passed here, so the SDK
# default applies.
# Colours are passed in order; presumably the first applies to "Null Values"
# and the second to "Non-null Values" — TODO confirm against the SDK.
merged_bar_colors = ["var(--color-error)", "var(--color-success-light)"]
s.plt.stacked_horizontal_bar(
    data=merge_data,
    x="Cols",
    title='Number of Null values for the Merged Dataframe',
    order=4,
    cols_size=12,
    option_modifications={"color": merged_bar_colors},
)

2023-12-14 22:53 | INFO | stacked_horizontal_bar_chart added to the task pool


## Unbalanced Data

In [312]:
# Intro paragraph for the data-balance section (slot order=5). This reuses the
# name `distribution_header_html`, overwriting the missing-values intro text.
distribution_header_html = (
    '<div style="width:100%; height:90px; "><h3>Data Balance</h3>'
    "<p>The data is unbalanced for some specific features. Knowing this, I will be able to use a "
    "technique later to handle it, such as resampling or SMOTE</p></div>"
)
s.plt.html(html=distribution_header_html, order=5)

2023-12-14 22:53 | INFO | html added to the task pool


In [313]:
# Frequency of each distinct "Use Case_y" value in the merged table, reshaped
# into the two columns the pie chart reads: "label" (the value) and "value"
# (its count).
Use_case_data = merge_df["Use Case_y"].value_counts()
Use_case_df = pd.DataFrame().assign(
    label=Use_case_data.index,
    value=Use_case_data.values,
)

In [314]:
# Pie chart of the "Use Case_y" counts, placed in slot order=6.
use_case_pie_kwargs = {
    "data": Use_case_df,
    "names": "label",
    "values": "value",
    "order": 6,
    "rows_size": 2,
    "cols_size": 5,
}
s.plt.pie(**use_case_pie_kwargs)

2023-12-14 22:53 | INFO | pie_chart added to the task pool


In [315]:
# Second pie chart, placed in slot order=7.
# NOTE(review): this plots exactly the same frame and columns as the order=6
# pie, so the dashboard shows the same chart twice. Presumably a different
# feature was meant to be shown here — confirm which one.
second_pie_kwargs = {
    "data": Use_case_df,
    "names": "label",
    "values": "value",
    "order": 7,
    "rows_size": 2,
    "cols_size": 5,
}
s.plt.pie(**second_pie_kwargs)

2023-12-14 22:53 | INFO | pie_chart added to the task pool


In [316]:
# Execute the task pool. The client was created with async_execution=True, so
# the s.plt.* calls above were only "added to the task pool" (see their log
# lines); nothing is published until this call.
s.run()

2023-12-14 22:53 | INFO | Executing task pool
2023-12-14 22:53 | INFO | Starting execution: [4mhtml[0m
2023-12-14 22:53 | INFO | No changes needed for HTML at Test_1
2023-12-14 22:53 | INFO | Finished execution: [4mhtml[0m, elapsed time: 1.58 ms
2023-12-14 22:53 | INFO | Starting execution: [4mstacked_horizontal_bar_chart[0m
2023-12-14 22:53 | INFO | Starting execution: [4mstacked_horizontal_bar_chart[0m
2023-12-14 22:53 | INFO | Starting execution: [4mstacked_horizontal_bar_chart[0m
2023-12-14 22:53 | INFO | Starting execution: [4mhtml[0m
2023-12-14 22:53 | INFO | No changes needed for HTML at Test_5
2023-12-14 22:53 | INFO | Finished execution: [4mhtml[0m, elapsed time: 2.18 ms
2023-12-14 22:53 | INFO | Starting execution: [4mpie_chart[0m
2023-12-14 22:53 | INFO | Starting execution: [4mpie_chart[0m
2023-12-14 22:53 | INFO | Created EChart with id 231e8419-b955-44f9-84fb-c803378a5cf3
2023-12-14 22:53 | INFO | Deleted data set with name 82b738ca-a2bc-4cd2-a3c7-11cd77