In [None]:
# import pandas

# import modin.pandas as pd
# from modin.config import Engine
# Engine.put("dask") 

In [None]:
import os
import daal4py as d4p
from xgboost import XGBClassifier
import time
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Use plotly's 'notebook_connected' renderer for every figure by default.
pio.renderers.default = 'notebook_connected'

# Two hex-colour pairs shared by the plots in this notebook.
intel_pal = ['#0071C5', '#FCBB13']
color = ['#7AB5E1', '#FCE7B2']

# Reusable layout settings: font family/size and figure dimensions.
temp = dict(
    layout=go.Layout(
        font=dict(family="Franklin Gothic", size=12),
        height=500,
        width=1000,
    )
)

In [None]:
# Minimal three-bar chart — presumably a quick check that plotly renders
# in this notebook (TODO confirm; safe to remove otherwise).
sample_bars = go.Bar(y=[2, 3, 1])
fig = go.Figure(data=sample_bars)
fig.show()

In [None]:
# from distributed import Client
# client = Client()

# Read the CSV and discard its 'Index' column in one step.
data = pd.read_csv("10k_dataset.csv").drop(columns='Index')
print("Total rows in the DataFrame:", len(data))

# Column dtypes, followed by the number of missing values per column.
print(data.dtypes)
display(data.isna().sum())

In [None]:
# Handle missing values column by column: object (string) columns are filled
# with their most frequent value, all other columns with their median.
#
# The filled column is assigned back to `data` explicitly. Calling
# `data[column].fillna(..., inplace=True)` operates on an intermediate Series;
# pandas warns about this pattern and, with Copy-on-Write enabled, the parent
# frame is left unchanged.
start_fill = time.time()
for column in data.columns:
    if data[column].dtype == 'object':
        modes = data[column].mode()
        if modes.empty:
            # No non-missing value exists, so there is nothing to fill with.
            # Skipping mirrors the numeric branch, where an all-missing column
            # has a NaN median and the fill is a no-op.
            continue
        mode = modes.iloc[0]
        data[column] = data[column].fillna(mode)
    else:
        median = data[column].median()
        data[column] = data[column].fillna(median)

end_fill = time.time()
print("fill missing value took: {:.2f} s".format(end_fill - start_fill))

# drop duplicated rows
data = data.drop_duplicates()

# Encode `Color` and `Source` as integer codes. The `*_unique_values` arrays
# hold the original labels, indexed by code.
# Note: re-running this cell factorizes the already-encoded columns again, so
# the `*_unique_values` arrays would then contain integer codes instead of the
# original labels.
color_labels, color_unique_values = pd.factorize(data['Color'])
data['Color'] = color_labels
source_labels, source_unique_values = pd.factorize(data['Source'])
data['Source'] = source_labels

In [None]:
# Data-quality check: per-column missing counts first, then overall totals.
na_per_column = data.isna().sum()
display(na_per_column)

missing = na_per_column.sum()
duplicates = data.duplicated().sum()
print("\nThere are {:,.0f} missing values in the data.".format(missing))
print("There are {:,.0f} duplicate records in the data.".format(duplicates))

# Preview the first rows of the frame.
display(data.head())

In [None]:
def display_stats():
    """
    Show a styled table of summary statistics for the global `data` frame.

    The rows returned by `data.describe()` are extended with a `skewness` row
    and a `kurtosis` row (both computed on numeric columns only). Values are
    formatted with three decimals, and every row except the first is shaded
    with the 'GnBu' colour map.
    """
    # One single-row frame per extra statistic, labelled by the statistic name.
    extra_rows = [
        pd.DataFrame(values, columns=[label]).T
        for label, values in (
            ('skewness', data.skew(numeric_only=True)),
            ('kurtosis', data.kurt(numeric_only=True)),
        )
    ]
    stats = pd.concat([data.describe(), *extra_rows], axis=0)

    styled = stats.style.format('{:,.3f}')
    styled = styled.background_gradient(
        subset=(stats.index[1:], stats.columns[:]), cmap='GnBu'
    )
    display(styled)

display_stats()

In [None]:
# Class balance: fraction of rows for each distinct `Target` value.
target_column = data['Target']
target = target_column.value_counts(normalize=True)
print(target)

In [None]:
# Split the columns by cardinality: fewer than 10 distinct values goes to
# `cat_cols`, everything else to `float_cols`. 'Target' is seeded first in
# `float_cols` so that it is the first column of `plot_df`, where it is used
# as the hue variable.
cat_cols, float_cols = [], ['Target']
for column in data.columns:
    if data[column].nunique() < 10:
        cat_cols.append(column)
    else:
        float_cols.append(column)

# do not display distribution with column `Month` and `Day`
columns_to_drop = ['Month', 'Day']
# Build `plot_df` as a new frame. Dropping in place on a column selection of
# `data` can trigger SettingWithCopyWarning.
plot_df = data[float_cols].drop(columns=columns_to_drop)

# 9 x 2 grid of axes, one panel per plotted column.
fig, ax = plt.subplots(9, 2, figsize=(10, 20))
fig.suptitle('Distribution of Numerical Variables', fontsize=16)

# The slice skips the first column ('Target', used only as hue).
# NOTE(review): it also leaves out the last column of `plot_df` — confirm
# that this is intentional.
for i, column in enumerate(plot_df.columns[1:-1]):
    # Fill the grid left to right, top to bottom.
    row, col = divmod(i, 2)
    panel = ax[row, col]

    sns.kdeplot(
        x=column, hue='Target', palette=intel_pal[::-1], hue_order=[1, 0],
        label=['State 1', 'State 0'], data=plot_df,
        fill=True, linewidth=2.5, legend=False, ax=panel
    )

    panel.tick_params(left=False, bottom=False)
    # Only the left-hand panels carry the y-axis label.
    panel.set(title='\n\n{}'.format(column), xlabel='', ylabel=('Density' if col == 0 else ''))

# Single figure-level legend built from the handles of the first panel.
handles, _ = ax[0, 0].get_legend_handles_labels()
fig.legend(labels=['State 1', 'State 0'], handles=handles[::-1], ncol=2, bbox_to_anchor=(0.18, 0.99))
sns.despine(bottom=True, trim=True)
plt.tight_layout()

In [None]:
# fig=make_subplots(rows=9,cols=2, subplot_titles=float_cols[1:])
# col=[1,2]*9
# row=0
# pal=sns.color_palette("GnBu",100).as_hex()[9:][::3]
# num_plots = len(data[float_cols].columns[1:-3])

# print(data[float_cols].columns[1:-3])

# for i, column in enumerate(data[float_cols].columns[1:-3]):
#     if i%2==0:
#         row+=1
#     df = pd.concat([data[column],data['Target']],axis=1)
        
#     df['bins'] = pd.cut(df[column], 100)
#     df['mean'] = df.bins.apply(lambda x: x.mid)
#     df = df.groupby('mean')[[column,'Target']].mean()
#     df = df.drop_duplicates(subset=[column]).sort_values(by=column)
    
#     display(df.head())
    
    
#     fig.add_trace(go.Scatter(x=df[column], y=df['Target'], name=column,
#                              marker_color=pal[i % len(pal)],showlegend=False),  # Use modulo to keep index within range
#                   row=row, col=col[i % num_plots])  # Use modulo to keep index within range
#     fig.update_xaxes(zeroline=False, row=row, col=col[i % num_plots])  # Use modulo to keep index within range
#     if i%2==0:
#         fig.update_yaxes(title='Target Probabilitiy',row=row,col=col[i % num_plots])  # Use modulo to keep index within range
# fig.update_layout(template=temp, title='Feature Relationships with Target', 
#                   hovermode="x unified",height=700,width=900)
# fig.show()
