In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import joblib
from collections import Counter
import plotly.express as px

from jupyter_dash import JupyterDash
import dash_daq as daq
from dash.exceptions import PreventUpdate
import dash_bootstrap_components as dbc
from dash import html, dcc, Input, Output, State, callback_context, dash_table

import shap
from ast import literal_eval

In [6]:
assets_target = 'jobathon_nov2021'
train_data=pd.read_csv('../Data/Train.csv')
train_data.head()

Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
0,2016-01-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,2381060,2
1,2016-02-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,-665480,2
2,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2
3,2017-11-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
4,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1


In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MMM-YY                19104 non-null  object
 1   Emp_ID                19104 non-null  int64 
 2   Age                   19104 non-null  int64 
 3   Gender                19104 non-null  object
 4   City                  19104 non-null  object
 5   Education_Level       19104 non-null  object
 6   Salary                19104 non-null  int64 
 7   Dateofjoining         19104 non-null  object
 8   LastWorkingDate       1616 non-null   object
 9   Joining Designation   19104 non-null  int64 
 10  Designation           19104 non-null  int64 
 11  Total Business Value  19104 non-null  int64 
 12  Quarterly Rating      19104 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 1.9+ MB


In [11]:
columns = ['Customer_ID', 'Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category', 'Customer_Category',
           'Product_Holdin`g_B1', 'Product_Holding_B2']

columns_full_name = ['Customer ID', 'Gender', 'Age', 'Vintage', 'Activity Index', 'City Category', 'Customer Category',
                     'Current Product Holding', 'Product Holding Prediciton']
columns_description = [
    'Unique ID for the customer',
    'Gender of the Customer',
    'Age of the Customer (in Years)',
    'Vintage for the Customer (In Months)',
    'Activity Index, 0 :  Less frequent customer, 1 : More frequent customer',
    "Encoded Category of customer's city",
    "Encoded Category of the customer",
    "Current Product Holding (Encoded)",
    "Product Holding in next six months (Encoded) - Target Column"
]
columns_info = [
    " ",
    " ",
    " ",
    " ",
    " ",
    " ",
    " ",
    " ",
    ['Imbalanced dataset', 'Most people are not job seeking'],
]
data = {'Feature_name':columns, 'Feature_description':columns_description, 'Feature_meaning':columns_full_name,
        'Feature_info':columns_info}

pd.DataFrame(data).to_csv('assets/'+assets_target+'/feature_description.csv', index=False)


In [4]:
# feature_description
feature_description = pd.read_csv('assets/'+assets_target+'/feature_description.csv')

In [5]:
class webiste():
    def __init__(self):
        self.app = JupyterDash(__name__,
                               external_stylesheets=[dbc.themes.BOOTSTRAP],
                               meta_tags=[{'name': 'viewport',
                                           'content': 'width=device-width, initial-scale=1.0'}],)
        self.button_id=0
        
    def run_server(self, port = 8050):
        self.app.run_server(port=port, mode = 'external')
    
    def make_item(self, feature_name,input_feature=False):
        
        print(self.button_id, feature_name, input_feature)
        if input_feature == 'Intro':
            content = self.intro_content()
        elif input_feature == 'data':
            content = self.data_content()
        elif input_feature == 'graph':
            content = self.graph_content()
        elif input_feature == 'uni_graph':
            content = self.graph_content_univariate()
        elif input_feature == 'bi_graph':
            content = self.graph_content_bivariate()
        elif input_feature == 'lr_with_1_var':
            content = topic2_week1(i)
        elif input_feature == 'lr_1_var_mr':
            content = lr_1_var_mr_f()
        elif input_feature == 'lr_1_var_prac':
            content = lr_1_var_prac_f()
        elif input_feature == 'Week2':
            content = Week2_content(i)
        elif input_feature == 'lr_with_multi_var':
            content = lr_with_multi_var_f()
        elif input_feature == 'feature_scaling':
            content = feature_scaling_f()
        elif input_feature == 'learning_rate':
            content = learning_rate_f()
        elif input_feature == 'Week3':
            content = Week3_content(i)
        elif input_feature == 'log_r':
            content = log_r_f()
        elif input_feature == 'multi_log_r':
            content = multi_log_r_f()
        elif input_feature == 'overfitting':
            content = overfitting_f()
        self.button_id += 1
        return dbc.Card([
                dbc.CardHeader(
                    dbc.ListGroupItem(
                        html.H3(feature_name), id=f"{self.ml_num}_group_c1-{self.button_id}-toggle", n_clicks=0, action=True
                    )),
                dbc.Collapse( content, id=f"{self.ml_num}_collapse_c1-{self.button_id}", is_open=False, className='m-1'),
            ])
        
    def make_bivariate(self, _print=False):
        object_col_type = self.df.select_dtypes(include='object').columns.tolist()
        col_use = object_col_type
        target=self.target
        col_use = [i for i in col_use if i != target]
#         print(self.not_to_use)
        col_use = [i for i in col_use if i not in self.not_to_use]
        
#         print(col_use)
        ax = self.df[col_use[0]].value_counts().rename('count')
        bx = round(self.df[col_use[0]].value_counts(normalize=True)*100,2).rename('percentage')
        cx = round(self.df.groupby(col_use[0])[self.target].value_counts(normalize=True).unstack(
            self.target)*100,2).reset_index().set_index(col_use[0])
        dx = round(self.df.groupby(col_use[0])[self.target].value_counts(normalize=False).unstack(
            self.target),2).reset_index().set_index(col_use[0])
        ax = pd.concat([ax, bx, cx, dx], axis=1).reset_index()
        columns = ax.columns.tolist()
        columns[0] = 'index'
        ax.columns = columns
        ax['feature'] = col_use[0]
#         print(ax)
        for i in range(1, len(col_use)):
            bx = self.df[col_use[i]].value_counts().rename('count')
            cx = round(self.df[col_use[i]].value_counts(normalize=True)*100,2).rename('percentage')
            dx = round(self.df.groupby(col_use[i])[self.target].value_counts(normalize=True).unstack(
                self.target)*100,2).reset_index().set_index(col_use[i])
            ex = round(self.df.groupby(col_use[i])[self.target].value_counts(normalize=False).unstack(
                self.target),2).reset_index().set_index(col_use[i])
            bx = pd.concat([bx, cx, dx, ex], axis=1).reset_index()
            bx['feature'] = col_use[i]
            #some columns have first column go with their name IDKW
            columns = bx.columns.tolist()
            columns[0] = 'index'
            bx.columns = columns

            ax=pd.concat([ax, bx], axis=0,)
#             print(ax)
        ax.reset_index(drop=True, inplace=True)
        self.bivariate_data = ax.copy()
        ax.to_csv('assets/'+assets_target+'/bivariate_object.csv', index=None)
        if _print:
            print(ax)
      
    def make_bivariate_numerical(self, _print=False):
        col_use = self.df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        col_use = [i for i in col_use if i not in self.not_to_use]
        target = self.target

        ax = round(self.df.groupby(col_use[0])[target].value_counts(normalize=False).unstack(target),2).reset_index()
        cx = self.df[col_use[0]].value_counts().rename('count').reset_index()
        ax['feature'] = col_use[0]
        ax.rename(columns = {col_use[0]:'feature_value'}, inplace = True)
        ax = pd.merge(cx, ax, right_on='feature_value', left_on='index')

        for i in range(1, len(col_use)):
            bx = round(self.df.groupby(col_use[i])[target].value_counts(normalize=False).unstack(target),2).reset_index()
            cx = self.df[col_use[i]].value_counts().rename('count').reset_index()
            bx['feature'] = col_use[i]
            bx.rename(columns = {col_use[i]:'feature_value'}, inplace = True)
            bx = pd.merge(cx, bx, right_on='feature_value', left_on='index')
            ax=pd.concat([ax, bx], axis=0,)

        self.bivariate_numerical_data = ax.copy()
        ax.to_csv('assets/'+assets_target+'/numerical_count.csv', index=False)
        if _print:
            print(ax)
        
    def paragraph_html(self, data):
        In_para_list = []
        for par in data:
            paragraph = []
            for i, j in par.items():
                if i == 'title':
                    paragraph.append(html.H3(j))
                elif i[0] == 'B':
                    paragraph.append(html.B(j+" "))
                elif i[0] == 'T':
                    paragraph.append(j+" ")
                elif i[0] == 'A':
                    paragraph.append(html.A(j[0], href=j[1], target="_blank"))
            In_para_list.append(paragraph)
        to_return = [html.P(i) for i in In_para_list]
        return to_return 

    
    def Introduction(self):
        
        if self.In_true:
#             self.button_id += 1
            Intro = dbc.Col([self.make_item('Introduction', 'Intro')], sm=12)
            return Intro
        else:
            return ""
    def intro_content(self):
        return html.Div(
            [
                dbc.Row(
                    [
                        dbc.Col(
                            self.paragraph_html(self.In_para), md=12, lg=7,
                            className='d-flex flex-column justify-content-center'
                        ),
                        dbc.Col(
                            [
                                html.Div(
                                    [
                                        html.Img(src=self.In_image[0],style={'height':'50%', 'width':'70%'})
                                    ],className="d-flex justify-content-center"
                                ),
                                html.P(self.In_image[1], className='mt-1 d-flex justify-content-center')
                            ],md=12, lg=5, align='center'
                        )
                    ]
                )
            ], className='px-5 py-3'
        )

    
    def available_data(self):
        
        if self.Da_true:
#             self.button_id += 1
            ava_data = dbc.Col([self.make_item('Data', 'data')], sm=12)
            return ava_data
        else:
            return ""
    def data_content(self):
        return html.Div(
            [
                dbc.Row(
                    dbc.Col(
                        self.paragraph_html(self.Da_para),
                        className='d-flex flex-column justify-content-center'
                    )
                ),
                dbc.Row(
                    dbc.Col(
                        dash_table.DataTable(
                            columns=[{"name": i, "id": i} for i in self.df.columns],
                            data=self.df.iloc[:5, :].to_dict('records'),
                            style_cell={
                                'whiteSpace': 'normal',
                                'height': 'auto',
                                'maxWidth': 0,
                            },
                            style_header={
                                'backgroundColor': 'gray',
                                'fontWeight': 'bold',
                                'color':'white'
                            },
                        )
                    )
                )
            ], className='px-5 py-3'
        )
        
    
    def plot_graph(self):
        
        if self.Pl_true:
#             self.button_id += 1
            plot_g = dbc.Col([self.make_item('Graph', 'graph')], sm=12)
            return plot_g
        else:
            return ""
    
    def graph_content(self):
        return html.Div(
            [
                dbc.Row(
                    [
                        dbc.Col([self.make_item( 'uni_graph', 'uni_graph')], sm=12)
                    ], justify = 'center', className = 'm-1'),
                dbc.Row(
                    [
                        dbc.Col([self.make_item('bi_graph', 'bi_graph')], sm=12)
                    ], justify = 'center', className = 'm-1'),

            ]
        )
        return plot_g
    
    def graph_content_univariate(self):
        
        return dbc.Row(
            [
                dbc.Col(
                    [
                        dbc.Row(
                            [
                                dbc.Col(
                                    [
                                        'Select Feature:',
                                        dbc.Select(
                                            id = f"select-feature_{self.ml_num}",
                                            options=[{'label': i, 'value': i} for i in self.Pl_variable_list],
                                            className='m-2',
#                                             value="",#self.Pl_variable_list[0],
                                            
                                            style={'width':'200px'}
                                        )
                                        
                                    ], className='d-flex justify-content-center align-items-center',
                                    style={'height':'100px'},
                                )
                            ], justify=True,
                        ),
                        dbc.Row(
                            [
                                dbc.Col(
                                    [
                                        html.P(id=f'feature_description_{self.ml_num}', className='p-2')
                                    ], className='d-flex justify-content-center align-items-center'
                                )
                            ]
                        )
                    ], md=12, lg=5,style={'background':''}, align=True, className='d-flex flex-column'
                ),
                dbc.Col(
                    [
                        html.Div(
                            [
                                dcc.Graph(id=f'feature-graph_{self.ml_num}',
                                          config=dict(modeBarButtons=[['toImage']], displaylogo=False),
                                          style={'position':'absolute','z-index': -1 , 'height':'100%'}),
                                daq.BooleanSwitch(id=f'switch_{self.ml_num}', on = False, className = "mt-1"),
                                dbc.Select(id=f"select-bins_{self.ml_num}",
                                           options=[{'label': i, 'value': i} for i in np.arange(10, 110, 10)],
                                           className='mt-1', value=20),
                                daq.BooleanSwitch(id=f'switch_{self.ml_num}_2', on = False, className = 'mt-1')
                            ], style={'position':'relative', 'height': "500px"}
                        )
                    ], md=12, lg=7
                )
            ]
        )
        
    def graph_content_bivariate(self):
        return html.Div('sadf')

        
        
        
    def layout(self, problem_type, dataframe, feature_description,classification_target=None,
               ml_number='ml2', title="New Website",
               In_true=True, In_para = [{'title':'title'}, {'B1':'bold'}, {'T1':'text'}],
               In_image = ['image_url', 'image_detail'], 
               Da_true = True, Da_para = [{'B1':'bold'}, {'T1':'text'}],
               Pl_true = True, Pl_variable_list = [None], Pl_no_graph = ['tar'],
               donut_title='write about target', donut_center_text = "target unique value",
               colors = {'histogram':['rgb(152, 152, 152, 0.5)', '#0e4f66', '#002d1d', 'black'], 'graph_title':'black',
                         'legend_colors':['rgb(152, 152, 152, 0.5)', '#0e4f66', 'pink'],
                         'legend_border':'pink',
                         'background_color':"#fbfbfb",},
              not_to_use=[None]):
        
        self.problem_type = problem_type
        self.df = dataframe
        self.fd = feature_description
        self.target = classification_target
        self.ml_num = ml_number
        self.title = title
        self.In_true = In_true
        self.In_para = In_para
        self.In_image = In_image
        self.Da_true = Da_true
        self.Da_para = Da_para
        self.Pl_true = Pl_true
        self.Pl_variable_list = Pl_variable_list
        self.no_graph = Pl_no_graph
        self.donut_center_text = donut_center_text
        self.donut_title = donut_title
        self.colors = colors
        self.not_to_use = not_to_use
        try:
            self.bivariate_data = pd.read_csv('assets/'+assets_target+'/bivariate_object.csv')
        except:
            self.make_bivariate()
        
        try:
            self.bivariate_numerical_data = pd.read_csv('assets/'+assets_target+'/numerical_count.csv')
        except:
            self.make_bivariate_numerical()
            
        
        self.app.layout = html.Div(
            [
                dbc.Row(
                    [
                        title
                    ], justify="center", align='center',style={'color':'#FFFFFF','font-weight': 'bold',
                                                               'font-size':'25px','background':'#0096A9',
                                                               'height':'6vh', 'margin-bottom':'3px'}),

                dbc.Row(
                    [
                        self.Introduction()
                    ], justify='center', className='m-1'),
                
                dbc.Row(
                    [
                        self.available_data()
                    ], justify='center', className='m-1'),
                
                dbc.Row(
                    [
                        self.plot_graph()
                    ], justify='center', className='m-1'),
                

                


            ]
        )
    
    def callbacks(self):
        @self.app.callback(
            [Output(f"{self.ml_num}_collapse_c1-{i}", "is_open") for i in range(1, self.button_id+1)],
            [Input(f"{self.ml_num}_group_c1-{i}-toggle", "n_clicks") for i in range(1, self.button_id+1)],
            [State(f"{self.ml_num}_collapse_c1-{i}", "is_open") for i in range(1, self.button_id+1)],
        )
        def toggle_collapse(*args):
            no_of_cards=int((len(args))/2)
            toggle_status=list(args[no_of_cards:])
            ctx = callback_context
            if not ctx.triggered:
                return [False]*no_of_cards
            else:
                button_id = ctx.triggered[0]["prop_id"].split(".")[0].split("-")[1]
            if button_id:
                if toggle_status[int(button_id)-1]:
                    toggle_status[int(button_id)-1] = False
                else:
                    toggle_status[int(button_id)-1] = True

                return toggle_status
            else:
                return toggle_status
            
            
        @self.app.callback(
            [Output(f'feature_description_{self.ml_num}', 'children')],
            [Input(f'select-feature_{self.ml_num}', 'value')]
        )
        def feature_info(value):
            return [self.fd[self.fd['Feature_name'] == value]['Feature_description']]
        

                
        
        
        
        @self.app.callback(
            [Output(f'feature-graph_{self.ml_num}', 'figure'),
             Output(f'switch_{self.ml_num}', 'style'),
             Output(f'select-bins_{self.ml_num}', 'style'),
             Output(f'switch_{self.ml_num}_2', 'style')],
            [Input(f'select-feature_{self.ml_num}', 'value'),
             Input(f'switch_{self.ml_num}', 'on'),
             Input(f'switch_{self.ml_num}_2', 'on'),
             Input(f'select-bins_{self.ml_num}', 'value')]
        )
        def feature_plot(value, on, target_include, bin_req):
            
            if self.problem_type == 'classification':
                sec_bool_switch = {'width':'200px','position':'absolute', 'z-index': 10,
                                                      'top':0, 'right':'-50px'}
            else:
                sec_bool_switch = {'display':'none'}

            cat_col = self.df.select_dtypes(include=['object', 'category']).columns.tolist()

            column_full_mean = self.fd.iloc[:, [0, 1]].set_index('Feature_name').T.to_dict('list')
            
            if value in self.no_graph:
                fig = go.Figure()
                fig.add_annotation(text="NO Graph for this Attribute<br> Select Another one",
                                   font=dict(family="Courier New, monospace",size=20,color = 'black'),
                                      xref="x", yref="y",textangle=0,align="center",
                                      x=2.5, y=1, showarrow=False)
                fig.add_layout_image(
                        dict(
                            source="assets/"+assets_target+"/no_required.PNG",
                            xref="x", yref="y", x=0, y=5, sizex=5, sizey=5,
                            sizing="stretch", opacity=1,
                            layer="above")
                )

                fig.update_layout(template="plotly_white", xaxis=dict(showgrid=False, showticklabels=False,zeroline=False),
                                  yaxis=dict(showgrid=False, showticklabels=False, zeroline=False))
                style={'display':'none'}
                style_bin={'display':'none'}
                return [fig, style, style_bin, sec_bool_switch]
            
            elif value == self.target:
                d= pd.DataFrame(self.df[self.target].value_counts())
                fig = px.pie(d,values=self.target,
#                              names=['No','Yes'],
                             hole=0.4,opacity=0.6,
                            color_discrete_sequence=self.colors['histogram'],
                            )

                fig.add_annotation(text=self.donut_center_text,
                                   x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')

                fig.update_layout(
                    font_family='monospace',
                    title=dict(text=self.donut_title,x=0.47,y=0.98,
                               font=dict(color=self.colors['graph_title'],size=20,family='serif')),
                    legend=dict(x=0.37,y=-0.05,orientation='h',traceorder='reversed'),
                    hoverlabel=dict(bgcolor='white'))

                fig.update_traces(textposition='outside', textinfo='percent+label')
                style={'display':'none'}
                style_bin={'display':'none'}
                style2={'display':'none'}
                return [fig, style, style_bin, style2]
                
            elif value is None:
                fig = go.Figure()
                fig.add_annotation(text="Select Column from dropdown",
                                   font=dict(family="Courier New, monospace",size=20,color = 'black'),
                                      xref="x", yref="y",textangle=0,align="center",
                                      x=2.5, y=1, showarrow=False)
                fig.update_layout(template="plotly_white", xaxis=dict(showgrid=False, showticklabels=False,zeroline=False),
                                  yaxis=dict(showgrid=False, showticklabels=False, zeroline=False))
                style={'display':'none'}
                style_bin={'display':'none'}
                style2={'display':'none'}
                return [fig, style, style_bin, style2]
                
            elif value in cat_col:

                style_bin = {'display':'none'}
                style={'width':'200px','position':'absolute', 'z-index': 10 }
                if on:
                    self.type_of_graph = [i+2 for i in range(1, self.df[self.target].nunique()+1)]
                    text_template = "%{y:,}%"
                    y_text='Percentage'
                    col_num='percentage'
                else:
                    self.type_of_graph = [-1*i-1 for i in range(self.df[self.target].nunique(), 0,-1)]
                    text_template = "%{y:,}"
                    y_text='Sum'
                    col_num='count'
                    
                if target_include:
#                     ax = round(self.df.groupby(value)[self.target].value_counts(
#                         normalize=True).unstack(self.target)*100,2).reset_index()
#                     bx = round(self.df.groupby(value)[self.target].value_counts(
#                         normalize=False).unstack(self.target),2).reset_index()
#                     ax = pd.concat([ax, bx.drop(value, axis=1)], axis=1)
#                     feature_value = ax.iloc[:, 0].tolist()
                    feature_value = self.bivariate_data.loc[self.bivariate_data['feature'].isin([value]), 'index'].tolist()
                    self.feature_value = [str(i) for i in feature_value]
#                     legend_name = ax.columns.unique().tolist()[1:]       #check
                    number_of_legend = (self.bivariate_data.shape[1]-4)/2
                    legend_name = self.bivariate_data.columns[3:3+int(number_of_legend)].tolist()
#                     legend_name = self.df[self.target].unique().tolist()
                    if self.colors['legend_colors']:
                        legend_color = self.colors['legend_colors']
                    else:
                        legend_color = ['green', 'blue']
                    
                    fig = go.Figure(
                        data=[
                            go.Bar(
                                x=feature_value,
                                y=self.bivariate_data.loc[self.bivariate_data['feature'].isin([value]),
                                                           self.bivariate_data.columns[i]].tolist(),
                                name=legend_name[j],
                                marker_color=legend_color[j],
                                texttemplate=text_template,
                                textposition='auto'
                            ) for j, i in enumerate(self.type_of_graph)
                        ]
                    )
                    
                else:

#                     ff = pd.merge(
#                         left = (self.df[value].value_counts(normalize=True)*100).round(),
#                         right = self.df[value].value_counts(),left_index=True,
#                         right_index=True
#                     )
# #                     print(ff)
#                     feature_value = ff.sort_index().index.tolist()
                    feature_value = self.bivariate_data.loc[self.bivariate_data['feature'].isin([value]),'index'].tolist()
                    self.feature_value = [str(i) for i in feature_value]

                    fig = go.Figure(
                        data=[
                            go.Bar(
                                x=self.bivariate_data[self.bivariate_data.feature == value]['index'].tolist(),
                                y=self.bivariate_data.loc[self.bivariate_data['feature'].isin([value]), col_num],
                                texttemplate=text_template,
                                textposition='auto',
                                
                            )
                        ]
                    )
                    fig.update_traces(marker_color=self.colors['histogram'][2])
                fig.update_traces(marker_line_color=self.colors['histogram'][2],
                                  marker_line_width=0.5,
                                  opacity=0.8)
                fig.update_layout(
                    template='plotly_white',
                    font_family='serif',
                    title=dict(text=f'{value}',x=0.53,y=0.95,font=dict(color=self.colors['graph_title'],size=20)),
                    xaxis_title_text='',
                    xaxis=dict(tickvals=self.feature_value, categoryorder='category ascending'),
                    yaxis_title_text=y_text,
                    barmode='stack',
                    paper_bgcolor=self.colors['background_color'],
                    plot_bgcolor=self.colors['background_color'],
                    legend_title_text='',
                    legend=dict(x=1,y=1.02,yanchor="bottom",xanchor="right",orientation="h",
                                bordercolor=self.colors['legend_border'],borderwidth=0,tracegroupgap=5),
                    bargap=0.3,
                )

                return [fig, style, style_bin, sec_bool_switch]


            else:
                if target_include:
#                     dd = round(self.df.groupby(value)[self.target].value_counts(
#                         normalize=False).unstack(self.target),2).reset_index()
#                     dd['feature'] = value
#                     dd.rename(columns = {value:'feature_value'}, inplace = True)
                    dd = self.bivariate_numerical_data[self.bivariate_numerical_data['feature'].isin([value])]
                    ddf = dd.iloc[:,2:]
                    self.dd = ddf.melt(id_vars=['feature','feature_value'])
                    fig = px.histogram(self.dd,x='feature_value',y='value',color='variable', template='plotly_white',
                                          marginal='box',opacity=0.7,nbins=int(bin_req),
                                          color_discrete_sequence=self.colors['histogram'],
                                          barmode='group',histfunc='sum')

                else:

                    dd = self.bivariate_numerical_data[self.bivariate_numerical_data['feature'].isin([value])]
                    dd = dd.iloc[:,:2]
                    list_of_data = []
                    for index, row in dd.iterrows():
                        [list_of_data.append(row['index']) for i in range(row['count'])]
                    fig = px.histogram(x=list_of_data,
                                       template='plotly_white',
                                       marginal='box',opacity=0.7,nbins=int(bin_req),
                                       color_discrete_sequence=[self.colors['histogram'][0]],
                                       histfunc='sum')

                xaxistitle=" "
                yaxistitle='Count'
                plottitle=f'{value} Distribution'


                fig.update_layout(
                    font_family='monospace',
                    title=dict(text=plottitle,x=0.53,y=0.95,
                               font=dict(color=self.colors['graph_title'],size=20)),
                    xaxis_title_text=xaxistitle,
                    yaxis_title_text=yaxistitle,
                    legend_title_text='',
                    paper_bgcolor=self.colors['background_color'],
                    plot_bgcolor=self.colors['background_color'],
                    legend=dict(x=1,y=0.96,bordercolor=self.colors['legend_border'],borderwidth=0,tracegroupgap=5),
                    bargap=0.3,
                )
                style={'display':'none'}
                style_bin={'position':'absolute','z-index': 2 ,'width':'80px'}
                return [fig, style, style_bin,sec_bool_switch]
 



In [400]:
pp = webiste()

title = 'AmExpert 2021 – Machine Learning Hackathon'

# introduction
para = [
    {
        'title':'Problem Statement'
    },
    {
        'T1':'XYZ Bank is a mid-sized private bank that includes a variety of banking products, such as savings accounts, \
        current accounts, investment products, credit products, and home loans.',
    },
    {
        'T2':'The Bank wants to',
        'B1':'predict the next set of products',
        'T3':'for a set of customers to optimize their marketing and communication campaigns.'
    },
    {
       'T1':'Here, our task is to',
       'B1':'predict the next set of products (upto 3 products)',
       'T2':'for a set of customers (Test data) based on their demographics and current product holdings.' 
    }
]

image_detail = [f"assets/{assets_target}/intro.jpg", 'Product Prediction']

# data
d_para = [
    {
        'T1':'There are 22 Product - P00, P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13,P14, P15, P16, \
        P17, P18, P19, P20, P21',
    },
    {
        'B1':'Multi-Label Classification Problem - \n',
        'T1':'A classification Problem in which we have to predict more than one class label'      
    },
    {
        'T1':'Typically, a classification task involve predicting single value. Alternately, it involve prediction \
        likelihood across two or more class label.  In these cases, the classes are mutually exclusive, meaning the \
        classification task assumes that the input belongs to one class only.'
    },
    
    {
        'T1':'Some classification tasks require predicting more than one class label. This means that class labels \
        are not mutually exclusive. These tasks are referred to as', 
        'B1':'multiple label classification, or multi-label classification',
        'T2':'for short.'
    }
]

#plot
Pl_variable_list = feature_description['Feature_name'].tolist()

pp.layout('multi-classification',train_data, feature_description,classification_target='Product_Holding_B2', title=title,
          In_para=para, In_image=image_detail, In_true=True,
          Da_true=True, Da_para=d_para,
          Pl_true=True, Pl_variable_list = Pl_variable_list,
          Pl_no_graph=['Customer_ID','Product_Holding_B1','Product_Holding_B2'],
          donut_title = 'How many Candidates are Looking for change', donut_center_text = "Looking or Not",
          not_to_use=['Customer_ID', 'Product_Holding_B1'],
         )

pp.callbacks()
pp.run_server(8058)

0 Introduction Intro
1 Data data
2 Graph graph
2 uni_graph uni_graph
3 bi_graph bi_graph
Dash app running on http://127.0.0.1:8058/


In [126]:
pp.bivariate_data

Unnamed: 0,index,count,percentage,['P00'],"['P1', 'P10', 'P12', 'P16']","['P1', 'P10', 'P12']","['P1', 'P10', 'P13']","['P1', 'P10']","['P1', 'P11']","['P1', 'P12', 'P13']",...,"['P9', 'P10', 'P12']","['P9', 'P10']","['P9', 'P11']","['P9', 'P12', 'P13']","['P9', 'P12', 'P16']","['P9', 'P12']","['P9', 'P13']","['P9', 'P16']",['P9'],feature
0,Male,25890,68.59,15.45,0.00,0.04,0.01,0.17,0.01,0.03,...,6.0,21.0,2.0,3.0,,228.0,,2.0,440.0,Gender
1,Female,11858,31.41,16.09,,0.03,,0.20,0.02,0.05,...,2.0,10.0,3.0,1.0,1.0,97.0,2.0,4.0,206.0,Gender
2,CC247596,1,0.00,,,,,,,,...,,,,,,,,,,Is_Active
3,CC311929,1,0.00,,,,,,,,...,,,,,,,,,1.0,Is_Active
4,CC393667,1,0.00,100.00,,,,,,,...,,,,,,,,,,Is_Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37750,C1,18941,50.18,15.14,0.01,0.02,0.01,0.21,0.01,0.04,...,5.0,17.0,3.0,2.0,,168.0,2.0,2.0,296.0,City_Category
37751,C2,18807,49.82,16.17,,0.05,0.01,0.16,0.01,0.04,...,3.0,14.0,2.0,2.0,1.0,157.0,,4.0,350.0,City_Category
37752,S3,17865,47.33,14.23,0.01,0.06,0.01,0.23,0.01,0.07,...,6.0,20.0,2.0,4.0,1.0,161.0,1.0,5.0,294.0,Customer_Category
37753,S2,13265,35.14,16.00,,0.03,0.01,0.16,0.02,0.02,...,2.0,9.0,,,,106.0,,1.0,220.0,Customer_Category


In [12]:
train_data['count'] = 1
# Orders

ed_order = ['Primary School','High School','Graduate','Masters','Phd']
enroll_order = ['No Enrollment','Part time course','Full time course']
disc_order = ['STEM','Unknown','Humanities','Other','Business Degree','Arts','No Major']
exp_yrs_order = ['<1','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','>20']
exp_yrs_order_2 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
size_order = ['<10', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+']
job_order = ['Never', '1', '2', '3', '4', '>4']
exp_order =['No relevant experience','Has relevant experience']
gender_order = ['Male','Female','Other']
company_order = ['Pvt Ltd','Unknown','Funded Startup','Public Sector','Early Stage Startup','NGO','Other']

train_data.fillna(0, inplace=True)
pd.pivot_table(data=train_data, index=['gender'], columns=['company_size'], values='count',aggfunc=np.sum).loc[gender_order, size_order]

company_size,<10,50-99,100-500,500-999,1000-4999,5000-9999,10000+
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Male,917,2117,1891,612,930,404,1424
Female,73,214,166,63,107,40,138
Other,11,31,13,5,13,5,22


In [17]:
ct_gen_size = pd.crosstab(train_data['company_size'],train_data['experience'], normalize='index').loc[size_order,exp_yrs_order]
ct_gen_size*100

experience,<1,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,>20
company_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
<10,2.446483,3.669725,6.727829,7.33945,5.810398,8.027523,5.733945,5.963303,3.746177,5.351682,...,1.987768,2.06422,4.281346,3.746177,2.675841,1.834862,0.688073,1.376147,0.458716,16.131498
50-99,1.946156,2.205644,4.346416,6.811547,7.200778,8.076549,7.26565,5.319494,4.930263,5.611417,...,3.211158,2.205644,3.795005,3.924749,3.048978,2.043464,1.524489,1.362309,0.454103,15.115148
100-500,1.478024,1.205757,4.2007,5.600933,6.30105,6.378841,5.484247,5.523143,5.289771,6.378841,...,3.267211,2.294827,3.306107,4.628549,2.683781,2.10035,2.10035,1.944769,0.855698,18.786464
500-999,1.824401,1.824401,3.534778,4.446978,6.157355,6.157355,6.841505,6.49943,3.876853,5.701254,...,3.990878,3.306727,2.964652,5.017104,3.762828,1.824401,1.710376,2.280502,0.798176,19.270239
1000-4999,1.581325,1.204819,2.861446,4.292169,5.271084,5.496988,5.496988,4.518072,4.141566,4.819277,...,2.786145,2.635542,4.668675,5.271084,3.313253,2.409639,1.430723,2.409639,1.355422,23.644578
5000-9999,0.17762,2.486679,4.262877,2.664298,5.328597,6.749556,5.683837,6.927176,4.973357,4.973357,...,3.730018,2.841918,3.552398,3.374778,3.730018,2.309059,2.131439,1.776199,1.598579,20.426288
10000+,1.386825,1.584943,3.021298,4.754829,5.250124,6.785537,7.429421,5.398712,4.259534,5.497771,...,2.426944,2.575532,3.219416,4.110946,3.120357,2.278356,1.83259,2.030708,1.089648,21.297672


In [16]:
train_data['count'] = 1
aug_train = train_data.copy()
pv_gen_size = pd.pivot_table(aug_train, values='count',index=['gender'],columns=['company_size'],aggfunc=np.sum).loc[gender_order, size_order]
ct_gen_size = pd.crosstab(aug_train['company_size'],aug_train['experience'], normalize='index').loc[size_order,exp_yrs_order_2]


KeyError: "None of [Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,\n            20],\n           dtype='int64', name='experience')] are in the [columns]"

In [19]:
pv_gen_size = pd.pivot_table(aug_train, values='count',index=['gender'],columns=['company_size'],aggfunc=np.sum).loc[gender_order, size_order]
pd.crosstab(aug_train['company_size'],aug_train['experience'], normalize='index')

experience,0,1,10,11,12,13,14,15,16,17,...,20,3,4,5,6,7,8,9,<1,>20
company_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.003873,0.047322,0.035029,0.025935,0.016504,0.013304,0.018862,0.021893,0.017346,0.012967,...,0.006905,0.100876,0.094813,0.081341,0.058774,0.052375,0.034018,0.039912,0.049006,0.154429
10/49,0.003399,0.029232,0.062542,0.041468,0.030591,0.023114,0.029232,0.03467,0.031271,0.011557,...,0.006118,0.066621,0.081577,0.086336,0.076139,0.046227,0.040789,0.056424,0.023793,0.123046
100-500,0.003112,0.012058,0.060677,0.038117,0.032672,0.022948,0.033061,0.046285,0.026838,0.021004,...,0.008557,0.056009,0.063011,0.063788,0.054842,0.055231,0.052898,0.063788,0.01478,0.187865
1000-4999,0.003765,0.012048,0.063253,0.036898,0.027861,0.026355,0.046687,0.052711,0.033133,0.024096,...,0.013554,0.042922,0.052711,0.05497,0.05497,0.045181,0.041416,0.048193,0.015813,0.236446
10000+,0.003962,0.015849,0.064388,0.038138,0.024269,0.025755,0.032194,0.041109,0.031204,0.022784,...,0.010896,0.047548,0.052501,0.067855,0.074294,0.053987,0.042595,0.054978,0.013868,0.212977
50-99,0.001622,0.022056,0.055141,0.039247,0.032112,0.022056,0.03795,0.039247,0.03049,0.020435,...,0.004541,0.068115,0.072008,0.080765,0.072657,0.053195,0.049303,0.056114,0.019462,0.151151
500-999,0.003421,0.018244,0.04447,0.034208,0.039909,0.033067,0.029647,0.050171,0.037628,0.018244,...,0.007982,0.04447,0.061574,0.061574,0.068415,0.064994,0.038769,0.057013,0.018244,0.192702
5000-9999,0.007105,0.024867,0.055062,0.040853,0.0373,0.028419,0.035524,0.033748,0.0373,0.023091,...,0.015986,0.026643,0.053286,0.067496,0.056838,0.069272,0.049734,0.049734,0.001776,0.204263
<10,0.003058,0.036697,0.057339,0.038991,0.019878,0.020642,0.042813,0.037462,0.026758,0.018349,...,0.004587,0.073394,0.058104,0.080275,0.057339,0.059633,0.037462,0.053517,0.024465,0.161315


# check

In [33]:
d = train_data.copy()
d.columns

Index(['Customer_ID', 'Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category',
       'Customer_Category', 'Product_Holding_B1', 'Product_Holding_B2'],
      dtype='object')

In [34]:
d.columns = ['Customer_ID', 'Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category',
       'Customer_Category', 'B1', 'B2']

In [35]:
d.Customer_ID = d.Customer_ID.apply(lambda x : x[2:]).astype(int)

In [39]:
d["Customer_ID_bucket"] = d.Customer_ID.apply(lambda x : round(x/1000))
d.head(1)

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,B1,B2,Customer_ID_bucket
0,264719,Male,41,14,0,C1,S3,['P16'],['P8'],265


In [49]:
d.B1 = d.B1.apply(lambda x: re.findall('P[0-9]*', x))
d.B2 = d.B2.apply(lambda x: re.findall('P[0-9]*', x))

In [70]:
customer_Fav = d.B1.explode().to_frame() \
    .merge(d, left_index=True, right_index=True) \
    .groupby('Customer_ID_bucket')['B1_x'].apply(lambda x: x.value_counts()).reset_index() \
    .groupby('Customer_ID_bucket')['level_1'].apply(lambda x: list(x)[:5]).reset_index()
customer_Fav.head(2)

Unnamed: 0,Customer_ID_bucket,level_1
0,199,"[P13, P12, P17, P21, P20]"
1,200,"[P13, P12, P17, P21, P20]"


In [75]:
d = d.merge(customer_Fav, on='Customer_ID_bucket')
d = d.rename(columns={'level_1':'popular_product'})
d.head(2)

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,B1,B2,Customer_ID_bucket,popular_product
0,264719,Male,41,14,0,C1,S3,[P16],[P8],265,"[P16, P13, P12, P21, P17]"
1,264929,Male,35,15,0,C2,S1,[P16],[P8],265,"[P16, P13, P12, P21, P17]"


In [77]:
d['Gender_encoding']=d.Gender.map({'Male':1, 'Female':0})

In [8]:
import re
list_of_product = []
for i in d.B1.iloc[:]:
    list_of_product.extend(re.findall('P[0-9]*',i))
len(set(list_of_product))

22

In [31]:
# alternative is explode and merge

dd = d.B1.iloc[:].apply(pd.Series) \
    .merge(d.iloc[:], left_index=True, right_index=True) \
    .drop(['B1'], axis=1) \
    .melt(id_vars=['Customer_ID','Gender','Age','Vintage','Is_Active','City_Category', 'Customer_Category', 'B2'],
          value_name = "Product") \
    .drop(['variable'], axis=1) \
    .dropna()
dd.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,B2,Product
0,CC264719,Male,41,14,0,C1,S3,[P8],P16
1,CC209679,Female,47,14,1,C1,S2,[P3],P13
2,CC319633,Female,59,14,0,C2,S2,[P00],P11
3,CC231413,Female,32,16,0,C1,S2,[P6],P8
4,CC259633,Male,30,15,0,C2,S3,"[P8, P12]",P16


In [94]:
# dd.groupby('Product').size().reset_index()

In [244]:
dd

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,B2,Product
0,CC264719,Male,41,14,0,C1,S3,[P8],P16
1,CC209679,Female,47,14,1,C1,S2,[P3],P13
2,CC319633,Female,59,14,0,C2,S2,[P00],P11
3,CC231413,Female,32,16,0,C1,S2,[P6],P8
4,CC259633,Male,30,15,0,C2,S3,"[P8, P12]",P16
...,...,...,...,...,...,...,...,...,...
257992,CC209167,Male,26,16,1,C1,S3,"[P1, P3, P4, P6, P7, P8]",P21
259332,CC207753,Male,40,17,0,C1,S3,"[P1, P3, P4, P6, P7, P8]",P21
270321,CC209004,Male,31,19,1,C1,S2,"[P1, P3, P4, P5, P6, P7, P8]",P21
285128,CC203587,Male,38,27,1,C2,S3,"[P3, P4, P5, P7, P8, P9, P10]",P21


<!-- Metrics for Multiple classification Problem -->


Metrics for multiple label classification problem


In [416]:
# Precision at K(P@K)
def pk(y_true, y_pred, k):
    
    if k == 0:
        return 0
    
    # interested in top k 
    y_pred = y_pred[:k]
    pred_set = set(y_pred)
    true_set = set(y_true)
    common_value = pred_set.intersection(true_set)
    return len(common_value) / len(y_pred[:k])

def apk(y_true, y_pred, k):
    pk_values = []
    # loop over all k. from 1 to k + 1
    for i in range(1, k + 1):
        # calculate p@i and append to list
        pk_values.append(pk(y_true, y_pred, i))
    # if we have no values in the list, return 0
    if len(pk_values) == 0:
        return 0
    # else, we return the sum of list over length of list
    return sum(pk_values) / len(y_true)

def mapk(y_true, y_pred, k):
    # initialize empty list for apk values
    apk_values = []
    # loop over all samples
    for i in range(len(y_true)):
    # store apk values for every sample
        apk_values.append(
            apk2(y_true[i], y_pred[i], k=k)
        )
    # return mean of apk values list
    return sum(apk_values) / len(apk_values)

import numpy as np
def apk2(actual, predicted, k=10):
    """
    order matters and we weigh the predictions.
    """

    if len(predicted)>k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i,p in enumerate(predicted):
#         print(i, "p-",p)
        x = predicted[:i]
#         print(x)
        if p in actual and p not in x:
            
            num_hits += 1.0
            score += num_hits / (i+1.0)
#             print(num_hits,'score - ', score)
#         else:
#             print('no')
#     print(actual)
#     if not actual:
#         return 0.0
    return score / min(len(actual), k)

In [417]:
# example of a multi-label classification task
from sklearn.datasets import make_multilabel_classification
# define dataset
X, y = make_multilabel_classification(n_samples=1000, n_features=10, n_classes=3, n_labels=2, random_state=1)
# summarize dataset shape
print(X.shape, y.shape)
# summarize first few examples
for i in range(10):
    print(X[i], y[i])

KeyboardInterrupt: 

In [428]:
# mlp for multi-label classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score

# get the dataset
def get_dataset():
    X, y = make_multilabel_classification(n_samples=1000, n_features=10, n_classes=3, n_labels=2, random_state=1)
    return X, y

# get the model
def get_model(n_inputs, n_outputs):
    model = Sequential()
    model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

# evaluate a model using repeated k-fold cross-validation
def evaluate_model(X, y):
    results = list()
    n_inputs, n_outputs = X.shape[1], y.shape[1]
    # define evaluation procedure
    cv = RepeatedKFold(n_splits=2, n_repeats=2, random_state=1)
    # enumerate folds
    for train_ix, test_ix in cv.split(X):
        # prepare data
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        # define model
        model = get_model(n_inputs, n_outputs)
        # fit model
        model.fit(X_train, y_train, verbose=0, epochs=100)
        yhat = model.predict(X_test)
        yhat = yhat.round()
        print(yhat)
        acc = mapk(y_test, yhat, 3)
        # store result
        print('>%.3f' % acc)
        results.append(acc)
    return results

# load dataset
# X, y = get_dataset()
# evaluate model
results = evaluate_model(X[:10], y[:10])
# summarize performance
print('Accuracy: %.3f (%.3f)' % (mean(results), std(results)))

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 0. 1.]
 [1. 0. 1.]
 [1. 0. 0.]]
>0.467
[[1. 1. 0.]
 [1. 1. 0.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 0.]]
>0.222
[[1. 1. 0.]
 [1. 1. 0.]
 [1. 1. 0.]
 [1. 1. 0.]
 [0. 0. 0.]]
>0.267
[[1. 0. 1.]
 [1. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 1. 1.]]
>0.500
Accuracy: 0.364 (0.121)


In [450]:
d.B1.apply(lambda x: )

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [455]:
def populate_data(x):
    print(x, x.index)
    for i in x:
        d.loc[x.index(), i] = 1
d.B1.apply(lambda x: populate_data(x))

['P16'] <built-in method index of list object at 0x00000229AD8041C0>


TypeError: index expected at least 1 argument, got 0

In [466]:

for index, row in d.iloc[:20].iterrows():
    for i in row['B1']:
        d.loc[index, i] = 1

In [467]:
d

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,B1,B2,P16,...,P18,P15,P6,P9,P7,P3,P5,P4,P1,P14
0,CC264719,Male,41,14,0,C1,S3,[P16],[P8],1.0,...,,,,,,,,,,
1,CC209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3],,...,,,,,,,,,,
2,CC319633,Female,59,14,0,C2,S2,[P11],[P00],,...,,,,,,,,,,
3,CC231413,Female,32,16,0,C1,S2,"[P8, P13]",[P6],,...,,,,,,,,,,
4,CC259633,Male,30,15,0,C2,S3,"[P16, P17, P21]","[P8, P12]",1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37743,CC314217,Male,28,24,1,C1,S2,[P16],[P9],,...,,,,,,,,,,
37744,CC388747,Male,36,11,0,C2,S2,"[P16, P17, P21]","[P5, P12]",,...,,,,,,,,,,
37745,CC318056,Male,44,26,1,C2,S2,[P14],[P13],,...,,,,,,,,,,
37746,CC373551,Female,56,15,1,C1,S2,"[P12, P13]",[P1],,...,,,,,,,,,,
