In [6]:
import pandas as pd
import warnings
# Hide FutureWarnings whose message starts with the text below.
# NOTE(review): this looks like the warning pandas raises for chained in-place
# assignment of the form `df[col].method(..., inplace=True)`. Prefer fixing
# such call sites over hiding the warning - confirm before relying on it.
warnings.filterwarnings(
    "ignore",
    message="A value is trying to be set on a copy of a DataFrame",
    category=FutureWarning
)


In [2]:
# Load the input data: the full set of case records and the gold-labelled cases
records=pd.read_csv("data/InterviewTask/AI/records.csv")
gold_cases=pd.read_csv("data/InterviewTask/AI/gold_cases.csv")

In [3]:
# Number of rows in the raw records frame
records.shape[0]

20000

## Claim Value Band Based on Client Segment

In [29]:
# case_id values of the gold cases whose expected priority is P0
gold_cases_p0 = gold_cases.loc[gold_cases['expected_priority'] == "P0", 'case_id'].values


In [12]:
# Export the records whose case_id appears in the gold set (no index column)
records[records['case_id'].isin(gold_cases['case_id'])].to_csv(
    "data/InterviewTask/AI/gold_records.csv",
    index=False,
)

In [34]:
# case_id values of the gold cases whose expected priority is P1
gold_cases_p1 = gold_cases.loc[gold_cases['expected_priority'] == "P1", 'case_id'].values

In [21]:
# records.loc[records['case_id'].isin( gold_cases_p1)]

In [36]:
# case_id values of the gold cases whose expected priority is P2
gold_cases_p2 = gold_cases.loc[gold_cases['expected_priority'] == "P2", 'case_id'].values

In [20]:
# records.loc[records['case_id'].isin( gold_cases_p2)]


In [38]:
# case_id values of the gold cases whose expected priority is P3
gold_cases_p3 = gold_cases.loc[gold_cases['expected_priority'] == "P3", 'case_id'].values

In [19]:
# records.loc[records['case_id'].isin( gold_cases_p3)]

# Process the data

In [4]:
# Show the column labels of the raw records frame
records.columns

Index(['case_id', 'received_at', 'client_segment', 'jurisdiction',
       'service_line', 'claim_value_band', 'attachments_present',
       'free_text_summary', 'handler_notes', 'historical_outcome'],
      dtype='object')

In [5]:
# Work on the rows labelled 0 through 1500. `.loc` slicing includes the end
# label, so this keeps at most 1501 rows (not 2000).
records_copy = records.loc[0:1500].copy()

# Drop the timestamp and free-text columns. historical_outcome is kept and
# has its nulls filled below.
records_copy = records_copy.drop(columns=['received_at', 'free_text_summary', 'handler_notes'])

# Drop the cases without any case ids
records_copy = records_copy.drop(index=records_copy.index[records_copy['case_id'].isna()])

# Fill nulls per column with one frame-level call. The previous
# `records_copy[col].fillna(..., inplace=True)` form is chained in-place
# assignment: pandas warns that it stops modifying the frame once
# Copy-on-Write is the default, so the fills would silently be lost.
records_copy = records_copy.fillna({
    'service_line': "No data Available",
    'client_segment': "No data Available",
    'jurisdiction': "No data Available",
    'claim_value_band': "Unknown",
    'attachments_present': False,
    'historical_outcome': "Unknown",
})

def covert_to_yesno(value):
    """Return "yes" for a truthy value and "No" for a falsy one.

    The two labels differ in capitalisation; they are returned exactly as
    written here.
    """
    return "yes" if value else "No"
# Render the attachment flag as the "yes" / "No" labels
records_copy['attachments_present'] = (
    records_copy['attachments_present'].apply(func=covert_to_yesno)
)


In [6]:
import os
# Expected columns of the LLM signal extracts, in display order.
COLUMNS = ['case_id','severe_legal_or_regulatory_risk','business_critical_impact','potential_fraud','conflicting_information','complex_incident_details','policy_interpretation_issues','legal_disputes','jurisdictional_complexity','coverage_terms_unclear','exclusions_may_apply','new_or_unusual_claim_type','no_legal_or_fraud_concerns','unclear_incident_description','claim_invalid_or_fraudulent','required_conditions_not_met','risk_summary']

# Read the three extracts first and concatenate once. Growing a frame with
# repeated `pd.concat` copies the accumulated rows on every pass, and seeding
# it with an empty DataFrame relies on concat behaviour pandas has deprecated.
signal_frames = [pd.read_csv(f"data/llm_out{i}.0.csv") for i in range(1, 4)]
df_all_signals = pd.concat(signal_frames, ignore_index=True)

# Keep the expected columns first (created as all-NaN when an extract lacks
# them), followed by any additional columns found in the files.
extra_columns = [col for col in df_all_signals.columns if col not in COLUMNS]
df_all_signals = df_all_signals.reindex(columns=COLUMNS + extra_columns)

In [7]:
# Left-join the LLM signal columns onto the cleaned records by case_id,
# keeping every cleaned record even when it has no signal row
processed_df = pd.merge(
    left=records_copy,
    right=df_all_signals,
    how="left",
    on="case_id",
)

In [8]:
from src.triage.predict import ClaimTriageEvaluator
# Create the triage evaluator with its default constructor arguments
# (the class comes from src.triage.predict and is not shown in this notebook)
claim_calculator=ClaimTriageEvaluator()

In [None]:
# Try the evaluator on a single case: the merged row labelled 0, as a plain dict
row=processed_df.loc[0].to_dict()
# The evaluator methods are project code not shown here; only the call order
# and the arguments passed are visible in this cell.
risk_score=claim_calculator.calculate_risk_score(row)
# Priority takes only the risk score; action takes both the row and the score
priority=claim_calculator.determine_priority(risk_score)
action=claim_calculator.determine_action(row,risk_score)
# Confidence and the extracted-signals value are computed from the row alone
confidence=claim_calculator.calculate_confidence(row)
extracted_signals=claim_calculator.build_extracted_signals(row)

In [10]:
# Display the five values computed for the sample row
[risk_score,priority,action,confidence,extracted_signals]

[0.64,
 'P1',
 'Immediate escalation',
 75,
 '{"jurisdiction": "UK", "service_line": "Legal", "claim_value_band": "<50k", "legal_risk": "Yes", "fraud_risk": "No"}']

In [17]:
# Show the merged row labelled 0 (record fields followed by the LLM signal columns)
processed_df.loc[0]

case_id                                                                      C-00001
client_segment                                                                   SMB
jurisdiction                                                                      UK
service_line                                                                   Legal
claim_value_band                                                                <50k
attachments_present                                                              yes
historical_outcome                                                           Settled
severe_legal_or_regulatory_risk                                                  Yes
business_critical_impact                                                          No
potential_fraud                                                                   No
conflicting_information                                                           No
complex_incident_details                                         

In [11]:
# Show the column labels of the combined LLM signal frame
df_all_signals.columns

Index(['case_id', 'severe_legal_or_regulatory_risk',
       'business_critical_impact', 'potential_fraud',
       'conflicting_information', 'complex_incident_details',
       'policy_interpretation_issues', 'legal_disputes',
       'jurisdictional_complexity', 'coverage_terms_unclear',
       'exclusions_may_apply', 'new_or_unusual_claim_type',
       'no_legal_or_fraud_concerns', 'unclear_incident_description',
       'claim_invalid_or_fraudulent', 'required_conditions_not_met',
       'risk_summary'],
      dtype='object')

In [3]:
from src.triage.validate import evaluation

In [4]:
# Run the project's evaluation on the gold cases file
# (implemented in src.triage.validate; its behaviour is not visible here)
evaluation("data/InterviewTask/AI/gold_cases.csv")

gold dataset and prediction dataset merged successfully


In [7]:
# Reload the gold labels, and read back the gold subset of records from the
# path this notebook writes with to_csv in an earlier cell
gold_cases=pd.read_csv("data/InterviewTask/AI/gold_cases.csv")
gold_records=pd.read_csv("data/InterviewTask/AI/gold_records.csv")

In [8]:
# Inner-join gold labels with their record fields; only case_ids present in
# both frames are kept
merged_df = pd.merge(left=gold_cases, right=gold_records, how="inner", on="case_id")

In [9]:
# Write the merged gold frame without the positional index, matching the
# gold_records.csv export; otherwise the file gains an unnamed index column
# that comes back as "Unnamed: 0" when it is read again.
merged_df.to_csv("data/InterviewTask/AI/gold_records_merged.csv", index=False)

In [None]:
from src.triage.injest import preprocess_getstructrureddata
# Run the project's preprocessing step on the gold records file
# (implemented in src.triage.injest; its behaviour is not visible here)
preprocess_getstructrureddata("data/InterviewTask/AI/gold_records.csv")