In [8]:
import pandas as pd
# Show the full contents of every cell when a DataFrame is rendered (no column-width truncation).
pd.options.display.max_colwidth = None

In [9]:
def load_data(path_csv) -> pd.DataFrame:
    """Read a CSV file and return it with duplicate and incomplete rows removed.

    Parameters
    ----------
    path_csv : str, path object or file-like object
        Location of the CSV data; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The cleaned frame. Surviving rows keep their original index labels,
        so the index can contain gaps when rows were dropped.

    Notes
    -----
    Prints a ``DataFrame.info()`` summary of the cleaned frame as a side effect.
    """
    # Chain the cleaning steps rather than mutating with ``inplace=True``:
    # same result, but each step yields a new frame and reads top to bottom.
    df = (
        pd.read_csv(path_csv)
        .drop_duplicates()  # remove rows that are exact duplicates of an earlier row
        .dropna()  # remove rows that contain at least one missing value
    )
    df.info()
    return df

In [None]:
# Load and clean the insurance-claims CSV, then preview the first five rows.
path = "fraud_oracle.csv"
df = load_data(path)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


## Feature Definitions

- **Month**: Month in which the accident occurred.
- **WeekOfMonth**: Week of the month in which the accident occurred.
- **DayOfWeek**: Day of the week on which the accident occurred.
- **Make**: Vehicle manufacturer.
- **AccidentArea**: Accident area (urban/rural).
- **DayOfWeekClaimed**: Day of the week the claim was processed.
- **MonthClaimed**: Month the claim was processed.
- **WeekOfMonthClaimed**: Week of the month the claim was processed.
- **Sex**: Policyholder's gender.
- **MaritalStatus**: Policyholder's marital status.
- **Age**: Policyholder's age.
- **Fault**: Party at fault for the accident (`Policy Holder` or `Third Party`).
- **PolicyType**: Type of insurance policy.
- **VehicleCategory**: Vehicle category (e.g., sedan, SUV).
- **VehiclePrice**: Price bracket of the vehicle (stored as text categories rather than as a number).
- **FraudFound_P**: Whether fraud was found on the claim (1 = yes, 0 = no).
- **PolicyNumber**: Insurance policy identifier.
- **RepNumber**: Insurance representative identifier.
- **Deductible**: Out-of-pocket amount before insurance coverage.
- **DriverRating**: Driver's rating.
- **Days_Policy_Accident**: Days from policy issue to accident.
- **Days_Policy_Claim**: Days from policy issue to claim.
- **PastNumberOfClaims**: Previous claims by the policyholder.
- **AgeOfVehicle**: Age of the vehicle, as a category (e.g., `3 years`, `more than 7`).
- **AgeOfPolicyHolder**: Age bracket of the policyholder (e.g., `26 to 30`).
- **PoliceReportFiled**: Police report filed (yes/no).
- **WitnessPresent**: Witness present (yes/no).
- **AgentType**: Type of insurance agent (internal/external).
- **NumberOfSuppliments**: Number of supplementary documents/claims.
- **AddressChange_Claim**: How recently the policyholder's address changed relative to the claim, as a category (e.g., `no change`, `1 year`).
- **NumberOfCars**: Number of cars insured, as a category (e.g., `1 vehicle`, `3 to 4`).
- **Year**: Year of the claim.
- **BasePolicy**: Base policy type (e.g., Liability, Collision, All Perils).

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): Age has a minimum of 0 in the output below — possibly a placeholder
# for a missing value; confirm before using it as a feature.
df.describe()

Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Year
count,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0
mean,2.788586,2.693969,39.855707,0.059857,7710.5,8.483268,407.70428,2.487808,1994.866472
std,1.287585,1.259115,13.492377,0.23723,4451.514911,4.599948,43.950998,1.119453,0.803313
min,1.0,1.0,0.0,0.0,1.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,0.0,3855.75,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,0.0,7710.5,8.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,0.0,11565.25,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,1.0,15420.0,16.0,700.0,4.0,1996.0


In [21]:
# Keep only the columns that have fewer than three distinct values (binary-like features).
df.loc[:, df.nunique().lt(3)]

Unnamed: 0,AccidentArea,Sex,Fault,FraudFound_P,PoliceReportFiled,WitnessPresent,AgentType
0,Urban,Female,Policy Holder,0,No,No,External
1,Urban,Male,Policy Holder,0,Yes,No,External
2,Urban,Male,Policy Holder,0,No,No,External
3,Rural,Male,Third Party,0,Yes,No,External
4,Urban,Female,Third Party,0,No,No,External
...,...,...,...,...,...,...,...
15415,Urban,Male,Policy Holder,1,No,No,External
15416,Urban,Male,Policy Holder,0,No,No,External
15417,Rural,Male,Policy Holder,1,No,No,External
15418,Urban,Female,Third Party,0,No,No,External
