In [242]:
# Importing library
import pandas as pd

# Set maximum columns width
pd.options.display.max_colwidth = 200

# Load the dataset
URL = 'https://drive.google.com/uc?id=1Cw74EdOP8ewFf08H7X5QIDi9VXQmwZ-F'
raw_data = pd.read_csv(URL, engine='pyarrow', parse_dates=['DateOfBirth', 'claim_date'])

# Copy the dataset
df = raw_data.copy()

### QUESTION #1 

Perform a data audit for the datatypes and find out if there are any mismatch within the current datatypes of the columns and their business significance.

In [243]:
print(f'Jumlah baris dan kolom: {df.shape}')

pd.DataFrame({
    'Name': df.columns.values,
    'Type': df.dtypes.values,
    'N/A (%)': df.isna().mean().values * 100,
    'Unique': df.nunique().values,
    'Sample': [df[col].unique() for col in df.columns]
})

Jumlah baris dan kolom: (1085, 17)


Unnamed: 0,Name,Type,N/A (%),Unique,Sample
0,CUST_ID,int64,0.0,1078,"[21868593, 75740424, 30308357, 47830476, 19269962, 21831191, 18401412, 73486606, 32813689, 33507197, 99103685, 51583214, 35875366, 21240703, 18288638, 63240241, 1407979, 63916778, 17004138, 966776..."
1,gender,object,0.0,2,"[Female, Male]"
2,DateOfBirth,datetime64[ns],0.0,1078,"[1979-01-12T00:00:00.000000000, 2070-01-13T00:00:00.000000000, 1984-03-11T00:00:00.000000000, 1986-05-01T00:00:00.000000000, 1977-05-13T00:00:00.000000000, 1977-05-17T00:00:00.000000000, 1999-03-0..."
3,State,object,0.0,50,"[VT, ME, TN, MA, NV, NH, AR, AK, ID, RI, KY, MD, MI, IL, WA, SC, OK, KS, CT, IN, UT, AZ, IA, NM, WI, WY, NY, LA, DE, GA, OH, SD, WV, NJ, TX, CA, NC, MT, VA, OR, AL, HI, ND, MN, MO, CO, FL, NE, MS,..."
4,Contact,object,0.0,1078,"[789-916-8172, 265-543-1264, 798-631-4758, 413-187-7945, 956-871-8691, 419-712-8513, 752-398-2914, 256-968-9172, 142-324-7481, 165-519-4583, 764-439-9856, 743-486-5264, 954-964-1746, 473-561-2675,..."
5,Segment,object,0.0,3,"[Platinum, Silver, Gold]"
6,claim_id,int64,0.0,1085,"[54004764, 33985796, 53522022, 63017412, 13015401, 91609100, 22890252, 39219616, 24050443, 91475471, 12878692, 27026412, 43908336, 63246959, 74165873, 28564401, 72738047, 53780662, 67257404, 35489..."
7,customer_id,int64,0.0,1078,"[21868593, 75740424, 30308357, 47830476, 19269962, 21831191, 18401412, 73486606, 32813689, 33507197, 99103685, 51583214, 35875366, 21240703, 18288638, 63240241, 1407979, 63916778, 17004138, 966776..."
8,incident_cause,object,0.0,5,"[Driver error, Crime, Other driver error, Natural causes, Other causes]"
9,claim_date,datetime64[ns],0.0,100,"[2017-11-27T00:00:00.000000000, 2018-10-03T00:00:00.000000000, 2018-02-02T00:00:00.000000000, 2018-04-04T00:00:00.000000000, 2018-06-17T00:00:00.000000000, 2017-04-12T00:00:00.000000000, 2018-01-1..."


Standarize value

In [244]:
df.replace('Other driver error', 'Driver error', inplace=True)

Drop unnecessary columns

In [245]:
unwantedColumns = ['CUST_ID', 'claim_id', '_merge']

df.drop(columns=unwantedColumns, inplace=True)

### QUESTION #2

Convert the column claim_amount to numeric. Use the appropriate modules/attributes to remove the sign.

In [246]:
df['claim_amount'] = df['claim_amount'].str.replace('$' ,'', regex=True)
df['claim_amount'] = pd.to_numeric(df['claim_amount'])

### QUESTION #3 

Of all the injury claims, some of them have gone unreported with the police. Create an alert flag (1,0) for all such claims.

In [175]:
import numpy as np

df['flag'] = np.where(df.police_report == 'Unknown', 0, 1)

### QUESTION #4

Retain the most recent observation and delete any duplicated records in the data based on the customer ID column.

In [176]:
df.drop_duplicates(subset='customer_id', keep='last', inplace=True)

### QUESTION #5 

Check for missing values and impute the missing values with an appropriate value.(mean for continuous and mode for categorical)

In [255]:
df.isna().sum()

gender                  0
DateOfBirth             0
State                   0
Contact                 0
Segment                 0
customer_id             0
incident_cause          0
claim_date              0
claim_area              0
police_report           0
claim_type              0
claim_amount            0
total_policy_claims    10
fraudulent              0
dtype: int64

In [260]:
df.fillna(value={
    'total_policy_claims': df.total_policy_claims.mode()[0],
    'claim_amount': df.claim_amount.mean()},
    inplace=True
)

### QUESTION #6 

Calculate the age of customers in years. Based on the age, categorize the customers :

In [268]:
def catGroup(x):
    if x < 18:
        return 'Children'
    elif 18 <= x < 30:
        return 'Youth'
    elif 30 <= x < 60:
        return 'Adult'
    else:
        return 'Senior' 
    
df['age'] = pd.to_datetime('today').year - df['DateOfBirth'].dt.year
df['age'] = np.where(df.age < 0, df.age + 100, df.age)

df['categorize'] = df['age'].apply(catGroup)

### QUESTION #7 

Show the monthly trend of the total amount that has been claimed by the customers.

In [200]:
df.groupby(df['claim_date'].dt.month).agg(func={'claim_amount': 'sum'})

Unnamed: 0_level_0,claim_amount
claim_date,Unnamed: 1_level_1
1,1239658.5
2,1114535.5
3,1257741.5
4,1268950.5
5,1014447.5
6,1202340.0
7,1266800.5
8,873792.0
9,858183.0
10,1309625.0


### QUESTION #8

What is the total claim amount based on incident cause for all the claims that have been done at least 20 days prior to 1st of October, 2018.

In [182]:
import datetime as dt

days_before = dt.datetime.strptime('2018-10-1', '%Y-%m-%d') - dt.timedelta(days=20)
df[df.claim_date < days_before].groupby('incident_cause').agg(func={'claim_amount': 'sum'})

Unnamed: 0_level_0,claim_amount
incident_cause,Unnamed: 1_level_1
Crime,659430.0
Driver error,6280844.0
Natural causes,1200231.5
Other causes,3485619.0


### QUSETION #9 

How many adults from TX, DE, and AK claimed insurance for driver related issues and causes?

In [283]:
mask = (df.State.isin(['TX', 'DE', 'AK'])) & (df.categorize == 'Adult')

df[mask].groupby('incident_cause').agg(func={'customer_id': 'count'})

Unnamed: 0_level_0,customer_id
incident_cause,Unnamed: 1_level_1
Crime,1
Driver error,28
Natural causes,12
Other causes,14


### QUESTION #10

What is the average amount claimed by the customers from various segments?

In [184]:
df.groupby('Segment').agg(func={'claim_amount': 'mean'})

Unnamed: 0_level_0,claim_amount
Segment,Unnamed: 1_level_1
Gold,12868.127507
Platinum,12366.015896
Silver,12246.512579


### QUESTION #11

What is the average claim amount for gender and age categories?

In [185]:
df.groupby(['gender', 'categorize']).agg(func={'claim_amount': 'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,claim_amount
gender,categorize,Unnamed: 2_level_1
Female,Adult,12769.19086
Female,Senior,9030.641667
Female,Youth,10947.636364
Male,Adult,12987.822539
Male,Senior,12323.03125
Male,Youth,13038.676923


### QUESTION #12

Which age group had the maximum fraudulent policy claims?

In [229]:
mask = df['fraudulent'] == 'Yes'

df[mask].groupby('categorize').agg(func={'total_policy_claims': 'count'})

Unnamed: 0_level_0,total_policy_claims
categorize,Unnamed: 1_level_1
Adult,184
Senior,35
Youth,25


### QUESTION #13

Among males and females, which gender had claimed the most for any type of driver related issues?

In [188]:
mask = df['incident_cause'] == 'Driver error'

df[mask].groupby('gender').agg(func={'claim_amount': 'count'})

Unnamed: 0_level_0,claim_amount
gender,Unnamed: 1_level_1
Female,208
Male,264
