In [1]:
# Generic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Relative locations of the two raw CSV inputs used by this notebook.
claims_data = "../data/claims.csv"
premiums_data = "../data/premiums.csv"

In [3]:
# Load each raw CSV file into its own DataFrame.
claims_df = pd.read_csv(filepath_or_buffer=claims_data)
premiums_df = pd.read_csv(filepath_or_buffer=premiums_data)

In [4]:
# Keep only the claims whose ClaimCause is not one of the excluded labels.
excluded_causes = ['Claim Denied', 'Claim Withdrawn']
is_excluded = claims_df['ClaimCause'].isin(excluded_causes)
claims_df = claims_df[~is_excluded]

In [5]:
# Clean the claims table in one chain:
#   1. drop every row that contains at least one missing value,
#   2. parse the three date columns with pd.to_datetime,
#   3. cast the text columns to the pandas 'string' dtype.
claims_df = (
    claims_df
    .dropna()
    .assign(
        DateOfLoss=lambda frame: pd.to_datetime(frame['DateOfLoss']),
        ClaimReceivedDate=lambda frame: pd.to_datetime(frame['ClaimReceivedDate']),
        PolicyEffectiveDate=lambda frame: pd.to_datetime(frame['PolicyEffectiveDate']),
    )
    .astype(
        dict.fromkeys(['PolicyNumber', 'ClaimCause', 'County', 'State'], 'string')
    )
)

In [6]:
# Remove rows that contain any missing value.
premiums_df = premiums_df.dropna()

# Parse the policy start / end columns with pd.to_datetime.
for date_col in ('PolicyEffectiveDate', 'PolicyExpirationDate'):
    premiums_df[date_col] = pd.to_datetime(premiums_df[date_col])

# Cast the text columns to the pandas 'string' dtype.
premiums_df = premiums_df.astype(
    dict.fromkeys(['PolicyNumber', 'County', 'State'], 'string')
)

In [7]:
# Collapse fully identical rows (all columns compared), keeping the first occurrence.
premiums_df = premiums_df.drop_duplicates(subset=None, keep='first')

In [8]:
from datetime import datetime

def impute_zero_for_year(
    value : float,
    min_year : float = 1950.0,
    max_year : "float | None" = None
) -> float:
    """Return ``value`` if it is a plausible year, otherwise ``0.0``.

    A value is considered plausible when ``min_year <= value <= max_year``
    (both bounds inclusive).

    Parameters
    ----------
    value : float
        Candidate year.
    min_year : float, default 1950.0
        Smallest year that is accepted.
    max_year : float or None, default None
        Largest year that is accepted. When ``None``, the current calendar
        year (``datetime.now().year``) is used, so the result then depends on
        when the function is called. Pass an explicit value for reproducible
        results.

    Returns
    -------
    float
        ``value`` unchanged when it lies inside the accepted range, else ``0.0``.
        ``NaN`` fails both comparisons and is therefore mapped to ``0.0``.
    """
    if max_year is None:
        # Resolve the default upper bound at call time.
        max_year = datetime.now().year
    return value if min_year <= value <= max_year else 0.0

In [9]:
# Pass every 'Equip Year' entry through impute_zero_for_year and store the result back.
equip_year = premiums_df['Equip Year']
premiums_df['Equip Year'] = equip_year.apply(impute_zero_for_year)

### Modeling Questions:
1. Can we predict if a customer will submit a claim? Useful features: county, state, number of policies, previous claims
2. Anomaly detection for fraud
3. Customer churn analysis

In [10]:
# Display the premiums table as this cell's output (bare last expression -> rich repr).
premiums_df

Unnamed: 0,PolicyNumber,CustomerId,Loc,PolicyEffectiveDate,PolicyExpirationDate,PolicyPremium,LocationPremium,Deductible,LocValue,County,State,Equip Year,Equip Value
0,P-562-2018,3833,1,2018-03-01,2019-03-01,594,594.000000,1000,36000.0,Furnas,NE,1994.0,36000.0
1,P-5085-2018,3123,1,2018-08-10,2019-08-10,7520,960.000000,1000,60000.0,Castro,TX,1993.0,60000.0
2,P-5085-2018,3123,2,2018-08-10,2019-08-10,7520,1760.000000,1000,110000.0,Castro,TX,2015.0,110000.0
3,P-5085-2018,3123,3,2018-08-10,2019-08-10,7520,960.000000,1000,60000.0,Castro,TX,1988.0,60000.0
4,P-5085-2018,3123,4,2018-08-10,2019-08-10,7520,1760.000000,1000,110000.0,Castro,TX,2014.0,110000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82069,P-2915-2023,1160,2,2023-03-16,2024-03-16,13268,2080.564831,1000,85000.0,Antelope,NE,2008.0,85000.0
82070,P-2915-2023,1160,3,2023-03-16,2024-03-16,13268,2203.742550,1000,90000.0,Antelope,NE,1991.0,90000.0
82071,P-2915-2023,1160,4,2023-03-16,2024-03-16,13268,2203.742550,1000,90000.0,Antelope,NE,1992.0,90000.0
82072,P-2915-2023,1160,5,2023-03-16,2024-03-16,13268,2349.692620,1000,96000.0,Antelope,NE,2013.0,96000.0
