# Customer Churn Prediction
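## **Question:** Will a customer churn once their current set of policies expires?
### **Churned**: the customer goes without any active policy for at least 6 months (>= 180 days)
### **Non-Churned**: the customer has held policies for more than 2 years and has never gone 6 months or longer without an active policy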

## **Features**:
1. First policy year (static)
2. Sum of premiums in USD (dynamic)
3. Sum of claims in USD (dynamic)
4. Number of policies (dynamic)
5. Number of claims (dynamic)
6. State (mode; static)
7. County (mode; static)
8. Sum of deductibles (dynamic)
9. Average equipment year (dynamic)
10. Average location premium (dynamic)

In [66]:
import warnings
# NOTE(review): with no category or module argument this silences every warning
# for the rest of the session, including pandas' own (e.g. chained-assignment and
# deprecation warnings). Consider narrowing the filter to the specific warnings
# that are known to be harmless.
warnings.filterwarnings('ignore')

In [1]:
# Generic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Relative locations of the two raw input files (claims and premiums).
claims_data, premiums_data = '../data/claims.csv', '../data/premiums.csv'

In [3]:
# Load both raw CSV files with pandas' default parsing options.
claims_df, premiums_df = (
    pd.read_csv(csv_path) for csv_path in (claims_data, premiums_data)
)

In [4]:
# Drop claims whose cause marks them as denied or withdrawn.
excluded_claim_causes = ['Claim Denied', 'Claim Withdrawn']
is_excluded_claim = claims_df['ClaimCause'].isin(excluded_claim_causes)
claims_df = claims_df[~is_excluded_claim]

In [5]:
# Keep only fully populated claim rows (any missing value drops the row).
claims_df = claims_df.dropna()

# Parse the three date columns into pandas datetimes.
for claims_date_column in ('DateOfLoss', 'ClaimReceivedDate', 'PolicyEffectiveDate'):
    claims_df[claims_date_column] = pd.to_datetime(claims_df[claims_date_column])

# Store the identifier / categorical text columns with pandas' 'string' dtype.
claims_string_columns = ['PolicyNumber', 'ClaimCause', 'County', 'State']
claims_df = claims_df.astype(dict.fromkeys(claims_string_columns, 'string'))

In [6]:
# Keep only fully populated premium rows (any missing value drops the row).
premiums_df = premiums_df.dropna()

# Parse the policy start / end columns into pandas datetimes.
for premiums_date_column in ('PolicyEffectiveDate', 'PolicyExpirationDate'):
    premiums_df[premiums_date_column] = pd.to_datetime(premiums_df[premiums_date_column])

# Store the identifier / categorical text columns with pandas' 'string' dtype.
premiums_string_columns = ['PolicyNumber', 'County', 'State']
premiums_df = premiums_df.astype(dict.fromkeys(premiums_string_columns, 'string'))

In [7]:
# Remove rows that exactly repeat an earlier row (the first occurrence is kept).
is_repeated_row = premiums_df.duplicated()
premiums_df = premiums_df[~is_repeated_row]

In [8]:
from datetime import datetime

def impute_zero_for_year(
    value : float,
    min_year : float = 1950.0,
    max_year : 'float | None' = None
) -> float:
    """Return `value` if it is a plausible year, otherwise 0.0.

    A year is considered plausible when `min_year <= value <= max_year`.
    NaN fails both comparisons and is therefore also replaced by 0.0.

    Args:
        value: Candidate year.
        min_year: Smallest accepted year (inclusive). Defaults to 1950.0.
        max_year: Largest accepted year (inclusive). When None, the current
            calendar year from the system clock is used.

    Returns:
        `value` unchanged when it lies inside the accepted range, else 0.0.

    NOTE(review): 0.0 is a real number, not a missing-value marker, so any
    later average over the imputed column is pulled toward zero by the
    replaced entries. Confirm this is intended.
    """
    if max_year is None:
        max_year = datetime.now().year
    if min_year <= value <= max_year:
        return value
    return 0.0

In [9]:
# Replace implausible equipment years with the 0.0 placeholder, value by value.
cleaned_equip_year = premiums_df['Equip Year'].map(impute_zero_for_year)
premiums_df['Equip Year'] = cleaned_equip_year

## The logic below only builds the positive (churned) samples

In [290]:
from typing import Optional

# Column layout of one probe row; shared by the empty template and by every
# per-customer frame so the two can never drift apart.
POSITIVE_SAMPLE_COLUMNS = [
    'min_policy_date', 'sum_policy_premiums',
    'sum_location_premiums', 'no_previous_equipment',
    'state', 'sum_of_deductibles',
    'avg_equipment_year', 'total_claim_amount', 'churned'
]

# A customer counts as churned after this many consecutive days without a policy.
CHURN_GAP_DAYS = 180
# Feature snapshots ("probes") are taken this many days before the lapse starts.
PROBE_OFFSETS_DAYS = (45, 90, 135, 180, 225, 270, 315, 360)
# Calendar days on or after this date are not considered.
OBSERVATION_CUTOFF = '2023-01-01'


def find_switch(
    df : pd.DataFrame,
    col : str,
    gap_days : int = CHURN_GAP_DAYS
) -> Optional[int]:
    """Return the index label of the first lapse that lasts `gap_days` rows.

    A lapse candidate is a row where `col` drops from 1 on the previous row to
    0 (its row-to-row difference equals -1). The first candidate for which
    every row in the label range [idx, idx + gap_days - 1] is 0 is returned.

    Args:
        df: One row per calendar day, with an ascending integer index.
        col: Name of the 0/1 activity column.
        gap_days: Number of consecutive rows that must be 0.

    Returns:
        The index label of the lapse start, or None when there is none.

    NOTE(review): the label slice simply stops at the last row of `df`, so a
    lapse that starts fewer than `gap_days` rows before the end is judged on
    fewer than `gap_days` rows. Confirm that labelling such customers as
    churned is intended.
    """
    switch_indices = df[df[col].diff() == -1].index
    for idx in switch_indices:
        if (df.loc[idx : idx + gap_days - 1, col] == 0).all():
            return idx
    return None


def first(x):
    """Return the first element of a pandas object (used as an aggregator)."""
    return x.iloc[0]


def row_count(x):
    """Return the number of rows in `x`. Not used by the loop below."""
    return len(x)


unique_customer_ids = premiums_df['CustomerId'].unique()
positive_sample_customer_ids = []

# The daily calendar spans the whole data set and is identical for every
# customer, so it is built once here instead of once per loop iteration.
if len(unique_customer_ids) > 0:
    probe_calendar = pd.DataFrame({
        'ProbeDate': pd.date_range(
            premiums_df['PolicyEffectiveDate'].min(),
            premiums_df['PolicyExpirationDate'].max(),
            freq = 'D'
        )
    })
    probe_calendar = probe_calendar.loc[probe_calendar['ProbeDate'] < OBSERVATION_CUTOFF]
else:
    probe_calendar = pd.DataFrame({'ProbeDate': pd.to_datetime([])})

# Per-customer frames are collected and concatenated once after the loop;
# concatenating inside the loop re-copies the accumulated frame every time.
positive_customer_frames = []

iteration = 0
for customer_id in unique_customer_ids:
    customer_premiums = premiums_df.loc[premiums_df['CustomerId'] == customer_id]

    # Every calendar day covered by at least one of the customer's policies
    # (both end points inclusive, time of day discarded).
    covered_day_ranges = [
        pd.date_range(policy_start.normalize(), policy_end.normalize(), freq = 'D').values
        for policy_start, policy_end in zip(
            customer_premiums['PolicyEffectiveDate'],
            customer_premiums['PolicyExpirationDate']
        )
    ]
    unique_active_dates = np.unique(np.concatenate(covered_day_ranges))

    customer_date_range = probe_calendar.copy()
    customer_date_range['CustomerId'] = customer_id
    customer_date_range['ActivePolicy'] = (
        customer_date_range['ProbeDate'].isin(unique_active_dates).astype(int)
    )

    churn_index = find_switch(customer_date_range, 'ActivePolicy')
    if churn_index is None:
        # No qualifying lapse: this customer is not a positive sample.
        continue
    positive_sample_customer_ids.append(customer_id)

    # Probe labels in ascending calendar order (largest offset first). Labels
    # that fall before the start of the calendar are skipped; selecting them
    # would raise a KeyError.
    probe_labels = [
        churn_index - offset
        for offset in sorted(PROBE_OFFSETS_DAYS, reverse = True)
        if (churn_index - offset) in customer_date_range.index
    ]
    customer_probe_dates = customer_date_range.loc[probe_labels, 'ProbeDate']

    customer_claims = claims_df.loc[claims_df['CustomerId'] == customer_id]

    all_probe_data = []
    for probe_date in customer_probe_dates:
        # Only policies that started strictly before the probe date are visible.
        filtered_premium_df = customer_premiums.loc[
            customer_premiums['PolicyEffectiveDate'] < probe_date
        ]
        if filtered_premium_df.empty:
            # The probe predates the customer's first policy, so there is
            # nothing to aggregate (aggregating an empty frame fails).
            continue

        # NOTE(review): the customer 4241 sample rows in this notebook carry the
        # same PolicyPremium on each Loc row of one policy, so 'sum' may count a
        # policy premium once per location - confirm this is intended.
        # NOTE(review): 'no_previous_equipment' is the largest Loc value and
        # 'state' is the first row's State (not the mode) - confirm.
        probe_premium_data = filtered_premium_df.agg({
            'PolicyEffectiveDate': 'min',
            'PolicyPremium': 'sum',
            'LocationPremium': 'sum',
            'Loc': 'max',
            'State': first,
            'Deductible': 'sum',
            'Equip Year': 'mean'
        }).values

        filtered_claims_df = customer_claims.loc[
            customer_claims['PolicyEffectiveDate'] < probe_date
        ]
        probe_claims_data = filtered_claims_df.agg({
            'TotalPaidToDate': 'sum'
        }).values

        # Every value is stored as text; the trailing '1' is the churned label.
        probe_data = list(probe_premium_data.astype(str)) + list(probe_claims_data.astype(str)) + ['1']

        all_probe_data.append(probe_data)

    customer_data = pd.DataFrame(all_probe_data, columns = POSITIVE_SAMPLE_COLUMNS)

    # Probes between which no new policy started produce identical rows.
    customer_data = customer_data.drop_duplicates()

    positive_customer_frames.append(customer_data)

    # Progress is counted over positive customers only.
    iteration += 1
    if iteration % 100 == 0:
        print('Finished 100 additional iterations.')

all_positive_customer_data = pd.concat(
    [pd.DataFrame([], columns = POSITIVE_SAMPLE_COLUMNS)] + positive_customer_frames,
    ignore_index = True
)

Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.
Finished 100 additional iterations.


In [287]:
# Preview the first five positive-sample rows.
all_positive_customer_data.head(5)

Unnamed: 0,min_policy_date,sum_policy_premiums,sum_location_premiums,no_previous_equipment,state,sum_of_deductibles,avg_equipment_year,total_claim_amount,churned
0,2018-03-01 00:00:00,594,594.0,1,NE,1000,1994.0,0.0,1
1,2018-08-10 00:00:00,90240,15040.0,6,TX,12000,2004.0,11266.51,1
2,2018-06-13 00:00:00,6804,6804.0,1,TX,3000,2018.0,0.0,1
3,2018-03-27 00:00:00,164640,23520.0,7,TX,21000,2003.0,0.0,1
4,2020-06-15 00:00:00,912,912.0,1,TX,1000,2013.0,0.0,1


In [289]:
# Persist the positive samples as CSV, leaving the row index out of the file.
positive_samples_csv = '../data/churn_modeling/churn_prediction_positive_samples.csv'
all_positive_customer_data.to_csv(positive_samples_csv, index = False)

In [296]:
# Spot-check: show the first two premium rows of customer 4241.
premiums_df.loc[premiums_df['CustomerId'] == 4241].head(2)

Unnamed: 0,PolicyNumber,CustomerId,Loc,PolicyEffectiveDate,PolicyExpirationDate,PolicyPremium,LocationPremium,Deductible,LocValue,County,State,Equip Year,Equip Value
68,P-376-2019,4241,1,2019-03-07,2020-03-07,67668,1931.057074,2500,150000.0,Union,NM,2006.0,150000.0
69,P-376-2019,4241,2,2019-03-07,2020-03-07,67668,1931.057074,2500,150000.0,Union,NM,2006.0,150000.0


## The logic below only builds the negative (non-churned) samples

In [294]:
# Show the first ten customer ids that were labelled as positive (churned).
print(positive_sample_customer_ids[:10])

[3833, 3123, 3903, 3663, 3256, 4241, 3668, 630, 5025, 4770]


In [303]:
# One row per customer: earliest policy start and latest policy end.
negative_customer_samples = (
    premiums_df
    .groupby('CustomerId')
    .agg(
        minPolicyEffectiveDate = ('PolicyEffectiveDate', 'min'),
        maxPolicyExpirationDate = ('PolicyExpirationDate', 'max')
    )
    .reset_index()
)

In [304]:
# Customers already labelled as churned cannot be negative samples.
# The explicit copy makes the result an independent frame: columns are added to
# it afterwards, and assigning to a filtered slice of another frame triggers
# pandas' chained-assignment (SettingWithCopy) warning.
is_positive_customer = negative_customer_samples['CustomerId'].isin(positive_sample_customer_ids)
negative_customer_samples = negative_customer_samples.loc[~is_positive_customer].copy()

In [309]:
# Span between the customer's first policy start and last policy end (a timedelta).
policy_activity_span = negative_customer_samples['maxPolicyExpirationDate'].sub(
    negative_customer_samples['minPolicyEffectiveDate']
)
negative_customer_samples['policy_activity_length'] = policy_activity_span

In [311]:
# Convert the timedelta span into a whole number of days.
policy_activity_days = negative_customer_samples['policy_activity_length'].dt.days
negative_customer_samples['policy_activity_length'] = policy_activity_days

In [313]:
# Keep customers whose policy span is strictly longer than two (365-day) years.
minimum_activity_days = 2 * 365
has_long_history = negative_customer_samples['policy_activity_length'] > minimum_activity_days
negative_customer_samples = negative_customer_samples.loc[has_long_history]

In [315]:
# Keep customers whose first policy started before 2022-01-01.
started_before_2022 = negative_customer_samples['minPolicyEffectiveDate'] < '2022-01-01'
negative_customer_samples = negative_customer_samples.loc[started_before_2022]

In [317]:
## probe window should be anywhere from min policy date to 2023-01-01

In [316]:
# Display the remaining negative-sample candidates.
negative_customer_samples

Unnamed: 0,CustomerId,minPolicyEffectiveDate,maxPolicyExpirationDate,policy_activity_length
2,2,2018-03-11,2024-03-11,2192
3,3,2020-07-30,2023-07-30,1095
4,4,2018-02-22,2024-02-22,2191
5,5,2018-09-15,2023-09-15,1826
12,12,2021-01-13,2024-01-13,1095
...,...,...,...,...
5298,5373,2018-08-06,2023-08-06,1826
5300,5375,2019-06-03,2023-06-03,1461
5301,5376,2018-04-16,2023-04-16,1826
5303,5378,2018-06-02,2023-06-02,1826
