# Customer Churn Prediction
## **Target**: Will a customer churn or reinstate their policy?

In [66]:
import warnings
# Install a catch-all filter so that every warning is suppressed from here on.
warnings.filterwarnings(action = 'ignore')

In [1]:
# Generic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Relative paths of the claims and premiums CSV files read by this notebook.
claims_data, premiums_data = '../data/claims.csv', '../data/premiums.csv'

In [3]:
# Read both CSV files into DataFrames (claims first, premiums second).
claims_df, premiums_df = (
    pd.read_csv(csv_path) for csv_path in (claims_data, premiums_data)
)

In [4]:
# Keep only the claims whose ClaimCause is neither 'Claim Denied' nor
# 'Claim Withdrawn'.
excluded_causes = ['Claim Denied', 'Claim Withdrawn']
keep_claim = ~claims_df['ClaimCause'].isin(excluded_causes)
claims_df = claims_df[keep_claim]

In [5]:
# Clean the claims table in one chain:
#   1. drop every row that contains a missing value,
#   2. parse the three date columns into datetimes,
#   3. cast the identifier / categorical text columns to the pandas 'string' dtype.
claims_df = (
    claims_df
    .dropna()
    .assign(
        DateOfLoss = lambda frame: pd.to_datetime(frame['DateOfLoss']),
        ClaimReceivedDate = lambda frame: pd.to_datetime(frame['ClaimReceivedDate']),
        PolicyEffectiveDate = lambda frame: pd.to_datetime(frame['PolicyEffectiveDate'])
    )
    .astype({
        'PolicyNumber': 'string',
        'ClaimCause': 'string',
        'County': 'string',
        'State': 'string'
    })
)

In [6]:
# Clean the premiums table in one chain:
#   1. drop every row that contains a missing value,
#   2. parse the policy start / end columns into datetimes,
#   3. cast the identifier / location text columns to the pandas 'string' dtype.
premiums_df = (
    premiums_df
    .dropna()
    .assign(
        PolicyEffectiveDate = lambda frame: pd.to_datetime(frame['PolicyEffectiveDate']),
        PolicyExpirationDate = lambda frame: pd.to_datetime(frame['PolicyExpirationDate'])
    )
    .astype({
        'PolicyNumber': 'string',
        'County': 'string',
        'State': 'string'
    })
)

In [7]:
# Remove fully duplicated premium rows, keeping the first occurrence of each.
premiums_df = premiums_df.drop_duplicates(keep = 'first')

In [8]:
from datetime import datetime

def impute_zero_for_year(
    value : float,
    min_year : float = 1950.0,
    max_year : "float | None" = None
) -> float:
    """
    Return `value` when it is a plausible year, otherwise 0.0.

    Parameters
    ----------
    value : float
        Candidate year.
    min_year : float, default 1950.0
        Smallest year that is accepted (inclusive).
    max_year : float or None, default None
        Largest year that is accepted (inclusive). When None, the current
        calendar year at call time is used.

    Returns
    -------
    float
        `value` unchanged if `min_year <= value <= max_year`, else 0.0.
        NaN fails both comparisons and therefore also yields 0.0.
    """
    if max_year is None:
        max_year = datetime.now().year

    if min_year <= value <= max_year:
        return value
    return 0.0

In [9]:
# Replace every 'Equip Year' value rejected by impute_zero_for_year with 0.0.
equip_year = premiums_df['Equip Year']
premiums_df['Equip Year'] = equip_year.apply(impute_zero_for_year)

## **Potential Features**:
1. First policy year (static)
2. Sum of premiums in USD (dynamic)
3. Sum of claims in USD (dynamic)
4. Number of policies (dynamic)
5. Number of claims (dynamic)
6. State (mode; static)
7. County (mode; static)
8. Sum of deductibles (dynamic)
9. Average equipment year (dynamic)
10. Average location premium (dynamic)

## Step 1:
1. For a given customer, find the time periods in which they have an active policy.

In [141]:
# Gather, as 'YYYY-MM-DD' strings, every calendar day from the effective date
# to the expiration date (both inclusive) of each policy row of customer 1248.
customer_policies = premiums_df.loc[premiums_df['CustomerId'] == 1248]

date_list = []
for effective, expiration in zip(
    customer_policies['PolicyEffectiveDate'],
    customer_policies['PolicyExpirationDate']
):
    covered_days = pd.date_range(
        str(effective.date()),
        str(expiration.date())
    ).values.astype('datetime64[D]')
    date_list.extend(np.datetime_as_string(covered_days, unit = 'D').tolist())

# Days covered by several rows collapse into a single sorted entry.
unique_active_dates = np.unique(date_list)

In [142]:
# One row per calendar day, from the earliest PolicyEffectiveDate to the
# latest PolicyExpirationDate found anywhere in premiums_df.
first_day = premiums_df['PolicyEffectiveDate'].min()
last_day = premiums_df['PolicyExpirationDate'].max()
calendar_days = pd.date_range(first_day, last_day, freq = 'D')

customer_date_range = pd.DataFrame({'Date': calendar_days})

In [143]:
# Tag the calendar with the customer id and flag the days on which the
# customer has an active policy (1 = active, 0 = not active).
customer_date_range['CustomerId'] = 1248
customer_date_range['ActivePolicy'] = 0

# unique_active_dates holds date strings while 'Date' is datetime64. Relying on
# isin() to cast strings to datetimes is deprecated in recent pandas, so the
# conversion is done explicitly before the membership test.
active_days = pd.to_datetime(unique_active_dates)
is_active = customer_date_range['Date'].isin(active_days)
customer_date_range.loc[is_active, 'ActivePolicy'] = 1

In [144]:
# Keep only the calendar rows dated before 2023-01-01.
# .copy() makes the result an independent frame: columns are added to it
# afterwards, and assigning into a bare .loc selection is the pattern that
# triggers pandas' SettingWithCopyWarning.
customer_date_range = customer_date_range.loc[customer_date_range['Date'] < '2023-01-01'].copy()

In [145]:
# Start with no probe positions marked on any day.
customer_date_range = customer_date_range.assign(ProbePosition = 0)

In [146]:
def find_switch(
    df : pd.DataFrame,
    col : str,
    window : int = 60
) -> "int | None":
    """
    Find the first 1 -> 0 switch in `col` that is followed only by zeros.

    A switch is a row whose value in `col` is exactly 1 lower than the value
    in the previous row. A switch qualifies when every value of `col` at the
    index labels `idx` .. `idx + window - 1` (inclusive) equals 0.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to scan. Its index labels must support integer arithmetic,
        because the look-ahead window is selected by label with `.loc`.
    col : str
        Name of the 0/1 column to inspect.
    window : int, default 60
        Number of consecutive index labels, starting at the switch itself,
        that must all be 0.

    Returns
    -------
    int or None
        Index label of the first qualifying switch, or None if there is none.

    Notes
    -----
    If fewer than `window` labels remain after a switch, only the available
    rows are checked, so a switch close to the end of `df` can qualify with
    a shorter run of zeros.
    """
    switch_indices = df[df[col].diff() == -1].index

    for idx in switch_indices:
        # .loc slicing is label-based and includes both end points, hence
        # the "- 1" to cover exactly `window` labels.
        lookahead = df.loc[idx : idx + window - 1, col]
        if (lookahead == 0).all():
            return idx

    return None

# Index label of the first qualifying active -> inactive switch (None if absent).
index = find_switch(df = customer_date_range, col = 'ActivePolicy')

In [147]:
# Mark probe positions at fixed offsets before the switch row: 8 probes,
# 45 index labels apart (index - 45, index - 90, ..., index - 360).
probe_spacing = 45
probe_count = 8

# find_switch returns None when no qualifying switch exists; in that case
# there is nothing to mark.
if index is not None:
    probe_labels = [
        index - probe_spacing * step
        for step in range(1, probe_count + 1)
    ]
    # Offsets reaching back past the first row have no matching label;
    # .loc would raise KeyError on them, so they are left out.
    probe_labels = [
        label for label in probe_labels
        if label in customer_date_range.index
    ]
    if probe_labels:
        customer_date_range.loc[probe_labels, 'ProbePosition'] = 1

In [148]:
# Display only the rows that were marked as probe positions.
customer_date_range[customer_date_range['ProbePosition'] == 1]

Unnamed: 0,Date,CustomerId,ActivePolicy,ProbePosition
295,2018-05-23,1248,1,1
340,2018-07-07,1248,1,1
385,2018-08-21,1248,1,1
430,2018-10-05,1248,1,1
475,2018-11-19,1248,1,1
520,2019-01-03,1248,1,1
565,2019-02-17,1248,1,1
610,2019-04-03,1248,1,1
