In [None]:
# 1. Import libraries
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


In [None]:
from google.colab import files
uploaded = files.upload()


Saving insurance-churn-insights.csv to insurance-churn-insights.csv


In [None]:
#Load and inspect the dataset
import pandas as pd
import io

# Replace the file name below if it's different
file_name = list(uploaded.keys())[0]

df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding='utf-8', engine='python')

# Quick overview
df.head()


Unnamed: 0,policyholder_id,first_name,last_name,date_of_birth,gender,email,phone_number,address_street,address_city,address_state,...,total_claims_amount,last_claim_date,churned,churn_date,churn_reason,customer_tenure_months,last_contact_date,contact_channel,satisfaction_score,number_of_policies
0,PH0001,Amelia,Richards,1987-08-21,female,amelia.richards@gmail.com,+1-212-555-0134,120 East 54th St,New York,NY,...,0.0,,False,,,59,2023-11-05,online-portal,8.7,1
1,PH0002,Omar,Al-Mansouri,1965-03-14,male,o.mansouri@emiratesmail.ae,+971-50-123-9942,56 Sheikh Zayed Rd,Dubai,,...,25000.0,2020-04-15,False,,,197,2024-03-19,phone,9.2,1
2,PH0003,Priya,Mehra,1992-11-04,female,priya.mehra@outlook.com,,45 Dadar West,Mumbai,MH,...,0.0,,False,,,29,2024-02-15,email,7.9,2
3,PH0004,James,Henderson,1975-07-29,male,james.henderson@usa.net,+1-312-555-0812,89 Wacker Dr,Chicago,IL,...,0.0,,False,,,170,2024-02-28,mail,9.8,1
4,PH0005,Elena,Morozova,1958-02-17,female,e.morozova@mail.ru,,Prospekt Mira 12,Moscow,,...,110000.0,2018-11-23,True,2020-09-01,Policy matured,240,2020-09-02,mail,8.9,1


# **Exploratory Data Analysis**

In [None]:
#number of columns and records, and the datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   policyholder_id         200 non-null    object 
 1   first_name              200 non-null    object 
 2   last_name               200 non-null    object 
 3   date_of_birth           200 non-null    object 
 4   gender                  200 non-null    object 
 5   email                   200 non-null    object 
 6   phone_number            145 non-null    object 
 7   address_street          200 non-null    object 
 8   address_city            200 non-null    object 
 9   address_state           77 non-null     object 
 10  address_postal_code     200 non-null    object 
 11  address_country         200 non-null    object 
 12  policy_id               200 non-null    object 
 13  policyholder_id_ref     200 non-null    object 
 14  policy_type             200 non-null    ob

In [None]:
#summary statistics
df.describe()

Unnamed: 0,premium_amount,total_claims_count,total_claims_amount,customer_tenure_months,satisfaction_score,number_of_policies
count,200.0,200.0,137.0,200.0,200.0,200.0
mean,197.6931,0.415,19159.72,98.125,7.6705,1.125
std,247.525263,0.636238,89168.53,98.936756,2.013717,0.331549
min,1.0,0.0,0.0,4.0,0.0,1.0
25%,58.925,0.0,0.0,24.0,7.5,1.0
50%,87.9,0.0,0.0,53.5,8.1,1.0
75%,236.5,1.0,9000.0,150.75,8.9,1.0
max,1250.0,2.0,1010000.0,476.0,10.0,2.0


In [None]:
#Missing values in each column
df.isnull().sum()


Unnamed: 0,0
policyholder_id,0
first_name,0
last_name,0
date_of_birth,0
gender,0
email,0
phone_number,55
address_street,0
address_city,0
address_state,123


In [None]:
#the count of values in the target variable (churned)
df['churned'].value_counts()

Unnamed: 0_level_0,count
churned,Unnamed: 1_level_1
False,155
True,45


# **Data Cleaning & Preprocessing**

In [None]:
# Check for Missing Values
df.isnull().sum()


Unnamed: 0,0
policyholder_id,0
first_name,0
last_name,0
date_of_birth,0
gender,0
email,0
phone_number,55
address_street,0
address_city,0
address_state,123


In [None]:
# Check Data Types
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   policyholder_id         200 non-null    object 
 1   first_name              200 non-null    object 
 2   last_name               200 non-null    object 
 3   date_of_birth           200 non-null    object 
 4   gender                  200 non-null    object 
 5   email                   200 non-null    object 
 6   phone_number            145 non-null    object 
 7   address_street          200 non-null    object 
 8   address_city            200 non-null    object 
 9   address_state           77 non-null     object 
 10  address_postal_code     200 non-null    object 
 11  address_country         200 non-null    object 
 12  policy_id               200 non-null    object 
 13  policyholder_id_ref     200 non-null    object 
 14  policy_type             200 non-null    ob

In [None]:
# Encode Categorical Columns
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded

Unnamed: 0,premium_amount,total_claims_count,total_claims_amount,churned,customer_tenure_months,satisfaction_score,number_of_policies,policyholder_id_PH0002,policyholder_id_PH0003,policyholder_id_PH0004,...,last_contact_date_2024-06-10,last_contact_date_2024-06-11,last_contact_date_2024-06-12,last_contact_date_2024-06-13,last_contact_date_2024-06-14,contact_channel_in-person,contact_channel_mail,contact_channel_online-portal,contact_channel_other,contact_channel_phone
0,84.60,0,0.0,False,59,8.7,1,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,395.30,1,25000.0,False,197,9.2,1,True,False,False,...,False,False,False,False,False,False,False,False,False,True
2,51.75,0,0.0,False,29,7.9,2,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,175.00,0,0.0,False,170,9.8,1,False,False,True,...,False,False,False,False,False,False,True,False,False,False
4,980.25,2,110000.0,True,240,8.9,1,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,232.00,0,,False,141,9.3,1,False,False,False,...,False,False,False,False,False,False,True,False,False,False
196,109.00,0,,False,86,8.7,1,False,False,False,...,False,False,False,False,False,False,True,False,False,False
197,96.00,0,,False,23,7.7,1,False,False,False,...,False,False,False,False,False,False,False,True,False,False
198,300.00,1,42000.0,False,271,9.2,1,False,False,False,...,False,False,False,False,False,False,True,False,False,False


# **Handling missing values**

In [None]:
df.isnull().sum()


Unnamed: 0,0
policyholder_id,0
first_name,0
last_name,0
date_of_birth,0
gender,0
email,0
phone_number,55
address_street,0
address_city,0
address_state,123


In [None]:
# Data Cleaning Actions
# Drop unnecessary columns
df.drop(['policyholder_id', 'first_name', 'last_name', 'email', 'phone_number',
         'address_street', 'address_city', 'address_state', 'address_postal_code',
         'address_country', 'policy_id', 'policyholder_id_ref', 'policy_end_date',
         'churn_date'], axis=1, inplace=True)

# Fill missing numerical values
df['total_claims_amount'].fillna(0, inplace=True)

# Fill missing categorical values
df['last_claim_date'].fillna('No Last Claim date', inplace=True)
df['churn_reason'].fillna('Not Churned', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_claims_amount'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['last_claim_date'].fillna('No Last Claim date', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which 

In [None]:
# checking the output
df.isnull().sum()


Unnamed: 0,0
date_of_birth,0
gender,0
policy_type,0
policy_start_date,0
policy_status,0
premium_amount,0
payment_frequency,0
total_claims_count,0
total_claims_amount,0
last_claim_date,0


# **Encoding categorical variables and preparing the dataset for training**

In [None]:
# One-hot encode categorical variables
categorical_cols = ['gender', 'policy_type', 'policy_status',
                    'payment_frequency', 'last_claim_date',
                    'contact_channel', 'churn_reason']

df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Check the first few rows
df_encoded.head()


Unnamed: 0,date_of_birth,policy_start_date,premium_amount,total_claims_count,total_claims_amount,churned,customer_tenure_months,last_contact_date,satisfaction_score,number_of_policies,...,churn_reason_Requested cancellation,churn_reason_Switched provider,churn_reason_System error,churn_reason_System test account closure,churn_reason_Term completed,churn_reason_fraud suspicion,churn_reason_lapsed payment,churn_reason_non-payment,churn_reason_policy matured,churn_reason_user cancelled
0,1987-08-21,2019-06-10,84.6,0,0.0,False,59,2023-11-05,8.7,1,...,False,False,False,False,False,False,False,False,False,False
1,1965-03-14,2007-12-01,395.3,1,25000.0,False,197,2024-03-19,9.2,1,...,False,False,False,False,False,False,False,False,False,False
2,1992-11-04,2022-01-12,51.75,0,0.0,False,29,2024-02-15,7.9,2,...,False,False,False,False,False,False,False,False,False,False
3,1975-07-29,2010-04-10,175.0,0,0.0,False,170,2024-02-28,9.8,1,...,False,False,False,False,False,False,False,False,False,False
4,1958-02-17,2000-09-01,980.25,2,110000.0,True,240,2020-09-02,8.9,1,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Verify encoding
df_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Columns: 111 entries, date_of_birth to churn_reason_user cancelled
dtypes: bool(102), float64(3), int64(3), object(3)
memory usage: 34.1+ KB


# **Train/Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Features and target
X = df_encoded.drop('churned', axis=1)
y = df_encoded['churned']

# Split 80% train / 20% test, stratify by target to keep churn ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])


Training samples: 160
Test samples: 40


In [None]:
#Drop the date columns for now
X = df_encoded.drop(['churned', 'date_of_birth', 'policy_start_date', 'last_contact_date'], axis=1)
y = df_encoded['churned']


I realised that after training the models I needed to deal with dates columns. That is why I went back here.

In [None]:
#splitting again
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# **Training Models**

In [None]:
# Step 1: Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Step 2: Define models with class weights to handle imbalance
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
    'SVM': SVC(class_weight='balanced', probability=True, random_state=42)
}

# Step 3: Train models and evaluate
results = []

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1 Score': f1
    })

# Step 4: Show results
results_df = pd.DataFrame(results)
results_df


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,1.0,1.0,1.0,1.0
1,Decision Tree,1.0,1.0,1.0,1.0
2,Random Forest,1.0,1.0,1.0,1.0
3,SVM,0.7,0.2,0.111111,0.142857
