# Data Preparation

In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
import seaborn as sns  # For statistical data visualization
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import scipy.stats as stats  # For statistical functions

In [2]:
# Import feature selection and preprocessing modules from scikit-learn
from sklearn.feature_selection import SelectKBest, f_classif  # For selecting the best features based on ANOVA F-statistic
from sklearn.decomposition import PCA  # For dimensionality reduction using Principal Component Analysis
from sklearn.preprocessing import StandardScaler  # For standardizing features by removing the mean and scaling to unit variance
from sklearn.preprocessing import OneHotEncoder  # For converting categorical variables into a form that can be provided to ML algorithms
from sklearn.preprocessing import LabelEncoder  # For encoding target labels with a value between 0 and n_classes-1
from sklearn.preprocessing import OrdinalEncoder  # For encoding ordinal categorical features with integer values

In [3]:
import os  # For interacting with the operating system
import sys  # For system-specific parameters and functions

# Add the 'scripts' directory (a sibling of the notebook's working directory)
# to the Python path so modules stored there can be imported.
# The membership check keeps this cell idempotent: re-running it does not
# append duplicate entries to sys.path.
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [4]:
# Widen pandas' display limits so wide/long frames are shown without truncation:
# up to 200 columns and up to 200 rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

In [5]:
# Load the cleaned dataset from the sibling 'data' directory.
# index_col=False stops pandas from using the first CSV column as the index;
# low_memory=False reads the file in one pass rather than in chunks.
# NOTE(review): the loaded frame carries an 'Unnamed: 0' column, which looks
# like a previously saved index - confirm whether it should be kept.
df = pd.read_csv(
    '../data/cleaned_data.csv',
    index_col=False,
    low_memory=False,
)

In [6]:
# Preview the first ten rows as a quick sanity check of the load
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,MaritalStatus,Gender,Country,Province,PostalCode,MainCrestaZone,SubCrestaZone,ItemType,mmcode,VehicleType,RegistrationYear,make,Model,Cylinders,cubiccapacity,kilowatts,bodytype,NumberOfDoors,VehicleIntroDate,AlarmImmobiliser,TrackingDevice,CapitalOutstanding,NewVehicle,SumInsured,TermFrequency,CalculatedPremiumPerTerm,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,0.01,Monthly,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,0.01,Monthly,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,0.01,Monthly,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,119300.0,Monthly,220.1628,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,54.824561,0.0
4,4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,119300.0,Monthly,220.1628,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
5,5,145247,12827,2015-01-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,500000.0,Monthly,57.5412,No excess,Third Party,Third Party,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,3.256435,0.0
6,6,145247,12827,2015-04-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,500000.0,Monthly,57.5412,No excess,Third Party,Third Party,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,50.474737,0.0
7,7,145247,12827,2015-06-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,500000.0,Monthly,57.5412,No excess,Third Party,Third Party,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,35.332316,0.0
8,8,145247,12827,2015-08-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,500000.0,Monthly,57.5412,No excess,Third Party,Third Party,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
9,9,145245,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,617500.0,Monthly,1.1508,No excess,Passenger Liability,Passenger Liability,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,1.009474,0.0


In [7]:
# Count the missing (NaN/None) entries per column; the result is a Series
# indexed by column name
missing_counts = df.isna().sum()

In [8]:
# Keep only the column labels whose missing-value count exceeds 1
# NOTE(review): a column with exactly one missing value is not selected by
# this threshold - confirm that this is intended.
columns_with_missing = missing_counts.loc[missing_counts.gt(1)].index
print(f'Columns with more than 1 missing value:\n{columns_with_missing}')

Columns with more than 1 missing value:
Index(['VehicleIntroDate'], dtype='object')


In [9]:
# Remove every column flagged above as having missing values
df = df.drop(labels=columns_with_missing, axis='columns')

In [10]:
# Confirm the drop took effect by listing every surviving column label
print(
    f'Columns remaining after dropping:\n{df.columns}'
)

Columns remaining after dropping:
Index(['Unnamed: 0', 'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding',
       'NewVehicle', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium',
       'TotalClaims'],
      dtype='object')


# Identify Unique Categories

In [14]:
# Get categorical (object-dtype) columns from the DataFrame
categorical_columns = df.select_dtypes(include='object').columns

# Build the summary in this cell, before it is displayed, so the cell runs on
# a fresh kernel (Restart & Run All) instead of relying on `summary_df`
# having been created by a cell executed out of order.
summary_df = pd.DataFrame({
    'Column': categorical_columns,  # Column names
    'DataType': [df[col].dtype for col in categorical_columns],  # Data type of each column
    'NumUniqueValues': [df[col].nunique() for col in categorical_columns],  # Distinct non-null values per column
})

# Display the summary DataFrame
summary_df

Unnamed: 0,Column,DataType,NumUniqueValues
0,TransactionMonth,object,23
1,Citizenship,object,4
2,LegalType,object,6
3,Title,object,5
4,Language,object,1
5,Bank,object,11
6,AccountType,object,3
7,MaritalStatus,object,3
8,Gender,object,3
9,Country,object,1


In [15]:
# Summarise the categorical columns: one row per column holding its name,
# its dtype and the number of distinct (non-null) values it contains
summary_df = pd.DataFrame(
    [
        (column_name, df[column_name].dtype, df[column_name].nunique())
        for column_name in categorical_columns
    ],
    columns=['Column', 'DataType', 'NumUniqueValues'],
)

# Data Insights Summary

## Key Insights

1. **Low Variability Columns**:
   - The following columns exhibit minimal variability:
     - **Language**
     - **Country**
     - **ItemType**
     - **StatutoryClass**
     - **StatutoryRiskType**
   - **Action**: These columns should be removed from the dataset.

2. **Model Complexity**:
   - The **Model** column contains **411 unique values**, which may complicate analysis.
   - **Action**: Consider reducing cardinality through grouping similar models or applying target encoding.

3. **Binary Columns**:
   - The following columns have only **2 unique values** each:
     - **AlarmImmobiliser**
     - **TrackingDevice**
     - **NewVehicle**
   - **Action**: Evaluate if these columns are necessary for the model or if they can be combined.

4. **Date Conversion**:
   - The **TransactionMonth** column should be converted to a proper date format.
   - **Action**: Extract useful features such as **year**, **month**, and **quarter** to enhance analysis.

5. **Numeric Features**:
   - The **CapitalOutstanding** column holds numeric amounts but appears to be stored as text; once converted to a numeric type it can be used as a continuous feature.
   - **Action**: Ensure it is correctly formatted for effective modeling.

## Recommendations
Follow the above actions to optimize the dataset for analysis and model training.

In [17]:
   # Columns flagged as low-variability in the insights summary; remove them
   columns_to_drop = [
       'Language',
       'Country',
       'ItemType',
       'StatutoryClass',
       'StatutoryRiskType',
   ]
   df = df.drop(labels=columns_to_drop, axis='columns')
   print(f"Dropped columns: {columns_to_drop}")

Dropped columns: ['Language', 'Country', 'ItemType', 'StatutoryClass', 'StatutoryRiskType']


Convert `'CapitalOutstanding'` to a numeric format

In [21]:
# Step 1: parse the raw column as numbers; anything that cannot be parsed
# becomes NaN because of errors='coerce'
capital_numeric = pd.to_numeric(df['CapitalOutstanding'], errors='coerce')
print("Converted 'CapitalOutstanding' to numeric type. Any non-numeric values have been coerced to NaN.")

# Step 2: replace NaN with 0 (the column mean would be an alternative fill value)
# NOTE(review): after this step coerced/unparseable entries are
# indistinguishable from genuine zeros - confirm that 0 is the right default.
capital_filled = capital_numeric.fillna(0)
print("Filled missing values in 'CapitalOutstanding' with 0. Any NaN values have been replaced.")

# Step 3: store the result back on the frame as float
df['CapitalOutstanding'] = capital_filled.astype(float)
print("Converted 'CapitalOutstanding' to float type.")

# Step 4: report the resulting dtype as a check
print(f"The data type of 'CapitalOutstanding' is now: {df['CapitalOutstanding'].dtype}")

Converted 'CapitalOutstanding' to numeric type. Any non-numeric values have been coerced to NaN.
Filled missing values in 'CapitalOutstanding' with 0. Any NaN values have been replaced.
Converted 'CapitalOutstanding' to float type.
The data type of 'CapitalOutstanding' is now: float64


Convert boolean `'IsVATRegistered'` to integer

In [22]:
# Recast the True/False flag as 1/0 integers
vat_flags = df['IsVATRegistered'].astype(int)
df['IsVATRegistered'] = vat_flags
print("Converted 'IsVATRegistered' from boolean to integer type.")
# Show the distinct values that remain after the cast
print(f"The unique values in 'IsVATRegistered' are now: {df['IsVATRegistered'].unique()}")

Converted 'IsVATRegistered' from boolean to integer type.
The unique values in 'IsVATRegistered' are now: [1 0]


Convert `'TransactionMonth'` to datetime format and extract relevant features.

In [23]:
# Parse the 'TransactionMonth' values into pandas datetimes
df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'])
print("Converted 'TransactionMonth' to datetime format.")

# Calendar features to derive from the parsed dates. Each entry is
# (new column name, attribute of the .dt accessor, progress message).
calendar_features = (
    ('TransactionYear', 'year',
     "Extracted year from 'TransactionMonth' and created 'TransactionYear'."),
    ('TransactionMonthOnly', 'month',
     "Extracted month from 'TransactionMonth' and created 'TransactionMonthOnly'."),
    ('TransactionQuarter', 'quarter',
     "Extracted quarter from 'TransactionMonth' and created 'TransactionQuarter'."),
)
for feature_name, dt_attribute, progress_message in calendar_features:
    df[feature_name] = getattr(df['TransactionMonth'].dt, dt_attribute)
    print(progress_message)

# The source column is removed once the derived features exist
df = df.drop(columns=['TransactionMonth'])
print("Dropped the original 'TransactionMonth' column as it is no longer needed.")

Converted 'TransactionMonth' to datetime format.
Extracted year from 'TransactionMonth' and created 'TransactionYear'.
Extracted month from 'TransactionMonth' and created 'TransactionMonthOnly'.
Extracted quarter from 'TransactionMonth' and created 'TransactionQuarter'.
Dropped the original 'TransactionMonth' column as it is no longer needed.


Check the Distribution of `'Title'`, `'MaritalStatus'`, and `'Gender'`

In [24]:
# Frequency table of each demographic column, most common value first
gender_counts = df['Gender'].value_counts()
print(f'Gender Distribution:\n{gender_counts}')

title_counts = df['Title'].value_counts()
print(f'Title Distribution:\n {title_counts}')

marital_status_counts = df['MaritalStatus'].value_counts()
print(f'Marital Status Distribution:\n {marital_status_counts}')

Gender Distribution:
Gender
Not specified    950526
Male              42817
Female             6755
Name: count, dtype: int64
Title Distribution:
 Title
Mr      933555
Mrs      45850
Ms       13269
Miss      6614
Dr         810
Name: count, dtype: int64
Marital Status Distribution:
 MaritalStatus
Not specified    994467
Single             4254
Married            1377
Name: count, dtype: int64


# Gender Imputation Analysis

## High Proportion of Unspecified Gender

A significant portion of the Gender values in the dataset is recorded as **Not specified**: **950,526** entries, roughly 95% of the 1,000,098 rows.

To improve the quality of the data, these unspecified gender values can be filled using information from the Title column where available.

## Title-to-Gender Mapping

Certain titles provide a clear indication of gender:

- **Mr**: Male
- **Mrs, Miss, Ms**: Female
- **Dr**: Ambiguous (leave as Not Specified)

## Approach to Impute Missing Genders Using Titles

1. **Mr** → Male
2. **Mrs, Miss, Ms** → Female
3. **Dr** → Leave as Not Specified (since it’s ambiguous)

By using this mapping, we can effectively impute missing gender values based on the available titles in the dataset.