In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv("/content/drive/MyDrive/Week3/MachineLearningRating_v3.csv", delimiter="|")


  df = pd.read_csv("/content/drive/MyDrive/Week3/MachineLearningRating_v3.csv", delimiter="|")


In [5]:
df.columns


Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet',
       'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium',
       'TotalClaims'],
      dtype='object')

In [6]:
# Changing month transaction to date time datype
df['TransactionMonth'] = pd.to_datetime(df["TransactionMonth"])
# Dorpping rows with a Not specified gender values
df = df[df['Gender'] != "Not specified"]
df = df.drop("NumberOfVehiclesInFleet", axis = 1)

Importing neccessary libraries for imputing and modeling tasks

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor

In [8]:
simpleimputer = SimpleImputer(strategy= "most_frequent")
knnimputer = KNNImputer(n_neighbors = 3)



In [9]:
def nearest_date_imputer(X):
    """Imputes missing datetime values with the nearest valid date."""
    df = pd.DataFrame(X)
    df.columns = ['TransactionMonth']  # Assuming single datetime column
    df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'])

    # Create a mask for missing values
    mask = df['TransactionMonth'].isnull()

    # For missing values, find the nearest valid date
    df.loc[mask, 'TransactionMonth'] = df.loc[mask, 'TransactionMonth'].apply(
        lambda x: df['TransactionMonth'].dropna().iloc[(df['TransactionMonth'].dropna() - x).abs().argsort()[0]]
        if pd.notna(x) else x
    )
    # Convert 'TransactionMonth' to datetime64[ns] and then to integers
    return df['TransactionMonth'].astype('datetime64[ns]').astype(int).values.reshape(-1, 1)

In [15]:
def string_converter_func(x):
    return x.astype(str)

def bool_to_int_func(x):
    return x.astype(int)

In [10]:
x = df.drop(['TotalPremium','TotalClaims'], axis=1)
y = df[['TotalPremium','TotalClaims']]

numerical_features = x.select_dtypes(include = np.number).columns
categorical_features = x.select_dtypes(include = ['object']).columns
boolean_features = x.select_dtypes(include=['bool']).columns
datetime_features = x.select_dtypes(include=['datetime64']).columns

# split to train test with 80 to 20 ratio
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [16]:

# Pipelines for different data types
numerical_pipeline = Pipeline([
    ('knn_imputer', KNNImputer(n_neighbors=3))
])

categorical_pipeline = Pipeline([
    ('mode_imputer', SimpleImputer(strategy='most_frequent')),
    # Convert all values to strings before applying OneHotEncoder
    ('string_converter', FunctionTransformer(string_converter_func)),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

boolean_pipeline = Pipeline([
     ('knn_imputer', KNNImputer(n_neighbors=3)),  # Replace SimpleImputer with KNNImputer
    ('bool_to_int', FunctionTransformer(bool_to_int_func)) # Convert boolean to int
])

datetime_pipeline = Pipeline([
    ('nearest_date_imputer', FunctionTransformer(nearest_date_imputer)),
])

# Combine all pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
        ('bool', boolean_pipeline, boolean_features),
        ('datetime', datetime_pipeline, datetime_features)
    ])

# Final pipeline with the model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# ... (rest of your code for fitting and evaluating the model) ...

Forming Test and trainn values with targets total primium and total claims to fit it to the pipe line formd above

Fit the train sets to the pipeline and train it

In [18]:
pipeline.fit(x_train, y_train)




In [19]:
from sklearn.metrics import mean_absolute_error, r2_score

y_pred = pipeline.predict(x_test)
# Assuming y_test and y_pred are defined
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"R² Score: {r2}")




MAE: 41.32579276605776
R² Score: 0.44286356544775074


In [20]:
import joblib

# Assuming `pipeline` is your trained model
joblib.dump(pipeline, 'model_pipeline.pkl')


['model_pipeline.pkl']

In [21]:
feature_importances = pipeline.named_steps['model'].feature_importances_
sorted_features = sorted(zip(feature_importances, numerical_features), reverse=True)
print(sorted_features)


[(0.12011366596328851, 'UnderwrittenCoverID'), (0.033658475911234956, 'PolicyID'), (0.028274244561482326, 'CalculatedPremiumPerTerm'), (0.013073773008509957, 'SumInsured'), (0.012724195700421896, 'CustomValueEstimate'), (0.009040758364317171, 'RegistrationYear'), (0.005616462295414932, 'PostalCode'), (0.001949733093479403, 'mmcode'), (0.001690414673609269, 'kilowatts'), (0.0016316996847981747, 'cubiccapacity'), (0.00011730104498132904, 'NumberOfDoors'), (6.791452110190145e-06, 'Cylinders')]


In [None]:


features = x_train.columns

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Plot
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.gca().invert_yaxis()  # Invert y-axis to show the most important features on top
plt.show()
