## Imports and Environment Variables

In [1]:
import os
from dotenv import load_dotenv

In [2]:
# Load variables defined in a local .env file into the process environment
load_dotenv()

# None when SUBSCRIPTION_ID is not set in the environment
SUBSCRIPTION_ID = os.environ.get('SUBSCRIPTION_ID')

## Feature Engineering

In [3]:
import pandas as pd

# Read the training data; the first CSV column (shown as `id` in the preview
# output below) is used as the DataFrame index rather than as a feature column
df = pd.read_csv('src/data/train.csv', index_col=0)

In [5]:
# Explore the data: column dtypes first, then a random 10-row preview
print(df.dtypes)
# Fixed seed so the same rows are shown on every Restart & Run All
df.sample(10, random_state=42)


Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6814871,Male,26,1,8.0,1,< 1 Year,No,27784.0,152.0,106,0
6066432,Female,65,1,5.0,0,> 2 Years,Yes,36397.0,26.0,90,0
7538684,Male,22,1,34.0,1,< 1 Year,No,31478.0,152.0,126,0
4832906,Male,77,1,28.0,0,1-2 Year,Yes,2630.0,122.0,157,0
7103026,Male,28,1,28.0,0,1-2 Year,Yes,36581.0,156.0,95,1
10458005,Male,35,1,28.0,0,1-2 Year,Yes,29669.0,26.0,218,1
279568,Female,37,1,4.0,0,1-2 Year,Yes,23409.0,124.0,257,1
6722233,Female,51,1,15.0,1,1-2 Year,No,42063.0,163.0,177,0
5026776,Male,61,1,48.0,1,1-2 Year,Yes,2630.0,26.0,152,0
2405876,Female,21,1,50.0,1,< 1 Year,No,40605.0,160.0,244,0


## Generate training scripts

In [6]:
# Make sure the directory that will hold the generated script files exists
script_folder = 'src'
os.makedirs(script_folder, exist_ok=True)  # no error if it is already there
print(f"{script_folder} folder created")

src folder created


In [13]:
%%writefile $script_folder/train.py
# Import libraries
import argparse
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression

def parse_args(argv=None):
    """Parse the command-line arguments of the training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing ``parse_args()`` callers behave exactly as before.

    Returns:
        argparse.Namespace with two string attributes, ``train_data`` and
        ``test_data``, taken from the required ``--train-data`` and
        ``--test-data`` options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-data", dest="train_data", type=str, required=True)
    parser.add_argument("--test-data", dest="test_data", type=str, required=True)

    # Passing argv through lets tests and notebooks call this without
    # having to patch sys.argv.
    return parser.parse_args(argv)

def main(args):
    """Train a logistic regression model and print its test accuracy.

    Reads the training and test CSV files, ordinal-encodes the categorical
    columns, min-max scales the selected numerical columns, fits the model on
    the training set and prints the fraction of correct test predictions.

    Args:
        args: Parsed command-line arguments exposing ``train_data`` and
            ``test_data``, the paths handed to ``read_data``.
    """
    print("Reading data...")
    train_df = read_data(args.train_data)
    test_df = read_data(args.test_data)

    categorical_columns = ["Gender", "Vehicle_Age", "Vehicle_Damage"]
    numerical_columns = ["Age", "Annual_Premium"]

    print("Preprocessing data and applying encoding to categorical columns...")

    # Apply ordinal encoding to categorical columns. Each encoder is fitted on
    # the training column only and then reused on the test column.
    for col in categorical_columns:
        encoder = OrdinalEncoder()
        # The transformers work on (n, 1) arrays; ravel() flattens the result
        # back to 1-D before it is stored in a single column.
        train_df[col] = encoder.fit_transform(train_df[col].values.reshape(-1, 1)).ravel()
        test_df[col] = encoder.transform(test_df[col].values.reshape(-1, 1)).ravel()

    # Normalize the data, again fitting on the training column only
    for col in numerical_columns:
        scaler = MinMaxScaler()
        train_df[col] = scaler.fit_transform(train_df[col].values.reshape(-1, 1)).ravel()
        test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1)).ravel()

    # Every column except the 'Response' target is used as a feature
    X_train = train_df.drop('Response', axis=1)
    y_train = train_df['Response']

    # Train using logistic regression
    print("Training model...")
    model = train_logistic_regression_model(X_train, y_train)

    # Calculate accuracy
    X_test = test_df.drop('Response', axis=1)
    y_test = test_df['Response']

    print("Evaluating model...")

    predictions = model.predict(X_test)
    # Mean of the element-wise matches == share of correct predictions
    accuracy = np.average(y_test == predictions)
    print("Accuracy:", accuracy)
    
    
def train_logistic_regression_model(X_train, y_train):
    """Fit a logistic regression classifier (liblinear solver) and return it.

    Args:
        X_train: Training features passed straight to ``fit``.
        y_train: Training labels passed straight to ``fit``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # fit() returns the estimator itself, so the call can be chained
    return LogisticRegression(solver='liblinear').fit(X_train, y_train)
    
def read_data(data_path):
    """Load a CSV file into a DataFrame, using its first column as the index.

    Args:
        data_path: Location of the CSV, passed unchanged to ``pd.read_csv``.

    Returns:
        pandas.DataFrame holding the file contents.
    """
    return pd.read_csv(data_path, index_col=0)

if __name__ == "__main__":
    # Frame the run's console output between two identical star banners,
    # each padded by blank lines, so it stands out in job logs.
    banner = "*" * 60

    print("\n")
    print(banner)

    main(parse_args())

    print(banner)
    print("\n")

Overwriting src/train.py
