# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Read Training and Testing Data

In [2]:
# Load the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Display the column labels of the training frame (rendered as the cell output).
train.columns

Index(['id', 'perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'sourcing_channel', 'residence_area_type',
       'target'],
      dtype='object')

In [4]:
# Display the (rows, columns) dimensions of the training frame.
train.shape

(79853, 12)

# Create Dummy Variables

In [5]:
# One-hot encode each frame independently; pd.get_dummies with no `columns`
# argument converts the string/categorical columns and leaves the rest as-is.
train, test = pd.get_dummies(train), pd.get_dummies(test)

# Fill Missing Data

In [6]:
# Replace every missing value with 0 in both frames, rebinding the names
# rather than mutating the frames in place.
train = train.fillna(0)
test = test.fillna(0)

# Initialize a Scaler Object

In [7]:
# Standardising scaler with its default settings spelled out:
# centre each feature on its mean and scale it to unit variance.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Take the first 70,000 rows (by position) as the slice used for fitting;
# the remaining rows of `train` are not part of this training slice.
trainData = train.iloc[:70000]

# Separate Features (x) and Final Results (y) from Training Data

## Scale Both Training and Testing Data

In [9]:
# Labels are the "target" column; features are every other column, standardised
# with statistics learned from this training slice (xTrain ends up a NumPy array).
# NOTE(review): only "target" is dropped, so the 'id' column stays among the
# features — confirm that is intended.
yTrain = trainData["target"]
xTrain = scaler.fit_transform(trainData.drop(columns="target"))

In [10]:
# The two frames were one-hot encoded separately, so their dummy columns may differ
# in set or order. Align the test frame to the training feature columns, filling any
# column missing from the test frame with 0 and dropping any extra ones.
feature_columns = train.columns.drop("target")
testData = test.reindex(columns=feature_columns, fill_value=0)
# Apply the statistics already learned from the training features. Refitting the
# scaler here would standardise the test set with different means/variances than
# the ones the model is trained on.
testData = scaler.transform(testData)

# Initialize Object of Random Forest Regressor with Parameters

## Parameters Here Were Chosen After Repeated Testing

In [11]:
# 250 trees with depth capped at 6. A fixed random_state makes the forest's
# randomness (e.g. bootstrap sampling) repeatable, so re-running the notebook
# from a fresh kernel produces the same model and predictions.
regr = RandomForestRegressor(n_estimators=250, max_depth=6, random_state=42)

# Train the Model

In [12]:
# Fit the forest on the scaled training features and their labels; fit() returns
# the estimator itself, which is what the cell displays.
regr.fit(X=xTrain, y=yTrain)

RandomForestRegressor(max_depth=6, n_estimators=250)

# Predict on Test Dataset

In [13]:
# Score the scaled test features with the fitted forest.
predict = regr.predict(X=testData)

# Save Results as CSV for Checking

In [14]:
# Pair each test row's id with its predicted value, in that column order.
submission = pd.DataFrame({"id": test["id"], "target": predict})

In [15]:
# Write the predictions with a header row (the to_csv default) and without the index column.
submission.to_csv("submission.csv", index=False)