In [None]:
# necessary imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting libraries.
warnings.filterwarnings('ignore')

# Use the 'ggplot' stylesheet for all matplotlib figures drawn below.
plt.style.use('ggplot')

In [None]:
# Load the claims data from a CSV located in the current working directory.
df = pd.read_csv('./insurance_claims.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# '?' is used in this file as a placeholder for a missing entry;
# swap it for NaN so that pandas recognises those cells as missing.

df = df.replace(to_replace = '?', value = np.nan)

In [None]:
# Summary statistics for the frame.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# number of missing entries in each column
df.isnull().sum()

In [None]:
# Visual overview of per-column completeness using the missingno package.
import missingno as msno

msno.bar(df)
plt.show()

In [None]:
# Fill the missing entries of each listed categorical column with that column's
# most frequent value. A single loop replaces three copy-pasted cells so the
# imputation rule is written once.
for col in ['collision_type', 'property_damage', 'police_report_available']:
    # mode() returns a Series of the most frequent value(s); take the first.
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Re-count missing entries per column after the fills above.
df.isna().sum()

In [None]:
# heatmap of the pairwise correlations between the numeric columns

plt.figure(figsize = (18, 12))

# Restrict to numeric columns before correlating: the frame still holds
# string-valued columns, and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently skipping them. select_dtypes works on every pandas version.
corr = df.select_dtypes(include = 'number').corr()

sns.heatmap(data = corr, annot = True, fmt = '.2g', linewidth = 1)
plt.show()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# columns judged unnecessary for prediction are removed here

to_drop = [
    'policy_number', 'policy_bind_date', 'policy_state', 'insured_zip',
    'incident_location', 'incident_date', 'incident_state', 'incident_city',
    'insured_hobbies', 'auto_make', 'auto_model', 'auto_year', '_c39',
]

df = df.drop(columns = to_drop)

In [None]:
# Preview the frame after the column drop.
df.head()

In [None]:
# checking for multicollinearity

plt.figure(figsize = (18, 12))

# Correlate numeric columns only: string-valued columns are still present and
# DataFrame.corr() raises on them in pandas >= 2.0.
corr = df.select_dtypes(include = 'number').corr()

# Hide the upper triangle (including the diagonal) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype = bool))

sns.heatmap(data = corr, mask = mask, annot = True, fmt = '.2g', linewidth = 1)
plt.show()

**From the above plot, we can see that `age` and `months_as_customer` are highly correlated, so we will drop the `age` column. `total_claim_amount` is also highly correlated with `injury_claim`, `property_claim` and `vehicle_claim`, because the total claim is the sum of the other three, so we will drop the `total_claim_amount` column as well.**

In [None]:
# Remove the two columns identified as redundant by the correlation check.
df = df.drop(columns = ['age', 'total_claim_amount'])

In [None]:
# Preview the frame after dropping the correlated columns.
df.head()

In [None]:
# Dtypes and non-null counts of the remaining columns.
df.info()

In [None]:
# Report how many columns the frame has, then list their names.
num_columns = df.shape[1]

print(f"The DataFrame has {num_columns} columns.")
print("The columns are:")
print(list(df.columns))

In [None]:
# split the frame into the target vector and the feature matrix

# Encode the target as binary: 1 for 'Y', 0 for 'N'.
y = df['fraud_reported'].map({'Y': 1, 'N': 0})
X = df.drop(columns = ['fraud_reported'])

In [None]:
# Report how many feature columns there are, list them, then show their dtypes.
num_columns = X.shape[1]

print(f"The DataFrame has {num_columns} columns.")
print("The columns are:")
print(list(X.columns))

X.info()

In [None]:
# keep only the object-dtype (categorical) feature columns
cat_df = X.select_dtypes(include = 'object')

In [None]:
# Report how many categorical columns there are, list them, then show their dtypes.
num_columns = cat_df.shape[1]

print(f"The DataFrame has {num_columns} columns.")
print("The columns are:")
print(list(cat_df.columns))
cat_df.info()

In [None]:
# Preview the categorical columns before encoding.
cat_df.head()

In [None]:
# show the distinct values found in every categorical column
for name, values in cat_df.items():
    print(f"{name}: \n{values.unique()}\n")

In [None]:
# One-hot encode the categorical columns; drop_first removes the first level of each.
cat_df = pd.get_dummies(data = cat_df, drop_first = True)

In [None]:
# Preview the one-hot encoded columns.
cat_df.head()

In [None]:
# extracting the numerical columns

# Select every numeric dtype, not just int64: with include=['int64'] any
# float-valued (or differently sized integer) feature falls into neither
# num_df nor cat_df and silently disappears from the final dataset.
num_df = X.select_dtypes(include = 'number')

In [None]:
# Preview the numeric feature columns.
num_df.head()

In [None]:
# final dataset = numeric features followed by the encoded categorical features

X = pd.concat(objs = [num_df, cat_df], axis = 'columns')

In [None]:
# Preview the combined feature matrix.
X.head()

In [None]:
# Distribution of the first 24 feature columns on a 5x5 grid.
plt.figure(figsize = (25, 20))

# Slice the columns up front instead of looping over all of them and skipping
# everything past the 24th.
for plotnumber, col in enumerate(X.columns[:24], start = 1):
    ax = plt.subplot(5, 5, plotnumber)
    # sns.distplot is deprecated; histplot with kde=True and stat='density' is
    # its documented replacement. Cast to float so encoded dummy columns
    # (bool or uint8, depending on the pandas version) plot like numbers.
    sns.histplot(X[col].astype(float), kde = True, stat = 'density', ax = ax)
    ax.set_xlabel(col, fontsize = 15)

plt.tight_layout()
plt.show()

**Data looks good, let's check for outliers.**

In [None]:
# Box plots of the first 24 feature columns on a 5x5 grid, to spot outliers.
plt.figure(figsize = (20, 15))

# Slice the columns up front instead of looping over all of them and skipping
# everything past the 24th.
for plotnumber, col in enumerate(X.columns[:24], start = 1):
    ax = plt.subplot(5, 5, plotnumber)
    # Pass the values as x= explicitly: a bare positional argument is read as
    # `x` by older seaborn releases and as `data` by newer ones, which flips the
    # orientation. x= keeps the values on the axis that is labelled below.
    # Cast to float so encoded dummy columns (bool or uint8, depending on the
    # pandas version) are treated as numbers.
    sns.boxplot(x = X[col].astype(float), ax = ax)
    ax.set_xlabel(col, fontsize = 15)

plt.tight_layout()
plt.show()

**Outliers are present in some numerical columns; we will scale the numerical columns later.**

In [None]:
# splitting data into training set and test set
# (left disabled here: the split is performed in the model-training cell further down)

# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [None]:
# Sanity-check the shapes before modelling. The train/test split is only created
# in the model-training cell, so X_train / y_train do not exist yet on a fresh
# Restart & Run All; inspect the full feature matrix and target instead.
print(X.shape)  # Should be (num_samples, num_features)
print(y.shape)  # Should be (num_samples,)


In [None]:
# !tensorflowjs_converter --input_format=tf_saved_model --output_node_names='Identity' --saved_modddel_tags=serve /path/to/your/saved_model /path/to/output/folder
!pip install tensorflowjs

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Trains a small dense classifier on the pre-processed features X and the
# binary target y, reports test accuracy and saves the trained model.

# One seed for the split and for Python / NumPy / TensorFlow, so that weight
# initialisation and batch shuffling are repeatable between runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=SEED)

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Number of output units = number of distinct labels. nunique() ignores NaN,
# whereas len(set(y)) would count missing labels as extra classes.
num_classes = int(y.nunique())

# Build a simple neural network model using TensorFlow.
# tf.keras.Input(shape=...) is accepted by both Keras 2 and Keras 3; the
# InputLayer(input_shape=...) spelling is deprecated in Keras 3.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
    tf.keras.layers.Dense(64, activation='relu'),  # Hidden layer with 64 units
    tf.keras.layers.Dense(num_classes, activation='softmax')  # one unit per class
])

# Compile the model (integer class labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model. The test split is only monitored here; nothing in this cell
# (no early stopping, no checkpointing) selects a model based on it.
model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)

# Print the test accuracy
print(f'Test Accuracy: {test_acc * 100:.2f}%')

# Cross-check: predict on the test set and compute the accuracy with scikit-learn
y_pred = model.predict(X_test)
y_pred_classes = tf.argmax(y_pred, axis=1).numpy()  # Get the predicted class indices
accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Manual Accuracy: {accuracy * 100:.2f}%')

# Save the model in TensorFlow SavedModel format
try:
    model.save('saved_model/my_model')
except ValueError:
    # Keras 3 rejects save() paths without a .keras/.h5 extension; export()
    # writes a SavedModel to the same directory instead.
    model.export('saved_model/my_model')