In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

df = pd.read_csv("../data/processed/claims_processed.csv")

geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

# Simulated spatial feature
gdf["distance_to_city_center_km"] = (
    ((gdf.latitude - 51.5074)**2 + (gdf.longitude + 0.1278)**2)**0.5 * 111
)

gdf.head()


Unnamed: 0,claim_id,latitude,longitude,claim_amount,claim_type,policy_type,incident_date,high_risk_claim,high_claim,geometry,distance_to_city_center_km
0,1,51.506181,-0.225947,539.51,Vehicle,Standard,2023-02-26,0,0,POINT (-0.22595 51.50618),10.895138
1,2,51.592607,-0.08324,567.6,Theft,Comprehensive,2023-10-25,0,0,POINT (-0.08324 51.59261),10.673263
2,3,51.559799,0.049178,2202.33,Property,Standard,2023-04-08,1,1,POINT (0.04918 51.5598),20.487544
3,4,51.539799,-0.00711,580.95,Vehicle,Standard,2023-05-17,0,0,POINT (-0.00711 51.5398),13.87089
4,5,51.473403,0.022624,1147.46,Property,Standard,2023-03-28,0,0,POINT (0.02262 51.4734),17.118247


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

features = ["claim_amount", "distance_to_city_center_km"]
X = gdf[features]
y = gdf["high_claim"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       162
           1       1.00      1.00      1.00       138

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

