# Ride Fare Anomaly Detection - Data Exploration

In [None]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import IsolationForest

In [None]:
# Read the ride-fare CSV into the working DataFrame `df`.
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv(filepath_or_buffer='../data/ride_fares.csv')

In [None]:
# Basic data exploration: column dtypes / non-null counts, then summary stats.

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

print("\nDescriptive Statistics:")
print(df.describe())

In [None]:
# Histogram of the 'fare' column with a kernel-density overlay.
plt.figure(figsize=(10, 6))

# histplot draws on the current axes and hands that Axes object back,
# so the labels can be set on it directly.
fare_ax = sns.histplot(df['fare'], kde=True)
fare_ax.set_title('Distribution of Ride Fares')
fare_ax.set_xlabel('Fare')
fare_ax.set_ylabel('Frequency')

plt.show()

In [None]:
# Scatter of fare against the combined pickup + drop distance.
total_distance = df['pickup_distance'] + df['drop_distance']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(total_distance, df['fare'])
ax.set_title('Total Distance vs Fare')
ax.set_xlabel('Total Distance (km)')
ax.set_ylabel('Fare')

plt.show()

In [None]:
# Build the model input: the two distance columns plus the fare.
features = ['pickup_distance', 'drop_distance', 'fare']
X = df.loc[:, features]

# Standardise each feature column (StandardScaler defaults: remove the mean,
# scale to unit variance). Fit first, then transform the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Fit an Isolation Forest on the standardised features.
forest_params = {
    'contamination': 0.1,  # proportion of samples the model will flag as outliers
    'random_state': 42,    # fixed seed so repeated runs build the same trees
}
iso_forest = IsolationForest(**forest_params)

# Kept as the cell's final expression so the fitted estimator is still displayed.
iso_forest.fit(X_scaled)

In [None]:
# Label every ride: IsolationForest.predict yields -1 for outliers, 1 for inliers.
anomaly_labels = iso_forest.predict(X_scaled)
df['anomaly'] = anomaly_labels

# Keep only the rows the model flagged as outliers.
anomalies = df.loc[df['anomaly'].eq(-1)]

print("\nAnomaly Statistics:")
print(anomalies)

In [None]:
# Same distance-vs-fare scatter, coloured by the model's label in df['anomaly'].
trip_distance = df['pickup_distance'] + df['drop_distance']

fig, ax = plt.subplots(figsize=(12, 6))
points = ax.scatter(trip_distance, df['fare'], c=df['anomaly'], cmap='viridis')
ax.set_title('Anomaly Detection in Ride Fares')
ax.set_xlabel('Total Distance (km)')
ax.set_ylabel('Fare')

# Bind the colorbar explicitly to the scatter artist drawn above.
fig.colorbar(points, ax=ax, label='Anomaly')

plt.show()