In [1]:
import pandas as pd
import folium
import numpy as np
import scipy
from scipy import stats

In [2]:
# Load your CSV data
movement_data = pd.read_csv("../data/movements.csv", parse_dates=["datetime"])  # Replace with your file path
restaurant_data = pd.read_csv("../data/restaurants.csv")  # Replace with your file path

# Clean Data

In [3]:
movement_data.columns = movement_data.columns.str.lower()
restaurant_data.columns = restaurant_data.columns.str.lower()
restaurant_data.rename(columns={'restaurant id': 'restaurant_id'})

print(movement_data.head())
print(restaurant_data.head())



             datetime    id   longitude   latitude
0 2020-01-01 00:00:00  I000 -122.335167  47.608013
1 2020-01-01 00:01:07  I000 -122.335166  47.608462
2 2020-01-01 00:01:37  I000 -122.335167  47.608432
3 2020-01-01 00:02:35  I000 -122.335166  47.608682
4 2020-01-01 00:04:05  I000 -122.335166  47.608841
  restaurant id                   name  \
0          R000   Pullman Quick Stop 0   
1          R001   Pullman Quick Stop 1   
2          R002   Pullman Department 2   
3          R003    Pullman Groceries 3   
4          R004  Pullman Supercenter 4   

                                            category   longitude   latitude  
0                                 Convenience Stores -122.435908  47.621564  
1                                 Convenience Stores -122.297377  47.649300  
2                                  Department Stores -122.338057  47.666842  
3  Supermarkets/Other Grocery (Exc Convenience) Strs -122.242926  47.587846  
4                     Warehouse Clubs & Supercenter

In [4]:
print(movement_data.isnull().sum())
print(restaurant_data.isnull().sum())

datetime     0
id           0
longitude    0
latitude     0
dtype: int64
restaurant id    0
name             0
category         0
longitude        0
latitude         0
dtype: int64


# Exploratory Data Analysis

In [5]:
# Haversine distance function (Earth radius in kilometers)
def haversine(lon1, lat1, lon2, lat2):
    R = 6371  # Radius of Earth in kilometers
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Define a function to check if each movement is near a restaurant
def is_near_restaurant(movement, restaurants, threshold_km=0.5):
    for _, restaurant in restaurants.iterrows():
        distance = haversine(movement['longitude'], movement['latitude'],
                             restaurant['longitude'], restaurant['latitude'])
        if distance <= threshold_km:
            return True, restaurant['id']  # Indicate a visit and the restaurant ID
    return False, None  # No nearby restaurant

# Apply function to each movement and create a column indicating restaurant visits
movement_data['visited_restaurant'] = movement_data.apply(
    lambda row: is_near_restaurant(row, restaurant_data)[0], axis=1
)
movement_data['restaurant_id'] = movement_data.apply(
    lambda row: is_near_restaurant(row, restaurant_data)[1], axis=1
)

KeyError: 'id'