### This script contains the following:

#### 1. Importing libraries and data
#### 2. Calculation of distances between zip codes for a new column


In [1]:
# 01 Importing Libraries
import pandas as pd
import numpy as np
import os

In [2]:
## 02 Load the cleaned reservations CSV into the working dataframe "df"
# index_col=False: do not take the row index from the first column of the file.
# low_memory=False: read each column in one pass instead of in chunks.
df = pd.read_csv(
    '/Users/emmawilcox/Desktop/camper_clean.csv',
    low_memory=False,
    index_col=False,
)

In [3]:
## 03 Previewing the first rows of df
df.head()

Unnamed: 0.1,Unnamed: 0,ordernumber,agency,orgid,regioncode,regiondescription,parentlocationid,parentlocation,park,sitetype,...,discount,totalpaid,startdate,enddate,orderdate,nights,numberofpeople,equipmentdescription,clean_startdate,clean_enddate
0,1488,0343383462-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,0.0,258.0,2021-11-11T00:00:00Z,2021-11-16T00:00:00Z,2021-09-08T02:10:33.470611Z,5,12,Trailer,2021-11-11,2021-11-16
1,1492,0400231814-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,0.0,108.0,2022-03-18T00:00:00Z,2022-03-20T00:00:00Z,2022-02-06T01:38:16.919819Z,2,4,Fifth Wheel,2022-03-18,2022-03-20
2,1504,0404840886-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,0.0,108.0,2022-03-11T00:00:00Z,2022-03-13T00:00:00Z,2021-11-09T01:31:10.500458Z,2,12,Trailer,2022-03-11,2022-03-13
3,1525,0410136112-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,0.0,58.0,2022-01-09T00:00:00Z,2022-01-10T00:00:00Z,2021-12-12T22:17:51.124814Z,1,6,RV,2022-01-09,2022-01-10
4,1546,0416249892-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,0.0,158.0,2022-02-18T00:00:00Z,2022-02-21T00:00:00Z,2022-01-05T17:24:17.047159Z,3,12,Fifth Wheel,2022-02-18,2022-02-21


In [4]:
## 04 Checking for mixed type columns
# Print the name of every column that holds values of more than one Python type.
# Series.map(type) replaces DataFrame.applymap, which is deprecated in pandas 2.1+,
# and counting distinct types avoids indexing the first row, so an empty
# dataframe prints nothing instead of raising IndexError.
for col in df.columns.tolist():
    value_types = df[col].map(type)
    if value_types.nunique() > 1:
        print(col)

#### 2. Calculation of distances between zip codes for a new column

In [6]:
## 05 Part 1 Calculating distances between zip codes by getting their lat/long
from uszipcode import SearchEngine
import math
import pandas as pd

# Initialize the SearchEngine
# This instance is bound to the global name `search`, which get_lat_lon below
# uses for its by_zipcode lookups.
search = SearchEngine()

# Function to calculate distance between two sets of latitude and longitude
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in miles between two points.

    Uses the haversine formula on a sphere of radius 3958.8 miles.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along the sphere's surface, in miles. NaN inputs yield NaN.
    """
    R = 3958.8  # Earth radius in miles

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    sin_half_dlat = math.sin(dlat / 2)
    sin_half_dlon = math.sin(dlon / 2)

    a = (
        sin_half_dlat * sin_half_dlat
        + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * sin_half_dlon * sin_half_dlon
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # Plain comparisons are used (rather than min/max) so a NaN stays NaN.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c

# Function to get latitude and longitude from a zip code
def get_lat_lon(zip_code):
    """Look up a zip code and return its coordinates.

    Returns a ``(lat, lng)`` tuple taken from the ``search.by_zipcode`` result,
    or ``(None, None)`` when that result is falsy.
    """
    match = search.by_zipcode(zip_code)
    if not match:
        return None, None
    return match.lat, match.lng

In [8]:
## 06 Putting distances in miles in a new column, "distance"
def _distance_for_row(row):
    """Return the customer-to-facility distance in miles for one row, or None.

    Each zip code is looked up exactly once per row (the previous inline
    lambda repeated the same lookup up to eight times). None is returned when
    either lookup is missing a latitude or a longitude.
    """
    cust_lat, cust_lon = get_lat_lon(row['customerzip'])
    fac_lat, fac_lon = get_lat_lon(row['facilityzip'])
    if any(coord is None for coord in (cust_lat, cust_lon, fac_lat, fac_lon)):
        return None
    return calculate_distance(cust_lat, cust_lon, fac_lat, fac_lon)


df['distance'] = df.apply(_distance_for_row, axis=1)

In [11]:
## 07 Previewing df to spot-check the new "distance" column; a sample was compared
## against Google to confirm the values are in miles and accurate
df.head()

Unnamed: 0.1,Unnamed: 0,ordernumber,agency,orgid,regioncode,regiondescription,parentlocationid,parentlocation,park,sitetype,...,totalpaid,startdate,enddate,orderdate,nights,numberofpeople,equipmentdescription,clean_startdate,clean_enddate,distance
0,1488,0343383462-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,258.0,2021-11-11T00:00:00Z,2021-11-16T00:00:00Z,2021-09-08T02:10:33.470611Z,5,12,Trailer,2021-11-11,2021-11-16,122.820784
1,1492,0400231814-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,108.0,2022-03-18T00:00:00Z,2022-03-20T00:00:00Z,2022-02-06T01:38:16.919819Z,2,4,Fifth Wheel,2022-03-18,2022-03-20,56.785258
2,1504,0404840886-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,108.0,2022-03-11T00:00:00Z,2022-03-13T00:00:00Z,2021-11-09T01:31:10.500458Z,2,12,Trailer,2022-03-11,2022-03-13,103.5832
3,1525,0410136112-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,58.0,2022-01-09T00:00:00Z,2022-01-10T00:00:00Z,2021-12-12T22:17:51.124814Z,1,6,RV,2022-01-09,2022-01-10,809.033807
4,1546,0416249892-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,158.0,2022-02-18T00:00:00Z,2022-02-21T00:00:00Z,2022-01-05T17:24:17.047159Z,3,12,Fifth Wheel,2022-02-18,2022-02-21,71.057958


In [10]:
## 08 Initialize the uszipcode search engine to get state from customer's zip code for new column
# NOTE(review): this rebinds the global `search` that step 05 already created;
# the earlier instance looks sufficient for the lookups below — confirm before removing.
search = SearchEngine()

# Function to get state name from zip code
def get_state_name(zip_code):
    """Return the ``state`` attribute of the zip code's lookup result.

    Gives None when ``search.by_zipcode`` yields a falsy result.
    """
    match = search.by_zipcode(zip_code)
    if match:
        return match.state
    return None

# Run get_state_name over every 'customerzip' value and store the results
# in the new 'customerstate' column
customer_states = df['customerzip'].apply(get_state_name)
df['customerstate'] = customer_states

In [11]:
## 09 Previewing df to confirm the new "customerstate" column is present
df.head()

Unnamed: 0.1,Unnamed: 0,ordernumber,agency,orgid,regioncode,regiondescription,parentlocationid,parentlocation,park,sitetype,...,startdate,enddate,orderdate,nights,numberofpeople,equipmentdescription,clean_startdate,clean_enddate,customerstate,distance
0,1488,0343383462-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,2021-11-11T00:00:00Z,2021-11-16T00:00:00Z,2021-09-08T02:10:33.470611Z,5,12,Trailer,2021-11-11,2021-11-16,AZ,122.820784
1,1492,0400231814-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,2022-03-18T00:00:00Z,2022-03-20T00:00:00Z,2022-02-06T01:38:16.919819Z,2,4,Fifth Wheel,2022-03-18,2022-03-20,AZ,56.785258
2,1504,0404840886-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,2022-03-11T00:00:00Z,2022-03-13T00:00:00Z,2021-11-09T01:31:10.500458Z,2,12,Trailer,2022-03-11,2022-03-13,AZ,103.5832
3,1525,0410136112-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,2022-01-09T00:00:00Z,2022-01-10T00:00:00Z,2021-12-12T22:17:51.124814Z,1,6,RV,2022-01-09,2022-01-10,MT,809.033807
4,1546,0416249892-1,BLM,126,AZ,Arizona,16669,Kingman Field Office,Burro Creek Campground,GROUP STANDARD NONELECTRIC,...,2022-02-18T00:00:00Z,2022-02-21T00:00:00Z,2022-01-05T17:24:17.047159Z,3,12,Fifth Wheel,2022-02-18,2022-02-21,AZ,71.057958


In [12]:
## 10 Defining "path" as a shortcut to the Desktop folder that the export below writes to
path = r'/Users/emmawilcox/Desktop'

In [13]:
## 11 Exporting cleaned reservation data with new distance and state columns
# index=False keeps the row index out of the file. Writing the index is what
# produces the stray "Unnamed: 0" columns when the CSV is read back in.
df.to_csv(os.path.join(path, 'camper_clean_distance.csv'), index=False)