In [15]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime
from bs4 import BeautifulSoup
import requests

In [40]:

def load_data(filepath):
    """Read the Excel workbook at ``filepath`` into a DataFrame.

    The file is parsed with the ``openpyxl`` engine.

    Parameters
    ----------
    filepath : path to the workbook, passed straight to ``pd.read_excel``.

    Returns
    -------
    The object returned by ``pd.read_excel``.
    """
    return pd.read_excel(filepath, engine='openpyxl')


def clean_data(data):
    """Clean the donor table and derive per-shipment and recency features.

    The frame is modified IN PLACE and the same object is returned.

    Steps, in order:
      1. Drop every row that contains a null value.
      2. Rename ``#Oz`` to ``Oz``, strip non-numeric text from its string
         entries (keeping the decimal point) and store it as a float
         rounded to 2 decimals.
      3. Coerce ``Number of Shipments`` to a number, drop the rows where the
         coercion failed, and store it as int.
      4. Add ``Pounds`` (``Oz / 16``), ``Pounds_per_Shipment`` and
         ``Oz_per_Shipment``, each rounded to 2 decimals.
      5. Add ``Current_Date`` (today) and ``Months_since_last_donation``,
         the calendar-month difference between today and ``DOLD``
         (the day of the month is ignored).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``#Oz`` (or ``Oz``), ``Number of Shipments`` and ``DOLD``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, cleaned and extended.

    Raises
    ------
    ValueError
        If an ``Oz`` entry cannot be converted to float after stripping.
    """
    # Drop rows (not columns) that contain any null value
    data.dropna(inplace=True)

    # Clean up the Oz column. Only string entries are touched by the regex;
    # '.' is kept so that "493.84 oz" becomes 493.84 rather than 49384.
    data.rename(columns={'#Oz': 'Oz'}, inplace=True)
    data['Oz'] = data['Oz'].replace(regex=[r'[^\d.]+'], value="")
    data['Oz'] = data['Oz'].astype(float).round(2)

    # Clean the number of shipments column: unparseable entries become NaN
    # and their rows are dropped before the cast to int
    data['Number of Shipments'] = pd.to_numeric(data['Number of Shipments'], errors='coerce')
    data.dropna(inplace=True)
    data['Number of Shipments'] = data['Number of Shipments'].astype(int)

    # Add pounds column (16 oz per pound)
    data['Pounds'] = (data['Oz'] / 16).round(2)

    # Determine average number of Pounds/Oz per shipment
    shipments = data['Number of Shipments']
    data['Pounds_per_Shipment'] = (data['Pounds'] / shipments).round(2)
    data['Oz_per_Shipment'] = (data['Oz'] / shipments).round(2)

    # Create a feature capturing the number of months since the last donation.
    # '10/0/3/2019' is a known malformed entry in the DOLD column.
    data['DOLD'] = data['DOLD'].replace('10/0/3/2019', '10/3/2019')
    data['Current_Date'] = pd.to_datetime(date.today())

    # Parse each date column once instead of once per use
    current = pd.to_datetime(data['Current_Date'])
    last_donation = pd.to_datetime(data['DOLD'])
    data['Months_since_last_donation'] = (
        (current.dt.year - last_donation.dt.year) * 12
        + (current.dt.month - last_donation.dt.month)
    )

    # Return the cleaned data
    return data


def _scrape_zip_coordinates(zip_code, timeout):
    """Fetch one zip code's page from zipdatamaps.com and return (lat, long) strings."""
    url = 'https://www.zipdatamaps.com/{}'.format(zip_code)
    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find(attrs={'class': "table table-striped table-bordered table-hover table-condensed"})
    if table is None:
        raise ValueError('No data table found on {}'.format(url))

    rows = [[cell.text for cell in row.find_all('td')] for row in table.find_all('tr')]
    try:
        # The coordinates are read from the second cell of the table's last
        # row, formatted as "<latitude>,<longitude>"
        lat, long = rows[-1][1].split(',')
    except (IndexError, ValueError) as exc:
        raise ValueError('Could not parse coordinates for zip code {}'.format(zip_code)) from exc

    return lat.strip(), long.strip()


def generate_zip_data(data, timeout=10):
    """Scrape latitude/longitude for every distinct zip code in ``data``.

    One HTTP request is made to zipdatamaps.com per unique, non-null value
    of the ``Zip code`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Zip code`` column. It is not modified.
    timeout : float, optional
        Seconds to wait for each HTTP request (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per unique zip code with columns ``Zip code``, ``Latitude``
        and ``Longitude``. The coordinates are the scraped strings with
        surrounding whitespace removed.

    Raises
    ------
    requests.RequestException
        If a request fails, times out, or returns an HTTP error status.
    ValueError
        If a page has no data table or its coordinates cannot be parsed.
    """
    # Get unique zipcodes; null entries are skipped so "nan" is never requested
    zipcode_list = data['Zip code'].dropna().unique()

    # Scrape the coordinates
    coordinates = [_scrape_zip_coordinates(zip_code, timeout) for zip_code in zipcode_list]

    # Create and return the zipcode dataframe
    return pd.DataFrame({
        "Zip code": zipcode_list,
        "Latitude": [pair[0] for pair in coordinates],
        "Longitude": [pair[1] for pair in coordinates],
    })

In [47]:
path_to_data = os.path.join(os.getcwd(), 'NYMB_updates.xlsx')

# Load and clean the data
# NOTE: clean_data modifies `data` in place and returns that same frame.
data = load_data(path_to_data)
data_cleaned = clean_data(data)

# Get coordinates for zipcodes. Pass the cleaned frame explicitly rather
# than relying on `data` having been cleaned as a side effect.
zipcode_df = generate_zip_data(data_cleaned)

# Merge the datasets
data_cleaned = pd.merge(data_cleaned, zipcode_df, how='inner', on='Zip code')

# Reorganize the columns
data_cleaned = data_cleaned[[
    'Donor Number',
    'Current_Date',
    'DOFD',
    'DOLD',
    'Months_since_last_donation',
    'Origin',
    'Neighborhood',
    'Zip code',
    'Latitude',
    'Longitude',
    'Number of Shipments',
    'Oz',
    'Oz_per_Shipment',
    'Pounds',
    'Pounds_per_Shipment',
]]

# Estimate the shipping cost
# TODO: Implement

display(data_cleaned)

Unnamed: 0,Donor Number,Current_Date,DOFD,DOLD,Months_since_last_donation,Origin,Neighborhood,Zip code,Latitude,Longitude,Number of Shipments,Oz,Oz_per_Shipment,Pounds,Pounds_per_Shipment
0,1,2022-10-17,2022-08-02,2022-08-02 00:00:00,2,Shipped from Donor,Battery Park,10280,40.70969600,-74.02023300,1,247.00,247.00,15.44,15.44
1,5,2022-10-17,2021-04-09,2021-05-07 00:00:00,17,Shipped from Donor,Battery Park,10280,40.70969600,-74.02023300,2,603.00,301.50,37.69,18.84
2,6,2022-10-17,2022-02-24,2022-02-24 00:00:00,8,Shipped from Donor,Battery Park,10280,40.70969600,-74.02023300,1,493.84,493.84,30.86,30.86
3,2,2022-10-17,2019-04-10,2019-05-10 00:00:00,41,\tColumbia Midtown,Battery Park,10282,40.71971300,-74.01464900,3,596.00,198.67,37.25,12.42
4,3,2022-10-17,2019-05-17,2019-05-17 00:00:00,41,Shipped from Donor,Battery Park,10282,40.71971300,-74.01464900,1,360.00,360.00,22.50,22.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,256,2022-10-17,2022-06-02,2022-06-02 00:00:00,4,Shipped from Donor,Washington Heights (upper west side),10040,40.85814000,-73.92921000,2,342.00,171.00,21.38,10.69
250,257,2022-10-17,2020-09-10,2020-09-10 00:00:00,25,Shipped from Donor,Washington Heights (upper west side),10040,40.85814000,-73.92921000,1,208.00,208.00,13.00,13.00
251,258,2022-10-17,2022-01-12,2022-02-11 00:00:00,8,Shipped from Donor,Washington Heights (upper west side),10040,40.85814000,-73.92921000,3,847.00,282.33,52.94,17.65
252,259,2022-10-17,2022-03-01,2022-08-04 00:00:00,2,Shipped from Donor,Washington Heights (upper west side),10040,40.85814000,-73.92921000,4,1104.00,276.00,69.00,17.25
