In [2]:
import json
import requests
import pandas as pd
import re


def genToken():
    url = "https://api.cloud.precisely.com/auth/v2/token"
    
    # Create Payload
    payload = 'grant_type=client_credentials&scope=default'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Authorization': 'Basic NzdjOWRlNTk5YjM3YTEwZjpvR3UzRFNpRExVdG5MZDVYR3FFdXJOenJJVUVHZkdtMw=='  # Replace with your actual Base64-encoded credentials
    }
    
    # Send POST request to retrieve token
    response = requests.post(url, headers=headers, data=payload)

    if response.status_code == 200:
        # Parse JSON and return the access token
        token_info = response.json()
        return token_info.get('access_token')
    else:
        print(f"Failed to get token: {response.status_code} - {response.text}")
        return None

# Generate and print the token
token = genToken()
print(token)

# Define your Precisely API token
access_token = token

# API endpoint for Precisely
url = "https://api.cloud.precisely.com/data-graph/graphql"

# Set headers including the authorization token
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {access_token}'
}

# Loop in University City addresses
df = pd.read_csv("UnivCity.csv")
print(df.head())

df = df.head(20)
### For testing just select top 20

# Prepare columns for the new data
df['floodZone'] = None
df['floodZoneBaseFloodElevationFeet'] = None
df['year100FloodZoneDistanceFeet'] = None

df['wildfireRisk_severityRating'] = None
df['wildfireRisk_overallRiskRanking'] = None
df['wildfireRisk_distanceToHighRiskFeet'] = None

df['weatherRisk_hailRiskLevel'] = None
df['weatherRisk_tornadoRiskLevel'] = None
df['weatherRisk_windRiskLevel'] = None
df['weatherRisk_countOfHurricaneEvents'] = None

# Iterate over the addresses in the DataFrame
for ind in df.index:
    address = df['NUMBER'][ind] + " " + df['STREET'][ind] + " Philadelphia PA"
    
    # 1. Flood Risk Query
    flood_risk_payload = {
        "query": f"""
        query floodRisk {{
          getByAddress(address: "{address}") {{
            addresses {{
              data {{
                floodRisk {{
                  data {{
                    floodZone
                    floodZoneBaseFloodElevationFeet
                    year100FloodZoneDistanceFeet
                  }}
                }}
              }}
            }}
          }}
        }}
        """,
        "variables": {}
    }
    
    # 2. Wildfire Risk Query
    wildfire_risk_payload = {
        "query": f"""
        query wildfireRisk {{
          getByAddress(address: "{address}") {{
            addresses {{
              data {{
                wildfireRisk {{
                  data {{
                    severityRating
                    overallRiskRanking
                    distanceToHighRiskFeet
                  }}
                }}
              }}
            }}
          }}
        }}
        """,
        "variables": {}
    }
    
    # 3. Historical Weather Risk Query
    historical_weather_risk_payload = {
        "query": f"""
        query historicalWeatherRisk {{
          getByAddress(address: "{address}") {{
            addresses {{
              data {{
                historicalWeatherRisk {{
                  data {{
                    hailRiskLevel
                    tornadoRiskLevel
                    windRiskLevel
                    countOfHurricaneEvents
                  }}
                }}
              }}
            }}
          }}
        }}
        """,
        "variables": {}
    }
    
    # Make the POST request for Flood Risk
    flood_response = requests.post(url, headers=headers, json=flood_risk_payload)
    if flood_response.status_code == 200:
        flood_json = flood_response.json()
        # Print the response to understand its structure
        print(f"Flood Risk Response for '{address}': {flood_json}")

        # Safely get flood data if available
        get_by_address = flood_json.get('data', {}).get('getByAddress')
        if get_by_address and get_by_address.get('addresses'):
            addresses_data = get_by_address['addresses'].get('data')
            if addresses_data:
                flood_data = addresses_data[0].get('floodRisk', {}).get('data', [{}])[0]
                # Update DataFrame with flood risk data
                df.at[ind, 'floodZone'] = flood_data.get('floodZone')
                df.at[ind, 'floodZoneBaseFloodElevationFeet'] = flood_data.get('floodZoneBaseFloodElevationFeet')
                df.at[ind, 'year100FloodZoneDistanceFeet'] = flood_data.get('year100FloodZoneDistanceFeet')
        else:
            print(f"No flood data available for '{address}'")
    else:
        print(f"Failed to fetch Flood Risk data for '{address}': {flood_response.status_code} - {flood_response.text}")
    
    # Make the POST request for Wildfire Risk
    wildfire_response = requests.post(url, headers=headers, json=wildfire_risk_payload)
    if wildfire_response.status_code == 200:
        wildfire_json = wildfire_response.json()
        print(f"Wildfire Risk Response for '{address}': {wildfire_json}")

        get_by_address = wildfire_json.get('data', {}).get('getByAddress')
        if get_by_address and get_by_address.get('addresses'):
            addresses_data = get_by_address['addresses'].get('data')
            if addresses_data:
                wildfire_data = addresses_data[0].get('wildfireRisk', {}).get('data', [{}])[0]
                # Update DataFrame with wildfire risk data
                df.at[ind, 'wildfireRisk_severityRating'] = wildfire_data.get('severityRating')
                df.at[ind, 'wildfireRisk_overallRiskRanking'] = wildfire_data.get('overallRiskRanking')
                df.at[ind, 'wildfireRisk_distanceToHighRiskFeet'] = wildfire_data.get('distanceToHighRiskFeet')
        else:
            print(f"No wildfire data available for '{address}'")
    else:
        print(f"Failed to fetch Wildfire Risk data for '{address}': {wildfire_response.status_code} - {wildfire_response.text}")
    
    # Make the POST request for Historical Weather Risk
    weather_response = requests.post(url, headers=headers, json=historical_weather_risk_payload)
    if weather_response.status_code == 200:
        weather_json = weather_response.json()
        print(f"Weather Risk Response for '{address}': {weather_json}")

        get_by_address = weather_json.get('data', {}).get('getByAddress')
        if get_by_address and get_by_address.get('addresses'):
            addresses_data = get_by_address['addresses'].get('data')
            if addresses_data:
                weather_data = addresses_data[0].get('historicalWeatherRisk', {}).get('data', [{}])[0]
                # Update DataFrame with weather risk data
                df.at[ind, 'weatherRisk_hailRiskLevel'] = weather_data.get('hailRiskLevel')
                df.at[ind, 'weatherRisk_tornadoRiskLevel'] = weather_data.get('tornadoRiskLevel')
                df.at[ind, 'weatherRisk_windRiskLevel'] = weather_data.get('windRiskLevel')
                df.at[ind, 'weatherRisk_countOfHurricaneEvents'] = weather_data.get('countOfHurricaneEvents')
        else:
            print(f"No weather data available for '{address}'")
    else:
        print(f"Failed to fetch Historical Weather Risk data for '{address}': {weather_response.status_code} - {weather_response.text}")

# Display the updated DataFrame with new columns
print(df)


eyJhbGciOiJSUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJON1NUbF9iQVo2Qk93MXZGUkNYM1Q2OWsyR0laTGw1cVd2MmZyMTdvTk1BIn0.eyJleHAiOjE3MjgzMzczNjYsImlhdCI6MTcyODMzMzc2NiwianRpIjoiYzYyYjJhYTQtNDJkZi00MmVjLWFlYWItYTVhNTdmNzFkOGZjIiwiaXNzIjoiaHR0cHM6Ly9hdXRoLmNsb3VkLnByZWNpc2VseS5jb20vYXV0aC9yZWFsbXMvUHJlY2lzZWx5IiwiYXVkIjoiUHJlY2lzZWx5Iiwic3ViIjoiNWFjNWY2NmQtNGQ5Zi00NDE2LThmNmMtMjc3NzBmZDZmZmE1IiwidHlwIjoiQmVhcmVyIiwiYXpwIjoiT0lEQy1ESVMtU2hlbGwiLCJzZXNzaW9uX3N0YXRlIjoiMzFlZjg2YWUtMDEzNS00NWU1LTgxZmItYTAzY2ViNzM2NmFjIiwiYWxsb3dlZC1vcmlnaW5zIjpbIioiXSwic2NvcGUiOiJ1cG4gcHJvZmlsZSBlbWFpbCIsInNpZCI6IjMxZWY4NmFlLTAxMzUtNDVlNS04MWZiLWEwM2NlYjczNjZhYyIsInJlc291cmNlX2FjY2VzcyI6eyJPSURDLUNvbm5lY3QiOnsicm9sZXMiOlsiRGVzaWduZXIiLCJPcGVyYXRvciJdfSwiT0lEQy1ESVMtQ3VzdG9tZXJBY2NvdW50TWFuYWdlbWVudCI6eyJyb2xlcyI6WyJhZG1pbiIsImFnZW50YWRtaW5pc3RyYXRvciJdfSwiT0lEQy1EaXNjb3ZlcnkiOnsicm9sZXMiOlsiYWRtaW4iLCJ1c2VyIl19LCJicm9rZXIiOnsicm9sZXMiOlsicmVhZC10b2tlbiJdfSwiT0lEQy1ESVMtRFFDb3JlIjp7InJvbGVzIjpbIkRlc2lnbmVyIiwiT3BlcmF0

In [3]:
## Data Cleaning
# Step 1: Get basic information about the DataFrame
print("=== Basic Information ===")
print(f"Number of observations (rows): {df.shape[0]}")
print(f"Number of variables (columns): {df.shape[1]}")
print("\n")

=== Basic Information ===
Number of observations (rows): 20
Number of variables (columns): 21




In [4]:
# Step 2: Display the first few rows to understand the structure
print("=== Preview of the DataFrame ===")
print(df.head(10))
print("\n")

=== Preview of the DataFrame ===
         LON        LAT    NUMBER          STREET UNIT  CITY  DISTRICT  \
0 -75.206209  39.969039      4102        OGDEN ST   AB   NaN       NaN   
1 -75.207125  39.956518      4215     CHESTNUT ST    A   NaN       NaN   
2 -75.207125  39.956518      4215     CHESTNUT ST    B   NaN       NaN   
3 -75.207125  39.956518      4215     CHESTNUT ST    C   NaN       NaN   
4 -75.207125  39.956518      4215     CHESTNUT ST    D   NaN       NaN   
5 -75.207125  39.956518      4215     CHESTNUT ST    E   NaN       NaN   
6 -75.190794  39.957274     61-63       N 34TH ST  NaN   NaN       NaN   
7 -75.196723  39.966273       714       N 37TH ST  NaN   NaN       NaN   
8 -75.194899  39.943977       697  UNIVERSITY AVE  NaN   NaN       NaN   
9 -75.207662  39.972382  4120R-74       POPLAR ST  NaN   NaN       NaN   

   REGION  POSTCODE         ID  ... floodZone floodZoneBaseFloodElevationFeet  \
0     NaN     19104  886740095  ...         X                          

In [5]:
# Step 3: Get detailed information about each column (type, non-null counts, etc.)
print("=== DataFrame Info ===")
df.info()
print("\n")

=== DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   LON                                  20 non-null     float64
 1   LAT                                  20 non-null     float64
 2   NUMBER                               20 non-null     object 
 3   STREET                               20 non-null     object 
 4   UNIT                                 7 non-null      object 
 5   CITY                                 0 non-null      float64
 6   DISTRICT                             0 non-null      float64
 7   REGION                               0 non-null      float64
 8   POSTCODE                             20 non-null     int64  
 9   ID                                   20 non-null     int64  
 10  HASH                                 20 non-null     object 
 11  floodZone  

In [6]:
# Step 4: Drop rows or columns with a high proportion of null values if necessary
### Need to base on the data to decide
df_cleaned = df.dropna(subset=['floodZone'])

print(f"Rows remaining after dropping null 'preciselyID': {df_cleaned.shape[0]}")
print("\n")
df_cleaned.info()

Rows remaining after dropping null 'preciselyID': 18


<class 'pandas.core.frame.DataFrame'>
Index: 18 entries, 0 to 19
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   LON                                  18 non-null     float64
 1   LAT                                  18 non-null     float64
 2   NUMBER                               18 non-null     object 
 3   STREET                               18 non-null     object 
 4   UNIT                                 7 non-null      object 
 5   CITY                                 0 non-null      float64
 6   DISTRICT                             0 non-null      float64
 7   REGION                               0 non-null      float64
 8   POSTCODE                             18 non-null     int64  
 9   ID                                   18 non-null     int64  
 10  HASH                                 18 non-null  

In [7]:
# Step 5: Check for duplicates and remove if necessary
print("=== Checking for Duplicates ===")
duplicate_count = df_cleaned.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
df_cleaned.drop_duplicates(inplace=True)

=== Checking for Duplicates ===
Number of duplicate rows: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.drop_duplicates(inplace=True)


In [8]:
# Display the head of cleaned data
df_cleaned.head(20)

Unnamed: 0,LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,...,floodZone,floodZoneBaseFloodElevationFeet,year100FloodZoneDistanceFeet,wildfireRisk_severityRating,wildfireRisk_overallRiskRanking,wildfireRisk_distanceToHighRiskFeet,weatherRisk_hailRiskLevel,weatherRisk_tornadoRiskLevel,weatherRisk_windRiskLevel,weatherRisk_countOfHurricaneEvents
0,-75.206209,39.969039,4102,OGDEN ST,AB,,,,19104,886740095,...,X,X,3526,0,0,87886,low,low,,5
1,-75.207125,39.956518,4215,CHESTNUT ST,A,,,,19104,272012837,...,X,X,4592,0,0,87212,low,low,,5
2,-75.207125,39.956518,4215,CHESTNUT ST,B,,,,19104,272012839,...,X,X,4592,0,0,87212,low,low,,5
3,-75.207125,39.956518,4215,CHESTNUT ST,C,,,,19104,272012841,...,X,X,4592,0,0,87212,low,low,,5
4,-75.207125,39.956518,4215,CHESTNUT ST,D,,,,19104,272012843,...,X,X,4592,0,0,87212,low,low,,5
5,-75.207125,39.956518,4215,CHESTNUT ST,E,,,,19104,272012845,...,X,X,4592,0,0,87212,low,low,,5
6,-75.190794,39.957274,61-63,N 34TH ST,,,,,19104,885317880,...,X,X,1799,0,0,82819,low,low,,5
7,-75.196723,39.966273,714,N 37TH ST,,,,,19104,243081500,...,X,X,1824,0,0,85092,low,low,,5
9,-75.207662,39.972382,4120R-74,POPLAR ST,,,,,19104,62152400,...,X,X,2669,0,0,88572,low,low,,5
11,-75.207084,39.974145,4103,LEIDY AVE,,,,,19104,881819450,...,X,X,2151,0,0,88566,low,low,,5


In [12]:
# Function to clean string data: lowercase, strip spaces
# Function to clean string data: lowercase, strip spaces, but only for string columns
def clean_strings(col):
    if col.dtype == 'object':  # Check if the column is of string type
        return col.astype(str).str.strip().str.lower()  # Remove extra spaces and convert to lowercase
    return col  # Return the column as it is if it's not a string column

# Apply the cleaning function to all columns
df_cleaned = df_cleaned.apply(clean_strings)


# Apply cleaning to all columns
df_cleaned = df_cleaned.apply(clean_strings)

# Convert any numeric columns to proper numeric types, ignore errors if not possible
df_cleaned = df_cleaned.apply(pd.to_numeric, errors='ignore')

# Handle missing values for numeric columns (optional step: here filling with mean)
df_cleaned.fillna(df_cleaned.mean(), inplace=True)

# Standardize any date columns
df_cleaned['date_column'] = pd.to_datetime(df_cleaned['date_column'], errors='coerce')

# Final cleaned dataset
df_cleaned.head()  # Preview the cleaned data



TypeError: Could not convert ['41024215421542154215421561-637144120r-7441033922441510373914-384100820916801-17'
 'ogden stchestnut stchestnut stchestnut stchestnut stchestnut stn 34th stn 37th stpoplar stleidy avepine stchestnut stn 38th stbaring stbrown stn 43rd stn 43rd stn 45th st'
 'ababcdenannannannannannannannanadnannannan'
 '6fe85ffddef02977a9767341c0cbe58946e6a17201df7dd2a166854dac25162fd916601b5ac1a0d4c8568e93d32c52c8b4613cc2f0876ce7d56705727886e7c822a9183b5cdc9b7464b9ff8cb8130e38ae414ab080f923c4a6618aca860f3fed622af581956c91138d7d0a6420a590978bf289b421e1faa98a2f2fc7bef7c71460a3fd4a3e64c84c6e219611f774cba1'
 'xxxxxxxxxxxxxxxxxx' 'xxxxxxxxxxxxxxxxxx'
 'lowlowlowlowlowlowlowlowlowlowlowlowlowlowlowlowlowlow'
 'lowlowlowlowlowlowlowlowlowlowlowlowlowlowlowlowlowlow'
 'nonenonenonenonenonenonenonenonenonenonenonenonenonenonenonenonenonenone'] to numeric