In [None]:
#initial data import
import requests
import json

# USGS STN endpoint that returns the list of events
base_url = "https://stn.wim.usgs.gov/STNServices/Events.json"

# Query parameters: explicitly ask for a JSON payload
params = {
    "format": "json"  # Ensure the format is JSON
}

# A timeout keeps a stalled connection from hanging the cell forever
# (requests waits indefinitely when no timeout is given).
try:
    response = requests.get(base_url, params=params, timeout=30)
except requests.exceptions.RequestException as err:
    # Connection errors, DNS failures, timeouts, ...
    print(f"Failed to retrieve data: {err}")
else:
    if response.status_code == 200:
        data = response.json()

        # Persist the raw payload so later cells can reload it without
        # downloading again.
        with open('usgs_flood_data.json', 'w') as outfile:
            json.dump(data, outfile, indent=4)

        print("Data successfully retrieved and saved to 'usgs_flood_data.json'")
    else:
        print(f"Failed to retrieve data: {response.status_code}")

Data successfully retrieved and saved to 'usgs_flood_data.json'


In [None]:
# Sanity check: look at the beginning of the saved JSON file
file_path = '/content/usgs_flood_data.json'
with open(file_path, 'r') as file:
    content = file.read()

# Only the first 500 characters are shown -- enough to eyeball validity
print(content[0:500])


In [None]:
import pandas as pd
import re

# Read the raw USGS event dump into a DataFrame
df = pd.read_json('/content/usgs_flood_data.json')

# Columns that are dropped before any further processing
columns_to_remove = [
    'event_type_id',
    'event_status_id',
    'event_coordinator',
    'last_updated_by',
    'instruments',
    'hwms',
    'last_updated',
]
df_cleaned = df.drop(columns_to_remove, axis=1)

# Function to eliminate events with "Exercise" or "Test" in the event name
def remove_exercises_and_tests(df):
    """Return the rows of ``df`` whose ``event_name`` mentions neither an exercise nor a test.

    The match is a case-insensitive substring search for "Exercise" or "Test".
    Rows whose ``event_name`` is missing are kept: ``na=False`` makes the
    search report "no match" for them instead of a missing value, which
    could not be negated with ``~``.

    Args:
        df: DataFrame with an ``event_name`` column.

    Returns:
        The filtered DataFrame (original index labels preserved).
    """
    is_drill = df['event_name'].str.contains('Exercise|Test', case=False, na=False)
    return df[~is_drill]

# Filter the DataFrame for events that started between 2010 and 2020
start_date = '2010-01-01'
end_date = '2020-12-31'

# Compare on the calendar day only (first 10 characters, "YYYY-MM-DD").
# Comparing whole timestamp strings against '2020-12-31' would drop events that
# start on the last day, because e.g. '2020-12-31T05:00:00' sorts after it.
# NOTE(review): assumes event_start_date is ISO-8601 formatted -- confirm against the raw data.
start_day = df_cleaned['event_start_date'].astype(str).str[:10]
in_range = (start_day >= start_date) & (start_day <= end_date)

# Remove exercises and tests.
# .copy() yields an independent frame, so adding columns to df_filtered later
# does not trigger pandas' SettingWithCopyWarning.
df_filtered = remove_exercises_and_tests(df_cleaned[in_range]).copy()

# Named storms -> states they affected (used when an event name mentions the storm)
hurricane_states = {
    "Irene": ["New York", "New Jersey", "Vermont", "Connecticut", "Massachusetts"],
    "Lee": ["New York", "New Jersey", "Pennsylvania"],
    "Isaac": ["Louisiana", "Mississippi", "Arkansas", "Missouri", "Illinois"],
    "Sandy": ["New Jersey", "New York", "Connecticut", "Delaware", "Maryland"],
    "Joaquin": ["South Carolina", "North Carolina"],
    "Hermine": ["Florida", "Georgia", "South Carolina"],
    "Matthew": ["Florida", "Georgia", "South Carolina", "North Carolina"],
    "Harvey": ["Texas", "Louisiana"],
    "Irma": ["Florida", "Georgia", "South Carolina", "Alabama"],
    "Maria": ["Puerto Rico", "US Virgin Islands"],
    "Jose": ["New Jersey", "New York", "Connecticut", "Massachusetts"],
    "Nate": ["Louisiana", "Mississippi", "Alabama"],
    "Lane": ["Hawaii"],
    "Gordon": ["Mississippi", "Alabama", "Florida"],
    "Florence": ["North Carolina", "South Carolina"],
    "Michael": ["Florida", "Georgia", "Alabama"],
    "Dorian": ["Florida", "Georgia", "South Carolina", "North Carolina"],
    "Isaias": ["Florida", "North Carolina", "New York"],
    "Laura": ["Louisiana", "Texas"],
    "Sally": ["Florida", "Alabama"],
    "Delta": ["Louisiana", "Texas"],
}

# US regions -> member states (key order matters: it becomes the regex alternation order)
regions = {
    "northeast": [
        "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island",
        "Vermont", "New Jersey", "New York", "Pennsylvania",
    ],
    "midwest": [
        "Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin", "Iowa",
        "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota",
    ],
    "south": [
        "Delaware", "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina",
        "Virginia", "West Virginia", "Alabama", "Kentucky", "Mississippi", "Tennessee",
        "Arkansas", "Louisiana", "Oklahoma", "Texas",
    ],
    "west": [
        "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah",
        "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington",
    ],
}

# Function to extract states from event name or description
def extract_states(row):
    """Collect the states an event touches from its name and description.

    Sources, in order:
      1. two-letter postal codes and full state names found in the text
         (both are kept exactly as written);
      2. the states listed in ``hurricane_states`` for any storm whose name
         appears as a whole word in the event name;
      3. every state of a region from ``regions`` mentioned in the text.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing
            ``event_name`` and ``event_description``.

    Returns:
        Sorted list of unique matches, or ``['Null']`` when nothing was found.
    """
    state_full_pattern = r'\b(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming)\b'

    state_abbr_pattern = r'\b(A[LKSZRAEP]|C[AOT]|D[CE]|F[LM]|G[AU]|HI|I[DLNA]|K[SY]|LA|M[DEIAOSNT]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])\b'

    def _as_text(value):
        # Missing values arrive as None or NaN; NaN is truthy and str(NaN)
        # would inject the literal text 'nan'.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value) if value else ''

    event_name = _as_text(row['event_name'])
    event_description = _as_text(row['event_description'])
    text = event_name + ' ' + event_description

    # Every regex hit counts. Membership must not be tested with
    # ``state in <pattern string>``: that is a substring test against the regex
    # source and silently discards most postal codes (e.g. 'TX', 'FL').
    states = set(re.findall(state_abbr_pattern, text))
    states.update(re.findall(state_full_pattern, text))

    # Add hurricane states if the event name mentions a known hurricane.
    # Whole-word matching avoids hits such as "lee" inside "Sleet".
    for hurricane, affected in hurricane_states.items():
        if re.search(r'\b' + re.escape(hurricane) + r'\b', event_name, re.IGNORECASE):
            states.update(affected)

    # Check for regional mentions and add the respective states.
    # State names are blanked out first so that "South Carolina" or
    # "West Virginia" are not mistaken for the regions "south" / "west".
    text_without_states = re.sub(state_full_pattern, ' ', text)
    regions_pattern = '|'.join(regions.keys())
    for region in re.findall(regions_pattern, text_without_states, re.IGNORECASE):
        states.update(regions.get(region.lower(), []))

    return sorted(states) if states else ['Null']

# Attach the list of affected states to every event
df_filtered['state'] = df_filtered.apply(lambda event: extract_states(event), axis=1)

# Order the events chronologically by their start date
df_sorted = df_filtered.sort_values('event_start_date')

# Function to renumber event IDs in order of date
def renumber_event_ids(df):
    """Return a copy of ``df`` with a fresh 0..n-1 index and ``event_id`` set to 1..n.

    Rows keep their current order, so the caller is expected to sort first.
    The input frame is left unchanged.
    """
    renumbered = df.reset_index(drop=True)
    renumbered['event_id'] = range(1, len(renumbered) + 1)
    return renumbered

# Replace the original IDs with a chronological 1..n numbering
df_sorted = renumber_event_ids(df_sorted)

# Show the sorted, renumbered events
print(df_sorted)

# Write the result as JSON Lines (one record per line)
df_sorted.to_json(
    '/content/usgs_flood_data_cleaned.json',
    orient='records',
    lines=True,
)


In [None]:
import pandas as pd
import re

# Read the raw USGS event dump into a DataFrame
df = pd.read_json('/content/usgs_flood_data.json')

# Columns that are dropped before any further processing
columns_to_remove = [
    'event_type_id',
    'event_status_id',
    'event_coordinator',
    'last_updated_by',
    'instruments',
    'hwms',
    'last_updated',
]
df_cleaned = df.drop(columns_to_remove, axis=1)

# Function to eliminate events with "Exercise" or "Test" in the event name
def remove_exercises_and_tests(df):
    """Return the rows of ``df`` whose ``event_name`` mentions neither an exercise nor a test.

    The match is a case-insensitive substring search for "Exercise" or "Test".
    Rows whose ``event_name`` is missing are kept: ``na=False`` makes the
    search report "no match" for them instead of a missing value, which
    could not be negated with ``~``.

    Args:
        df: DataFrame with an ``event_name`` column.

    Returns:
        The filtered DataFrame (original index labels preserved).
    """
    is_drill = df['event_name'].str.contains('Exercise|Test', case=False, na=False)
    return df[~is_drill]

# Filter the DataFrame for events that started between 2010 and 2020
start_date = '2010-01-01'
end_date = '2020-12-31'

# Compare on the calendar day only (first 10 characters, "YYYY-MM-DD").
# Comparing whole timestamp strings against '2020-12-31' would drop events that
# start on the last day, because e.g. '2020-12-31T05:00:00' sorts after it.
# NOTE(review): assumes event_start_date is ISO-8601 formatted -- confirm against the raw data.
start_day = df_cleaned['event_start_date'].astype(str).str[:10]
in_range = (start_day >= start_date) & (start_day <= end_date)

# Remove exercises and tests.
# .copy() yields an independent frame, so adding or overwriting columns on
# df_filtered later does not trigger pandas' SettingWithCopyWarning.
df_filtered = remove_exercises_and_tests(df_cleaned[in_range]).copy()

# Named storms -> states they affected (used when an event name mentions the storm)
hurricane_states = {
    "Irene": ["New York", "New Jersey", "Vermont", "Connecticut", "Massachusetts"],
    "Lee": ["New York", "New Jersey", "Pennsylvania"],
    "Isaac": ["Louisiana", "Mississippi", "Arkansas", "Missouri", "Illinois"],
    "Sandy": ["New Jersey", "New York", "Connecticut", "Delaware", "Maryland"],
    "Joaquin": ["South Carolina", "North Carolina"],
    "Hermine": ["Florida", "Georgia", "South Carolina"],
    "Matthew": ["Florida", "Georgia", "South Carolina", "North Carolina"],
    "Harvey": ["Texas", "Louisiana"],
    "Irma": ["Florida", "Georgia", "South Carolina", "Alabama"],
    "Maria": ["Puerto Rico", "US Virgin Islands"],
    "Jose": ["New Jersey", "New York", "Connecticut", "Massachusetts"],
    "Nate": ["Louisiana", "Mississippi", "Alabama"],
    "Lane": ["Hawaii"],
    "Gordon": ["Mississippi", "Alabama", "Florida"],
    "Florence": ["North Carolina", "South Carolina"],
    "Michael": ["Florida", "Georgia", "Alabama"],
    "Dorian": ["Florida", "Georgia", "South Carolina", "North Carolina"],
    "Isaias": ["Florida", "North Carolina", "New York"],
    "Laura": ["Louisiana", "Texas"],
    "Sally": ["Florida", "Alabama"],
    "Delta": ["Louisiana", "Texas"],
}

# US regions -> member states (key order matters: it becomes the regex alternation order)
regions = {
    "northeast": [
        "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island",
        "Vermont", "New Jersey", "New York", "Pennsylvania",
    ],
    "midwest": [
        "Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin", "Iowa",
        "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota",
    ],
    "south": [
        "Delaware", "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina",
        "Virginia", "West Virginia", "Alabama", "Kentucky", "Mississippi", "Tennessee",
        "Arkansas", "Louisiana", "Oklahoma", "Texas",
    ],
    "west": [
        "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah",
        "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington",
    ],
}

# Function to extract states from event name or description
def extract_states(row):
    """Collect the states an event touches from its name and description.

    Sources, in order:
      1. two-letter postal codes and full state names found in the text
         (both are kept exactly as written);
      2. the states listed in ``hurricane_states`` for any storm whose name
         appears as a whole word in the event name;
      3. every state of a region from ``regions`` mentioned in the text.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing
            ``event_name`` and ``event_description``.

    Returns:
        Sorted list of unique matches, or ``['Null']`` when nothing was found.
    """
    state_full_pattern = r'\b(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming)\b'

    state_abbr_pattern = r'\b(A[LKSZRAEP]|C[AOT]|D[CE]|F[LM]|G[AU]|HI|I[DLNA]|K[SY]|LA|M[DEIAOSNT]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])\b'

    def _as_text(value):
        # Missing values arrive as None or NaN; NaN is truthy and str(NaN)
        # would inject the literal text 'nan'.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value) if value else ''

    event_name = _as_text(row['event_name'])
    event_description = _as_text(row['event_description'])
    text = event_name + ' ' + event_description

    # Every regex hit counts. Membership must not be tested with
    # ``state in <pattern string>``: that is a substring test against the regex
    # source and silently discards most postal codes (e.g. 'TX', 'FL').
    states = set(re.findall(state_abbr_pattern, text))
    states.update(re.findall(state_full_pattern, text))

    # Add hurricane states if the event name mentions a known hurricane.
    # Whole-word matching avoids hits such as "lee" inside "Sleet".
    for hurricane, affected in hurricane_states.items():
        if re.search(r'\b' + re.escape(hurricane) + r'\b', event_name, re.IGNORECASE):
            states.update(affected)

    # Check for regional mentions and add the respective states.
    # State names are blanked out first so that "South Carolina" or
    # "West Virginia" are not mistaken for the regions "south" / "west".
    text_without_states = re.sub(state_full_pattern, ' ', text)
    regions_pattern = '|'.join(regions.keys())
    for region in re.findall(regions_pattern, text_without_states, re.IGNORECASE):
        states.update(regions.get(region.lower(), []))

    return sorted(states) if states else ['Null']

# Extracts states and add as new column
df_filtered['state'] = df_filtered.apply(extract_states, axis=1)

# Manually updates inferred states.
# Keys are event_id values as they are at this point, i.e. before the
# chronological renumbering further down.
state_updates = {
    14: ['Pennsylvania'],
    16: ['Pennsylvania'],
    27: ['West Virginia'],
    23: ['South Carolina'],
    36: ['California'],
    40: ['Ohio']  # Manually inferred example for event 40
}

# Rebuild the column element by element so each overridden cell receives the
# list itself. Assigning a nested list through .loc is interpreted by pandas
# as a 2-D value: the cell can end up holding the bare string, or the
# assignment can fail on a length mismatch.
df_filtered['state'] = pd.Series(
    [
        state_updates.get(event_id, current_states)
        for event_id, current_states in zip(df_filtered['event_id'], df_filtered['state'])
    ],
    index=df_filtered.index,
)

# Sort the filtered DataFrame by event_start_date
df_sorted = df_filtered.sort_values(by='event_start_date')

# Function to renumber event IDs in order of date
def renumber_event_ids(df):
    """Return a copy of ``df`` with a fresh 0..n-1 index and ``event_id`` set to 1..n.

    Rows keep their current order, so the caller is expected to sort first.
    The input frame is left unchanged.
    """
    renumbered = df.reset_index(drop=True)
    renumbered['event_id'] = range(1, len(renumbered) + 1)
    return renumbered

# Replace the original IDs with a chronological 1..n numbering
df_sorted = renumber_event_ids(df_sorted)

# Events for which no state could be determined ('Null' placeholder present)
has_null_state = df_sorted['state'].apply(lambda states: 'Null' in states)
null_state_entries = df_sorted[has_null_state]

# Keep only the columns needed to review those events by hand
report_columns = ['event_id', 'event_name', 'event_description', 'event_start_date', 'event_end_date', 'state']
null_state_entries_list = null_state_entries[report_columns]

# Show the events that still need a state
print(null_state_entries_list)

# Write the result as JSON Lines (one record per line)
df_sorted.to_json(
    '/content/usgs_flood_data_cleaned.json',
    orient='records',
    lines=True,
)


In [None]:
#Null state checker(change input path of dataset)
# Load the JSON data (JSON Lines: one record per line)
df = pd.read_json('/content/usgs_flood_data_cleaned.json', lines=True)

# Total number of events
total_events = len(df)

# Number of events with 'Null' as the state
null_state_events = len(df[df['state'].apply(lambda x: 'Null' in x)]) if total_events else 0

# Percentage of events with 'Null' as the state.
# An empty dataset reports 0 instead of dividing by zero.
if total_events:
    percentage_null_state_events = round(((null_state_events / total_events) * 100), 2)
else:
    percentage_null_state_events = 0.0

print("Floods:",total_events,", Floods with no state named:",null_state_events,", Percentage unnamed:",percentage_null_state_events)

Floods: 65 , Floods with no state named: 16 , Percentage unnamed: 24.62


In [None]:
import requests
import json

# NOAA API token.
# Read from the environment so the secret is never stored in the notebook.
# Any token that was previously committed here should be revoked.
import os

api_key = os.environ.get('NOAA_API_TOKEN', '')
if not api_key:
    print("NOAA_API_TOKEN is not set; requests to the NOAA API will be rejected.")

# Function to get hurricane data from NOAA API
def get_hurricane_data(api_key, start_year=2010, end_year=2020):
    """Query the NOAA endpoint once per year and collect the states reported per event.

    Args:
        api_key: NOAA API token, sent in the ``token`` request header.
        start_year: first year to query (inclusive).
        end_year: last year to query (inclusive).

    Returns:
        dict mapping each ``event_type`` value found in the results to a
        sorted list of the ``state`` values reported with it. Years whose
        request fails are skipped after printing the error.
    """
    # NOTE(review): confirm this path is a valid endpoint of the NOAA API before relying on the output.
    base_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/stormevents/details"
    headers = {
        'token': api_key
    }
    hurricane_states = {}

    for year in range(start_year, end_year + 1):
        # NOTE(review): only one page of at most `limit` results is requested per year.
        params = {
            'datasetid': 'stormevents',
            'eventtype': 'Hurricane',
            'startdate': f'{year}-01-01',
            'enddate': f'{year}-12-31',
            'limit': 1000
        }

        try:
            # The timeout prevents one stalled request from blocking the whole loop.
            response = requests.get(base_url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            for event in data.get('results', []):
                # NOTE(review): results are grouped by 'event_type', not by storm name -- confirm this is intended.
                event_name = event.get('event_type')
                state = event.get('state')
                if event_name and state:
                    hurricane_states.setdefault(event_name, set()).add(state)
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred for {year}: {http_err}")
        except Exception as err:
            print(f"Other error occurred for {year}: {err}")

    # Convert sets to sorted lists for JSON serialization
    return {name: sorted(states) for name, states in hurricane_states.items()}

# Function to save hurricane data to a JSON file
def save_hurricane_data(api_key, start_year=2010, end_year=2020, output_file='hurricane_states.json'):
    """Fetch the per-event state lists and write them to ``output_file`` as indented JSON.

    The data comes from ``get_hurricane_data`` for the given token and year
    range; a confirmation line is printed once the file has been written.
    """
    states_by_event = get_hurricane_data(api_key, start_year, end_year)
    with open(output_file, 'w') as handle:
        json.dump(states_by_event, handle, indent=4)
    print(f"Hurricane data successfully saved to '{output_file}'")

# Run the export for 2010 through 2020 (written to the default 'hurricane_states.json')
save_hurricane_data(api_key, 2010, 2020)


**HERE BEGINS THE SECTION FOR TWITTER/MONGODB**
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⣰⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⣴⡾
⠀⠀⣿⡍⠛⠲⣶⣄⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⣠⡴⠞⠉⣠⡞⠀⠀
⠀⠀⠘⣽⢷⣦⣌⣈⠋⡚⠿⣦⡀⠀⠀⣴⣶⡄⠀⠀⣠⡶⠚⠛⣙⣭⠠⣤⣶⣯⠆⠀⠀⠀
⠀⠀⠀⣼⣷⣀⠀⠀⠈⠀⠀⠀⢻⡇⠺⡿⠛⣿⡅⠀⢿⠀⠀⣼⠿⣫⣭⣠⣤⡶⠂⠀⠀⠀
⠀⠀⠀⠀⠉⠛⠿⣹⣾⠔⠃⠀⠈⠳⠾⠏⠀⠻⣷⡺⠋⠀⣤⣸⣷⣶⡾⠖⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠈⠒⠷⣿⡻⣞⣀⣄⣀⣀⡄⠀⠀⣠⣄⣸⡿⣾⣿⡽⡄⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠛⠟⠯⣽⢿⡿⠃⠀⢀⣿⡙⠑⠙⠛⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢰⣯⣦⣾⣿⠃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢸⣼⣿⣿⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⣿⢩⡿⠘⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⣽⡃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀

In [None]:
# @title Necessary(or not) installs
!pip install pymongo
!pip install textblob
!pip install tweepy
!pip install pandas
!pip install re
!pip install pymongo[srv] tweepy textblob
!pip install --upgrade pymongo[srv]
!pip install geopy
!pip install Twarc2
!pip install twarc
!pip install certifi
!pip install pytrends pymongo[srv] certifi
!pip install pytrends pymongo[srv] certifi
!pip install matplotlib
!pip install seaborn
!pip install pymongo[srv]



In [None]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

import os

import certifi

# The connection string (including user and password) is read from the
# environment so credentials are never stored in the notebook; a missing
# variable raises KeyError. Credentials that were previously committed here
# should be rotated.
uri = os.environ["MONGODB_URI"]

# Create a new client and connect to the server.
# Certificates are verified against certifi's CA bundle instead of disabling
# verification with tlsAllowInvalidCertificates.
client = MongoClient(uri, server_api=ServerApi('1'), tls=True, tlsCAFile=certifi.where())

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)


SSL handshake failed: ac-l21diot-shard-00-02.5ghkb6u.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1007) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: ac-l21diot-shard-00-00.5ghkb6u.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1007) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: ac-l21diot-shard-00-01.5ghkb6u.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1007) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 66980147109bd788de7122bb, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('ac-l21diot-shard-00-00.5ghkb6u.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: ac-l21diot-shard-00-00.5ghkb6u.mongodb.

In [None]:
# @title Initial twitter attempt via tweepy
import tweepy
import pandas as pd
import re
from textblob import TextBlob
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from datetime import datetime
import pytz
import certifi
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

import os

# Twitter API v2 setup.
# All credentials come from the environment so they are never stored in the
# notebook. Keys that were previously committed here should be revoked.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')
bearer_token = os.environ['TWITTER_BEARER_TOKEN']  # KeyError if missing

client = tweepy.Client(bearer_token=bearer_token,
                       consumer_key=consumer_key,
                       consumer_secret=consumer_secret,
                       access_token=access_token,
                       access_token_secret=access_token_secret)

# Define search query without place_country operator
search_query = "flood -is:retweet -is:reply"
no_of_tweets = 10

# Geolocator setup
# NOTE(review): consider a user agent that identifies this project.
geolocator = Nominatim(user_agent="geoapiExercises")

# Function to get state from location
def get_state_from_location(lat, lon):
    """Reverse-geocode a latitude/longitude pair and return its state name.

    Returns 'Unknown' when the lookup times out, yields no result, or the
    result carries no address or no 'state' entry.
    """
    try:
        place = geolocator.reverse((lat, lon), timeout=10)
    except GeocoderTimedOut:
        return 'Unknown'

    if not place or 'address' not in place.raw:
        return 'Unknown'
    return place.raw['address'].get('state', 'Unknown')

# Column layout shared by the populated and the empty result frame
columns = ["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet", "State"]

# NOTE(review): search_recent_tweets is a *recent* search; confirm how far back it
# reaches, because the 2010-2020 date filter applied afterwards may discard every tweet.
try:
    response = client.search_recent_tweets(query=search_query, max_results=no_of_tweets, tweet_fields=['created_at', 'public_metrics', 'source', 'text', 'geo'])
    tweets = response.data

    if tweets:
        # Extract tweet attributes and create DataFrame
        attributes_container = []
        for tweet in tweets:
            if tweet.geo and 'coordinates' in tweet.geo:
                # The point is GeoJSON, whose position order is [longitude, latitude]
                lon, lat = tweet.geo['coordinates']['coordinates']
                state = get_state_from_location(lat, lon)
            else:
                state = 'Unknown'
            attributes_container.append([tweet.author_id, tweet.created_at, tweet.public_metrics['like_count'], tweet.source, tweet.text, state])

        tweets_df = pd.DataFrame(attributes_container, columns=columns)
        print("Tweets DataFrame created successfully.")
    else:
        print("No tweets found.")
        tweets_df = pd.DataFrame(columns=columns)

except tweepy.TweepyException as e:
    print('Status Failed On,', str(e))
    tweets_df = pd.DataFrame(columns=columns)

# Ensure tweets_df has data
print(f"Number of tweets fetched: {len(tweets_df)}")

if not tweets_df.empty:
    import os  # local import: needed only for the MongoDB connection string

    # Filter tweets from 2010 to 2020.
    # The upper bound is the last microsecond of 2020 so that tweets posted
    # during 31 December are kept (a midnight bound would drop them).
    start_date = datetime(2010, 1, 1, tzinfo=pytz.UTC)
    end_date = datetime(2020, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.UTC)
    tweets_df['Date Created'] = pd.to_datetime(tweets_df['Date Created'])
    tweets_df = tweets_df[(tweets_df['Date Created'] >= start_date) & (tweets_df['Date Created'] <= end_date)]

    print(f"Number of tweets after date filter: {len(tweets_df)}")

    # Clean tweet text
    def clean_tweet(text):
        """Strip URLs, @mentions and #hashtags from a tweet."""
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'@\S+', '', text)
        text = re.sub(r'#\S+', '', text)
        return text

    tweets_df['Cleaned Tweet'] = tweets_df['Tweet'].apply(clean_tweet)

    print(f"Sample cleaned tweets: {tweets_df['Cleaned Tweet'].head()}")

    # Sentiment Analysis
    def analyze_sentiment(text):
        """Label text 'positive', 'negative' or 'neutral' by the sign of its TextBlob polarity."""
        analysis = TextBlob(text)
        return 'positive' if analysis.sentiment.polarity > 0 else 'negative' if analysis.sentiment.polarity < 0 else 'neutral'

    tweets_df['Sentiment'] = tweets_df['Cleaned Tweet'].apply(analyze_sentiment)

    # Tokenization (whitespace split)
    tweets_df['Tokens'] = tweets_df['Cleaned Tweet'].apply(lambda x: x.split())

    # Custom filtering for contextually relevant tweets
    def is_relevant_tweet(text):
        """True when the text contains a flood-related keyword and none of the figurative phrases."""
        relevant_keywords = ['flood', 'flooding', 'flooded', 'floods', 'rain', 'river', 'storm', 'disaster']
        irrelevant_phrases = ['my inbox', 'my emails', 'my email', 'my phone', 'workload', 'work load', 'calendar']

        for phrase in irrelevant_phrases:
            if phrase in text.lower():
                return False

        return any(keyword in text.lower() for keyword in relevant_keywords)

    tweets_df = tweets_df[tweets_df['Cleaned Tweet'].apply(is_relevant_tweet)]

    print(f"Number of relevant tweets: {len(tweets_df)}")

    # MongoDB Atlas setup.
    # The connection string is read from the environment so credentials are
    # never stored in the notebook; a missing variable raises KeyError.
    uri = os.environ["MONGODB_URI"]
    mongo_client = MongoClient(uri, server_api=ServerApi('1'), tls=True, tlsCAFile=certifi.where())

    try:
        mongo_client.admin.command('ping')
        print("Pinged your deployment. You successfully connected to MongoDB!")

        db = mongo_client['CS210Data']
        tweets_collection = db['tweets']

        # Insert tweets into MongoDB
        tweets_data = tweets_df.to_dict('records')
        if tweets_data:
            tweets_collection.insert_many(tweets_data)
            print("Tweets inserted into MongoDB successfully.")

            # Retrieve sorted tweets from MongoDB (newest first)
            sorted_tweets = tweets_collection.find().sort('Date Created', -1)
            tweets_data_sorted = list(sorted_tweets)
            df_tweets_sorted = pd.DataFrame(tweets_data_sorted)

            # Load cleaned flood data: the cleaning cells write it to /content
            # as JSON Lines, so it must be read from there with lines=True.
            df_flood = pd.read_json('/content/usgs_flood_data_cleaned.json', lines=True)

            # Example comparison: Number of tweets per flood event.
            # regex=False treats the event name as plain text (names may contain
            # regex metacharacters); na=False counts missing tweets as no match.
            df_flood['Tweet Count'] = df_flood['event_name'].apply(
                lambda x: df_tweets_sorted['Tweet'].str.contains(x, case=False, regex=False, na=False).sum()
            )

            print(df_flood[['event_name', 'Tweet Count']])
        else:
            print("No valid tweet data to insert into MongoDB.")

    except Exception as e:
        print("Could not connect to MongoDB:", e)
else:
    print("tweets_df is empty. No tweets to process.")


In [None]:
# @title Secondary Twitter/X attempt via twarc2
from twarc import Twarc2, expansions
import datetime
import json
import pandas as pd
from textblob import TextBlob
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import certifi
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

import os

# Bearer token and MongoDB connection string are read from the environment so
# that no credential is stored in the notebook (a missing variable raises
# KeyError). Values that were previously committed here should be revoked.
client = Twarc2(bearer_token=os.environ["TWITTER_BEARER_TOKEN"])

# MongoDB Atlas setup
uri = os.environ["MONGODB_URI"]
mongo_client = MongoClient(uri, server_api=ServerApi('1'), tls=True, tlsCAFile=certifi.where())

try:
    mongo_client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print("Could not connect to MongoDB:", e)

# Geolocator setup
# NOTE(review): consider a user agent that identifies this project.
geolocator = Nominatim(user_agent="geoapiExercises")

# Function to get state from location
def get_state_from_location(lat, lon):
    """Reverse-geocode a latitude/longitude pair and return its state name.

    Returns 'Unknown' when the lookup times out, yields no result, or the
    result carries no address or no 'state' entry.
    """
    try:
        place = geolocator.reverse((lat, lon), timeout=10)
    except GeocoderTimedOut:
        return 'Unknown'

    if not place or 'address' not in place.raw:
        return 'Unknown'
    return place.raw['address'].get('state', 'Unknown')

# Function to fetch tweets for a given year
def fetch_tweets_for_year(year):
    """Search the full archive for original "flood" tweets posted during ``year`` (UTC).

    Args:
        year: calendar year to search.

    Returns:
        List of dicts with the keys 'User', 'Date Created', 'Number of Likes',
        'Source of Tweet', 'Tweet' and 'State'. 'State' is reverse-geocoded
        when the tweet carries exact coordinates, otherwise 'Unknown'.
    """
    start_time = datetime.datetime(year, 1, 1, 0, 0, 0, 0, datetime.timezone.utc)
    end_time = datetime.datetime(year, 12, 31, 23, 59, 59, 999999, datetime.timezone.utc)
    query = "flood -is:retweet -is:reply"

    # NOTE(review): max_results looks like a per-page size; every page returned is
    # consumed below -- confirm that fetching the whole year is intended.
    search_results = client.search_all(query=query, start_time=start_time, end_time=end_time, max_results=10)

    tweets_data = []
    for page in search_results:
        for tweet in expansions.flatten(page):
            # 'geo' may be absent, empty, or present without an exact point;
            # only an exact point can be geocoded.
            point = (tweet.get('geo') or {}).get('coordinates') or {}
            position = point.get('coordinates')
            if position:
                # GeoJSON position order is [longitude, latitude]
                lon, lat = position
                state = get_state_from_location(lat, lon)
            else:
                state = 'Unknown'
            tweets_data.append({
                'User': tweet['author_id'],
                'Date Created': tweet['created_at'],
                'Number of Likes': tweet['public_metrics']['like_count'],
                # .get(): a tweet without a 'source' field yields None instead of a KeyError
                'Source of Tweet': tweet.get('source'),
                'Tweet': tweet['text'],
                'State': state
            })

    return tweets_data

# Collect the tweets of every year from 2010 through 2020 into one list
all_tweets = []
for year in range(2010, 2020 + 1):
    print(f"Fetching tweets for {year}...")
    tweets_data = fetch_tweets_for_year(year)
    all_tweets += tweets_data
    print(f"Number of tweets fetched for {year}: {len(tweets_data)}")

# One row per tweet
tweets_df = pd.DataFrame(all_tweets)
print(f"Total number of tweets fetched: {len(tweets_df)}")

if not tweets_df.empty:
    import re  # this cell does not import re itself, but clean_tweet needs it

    # Clean tweet text
    def clean_tweet(text):
        """Strip URLs, @mentions and #hashtags from a tweet."""
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'@\S+', '', text)
        text = re.sub(r'#\S+', '', text)
        return text

    tweets_df['Cleaned Tweet'] = tweets_df['Tweet'].apply(clean_tweet)

    # Sentiment Analysis
    def analyze_sentiment(text):
        """Label text 'positive', 'negative' or 'neutral' by the sign of its TextBlob polarity."""
        analysis = TextBlob(text)
        return 'positive' if analysis.sentiment.polarity > 0 else 'negative' if analysis.sentiment.polarity < 0 else 'neutral'

    tweets_df['Sentiment'] = tweets_df['Cleaned Tweet'].apply(analyze_sentiment)

    # Tokenization (whitespace split)
    tweets_df['Tokens'] = tweets_df['Cleaned Tweet'].apply(lambda x: x.split())

    # Custom filtering for contextually relevant tweets
    def is_relevant_tweet(text):
        """True when the text contains a flood-related keyword and none of the figurative phrases."""
        relevant_keywords = ['flood', 'flooding', 'flooded', 'floods', 'rain', 'river', 'storm', 'disaster']
        irrelevant_phrases = ['my inbox', 'my emails', 'my email', 'my phone', 'workload', 'work load', 'calendar']

        for phrase in irrelevant_phrases:
            if phrase in text.lower():
                return False

        return any(keyword in text.lower() for keyword in relevant_keywords)

    tweets_df = tweets_df[tweets_df['Cleaned Tweet'].apply(is_relevant_tweet)]

    print(f"Number of relevant tweets: {len(tweets_df)}")

    db = mongo_client['CS210Data']
    tweets_collection = db['tweets']

    # Insert tweets into MongoDB
    tweets_data = tweets_df.to_dict('records')
    if tweets_data:
        tweets_collection.insert_many(tweets_data)
        print("Tweets inserted into MongoDB successfully.")

        # Retrieve sorted tweets from MongoDB (newest first)
        sorted_tweets = tweets_collection.find().sort('Date Created', -1)
        tweets_data_sorted = list(sorted_tweets)
        df_tweets_sorted = pd.DataFrame(tweets_data_sorted)

        # Load cleaned flood data: the cleaning cells write it to /content as
        # JSON Lines, so it must be read from there with lines=True.
        df_flood = pd.read_json('/content/usgs_flood_data_cleaned.json', lines=True)

        # Example comparison: Number of tweets per flood event.
        # regex=False treats the event name as plain text (names may contain
        # regex metacharacters); na=False counts missing tweets as no match.
        df_flood['Tweet Count'] = df_flood['event_name'].apply(
            lambda x: df_tweets_sorted['Tweet'].str.contains(x, case=False, regex=False, na=False).sum()
        )

        print(df_flood[['event_name', 'Tweet Count']])
    else:
        print("No valid tweet data to insert into MongoDB.")
else:
    print("tweets_df is empty. No tweets to process.")


In [None]:
# @title Initial Google Trends attempt via their API
!pip install pytrends pymongo[srv] certifi matplotlib seaborn

from pytrends.request import TrendReq
import pandas as pd
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import certifi
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns

import os

# MongoDB connection setup.
# The connection string is read from the environment so credentials are never
# stored in the notebook; a missing variable raises KeyError. Credentials that
# were previously committed here should be rotated.
uri = os.environ["MONGODB_URI"]
client = MongoClient(uri, server_api=ServerApi('1'), tls=True, tlsCAFile=certifi.where())
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print("Could not connect to MongoDB:", e)

# Fetch Google Trends data
pytrends = TrendReq(hl='en-US', tz=360)

def fetch_google_trends_data(keyword, start_year, end_year, max_points=10000):
    """Download US Google Trends interest for ``keyword``, one request per calendar month.

    Args:
        keyword: search term to query.
        start_year: first year to fetch (inclusive).
        end_year: last year to fetch (inclusive).
        max_points: stop requesting further months once at least this many
            rows have been collected.

    Returns:
        DataFrame with one row per returned data point: the columns produced
        by pytrends (including 'date' and one column named after ``keyword``)
        plus 'year' and 'month'. When nothing could be fetched, an empty frame
        with the columns 'date', ``keyword``, 'year' and 'month' is returned.
    """
    import calendar  # local import: only needed to find each month's last day

    all_data = []
    points_fetched = 0
    backoff_time = 1  # Initial backoff time in seconds
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            if points_fetched >= max_points:
                break
            # Cover the whole month; a fixed end day of 28 would never
            # request days 29-31.
            last_day = calendar.monthrange(year, month)[1]
            timeframe = f"{year}-{month:02d}-01 {year}-{month:02d}-{last_day:02d}"
            try:
                pytrends.build_payload([keyword], cat=0, timeframe=timeframe, geo='US', gprop='')
                interest_over_time_df = pytrends.interest_over_time()
                if not interest_over_time_df.empty:
                    interest_over_time_df = interest_over_time_df.reset_index()
                    interest_over_time_df['year'] = year
                    interest_over_time_df['month'] = month
                    all_data.append(interest_over_time_df)
                    points_fetched += len(interest_over_time_df)
                time.sleep(2)  # Add a delay between requests
                backoff_time = 1  # Reset backoff time after a successful request
            except Exception as e:
                # NOTE(review): the failed month is not requested again; the
                # loop moves on to the next month after the pause.
                print(f"Error fetching data for {timeframe}: {e}")
                print(f"Sleeping for {backoff_time} seconds before retrying...")
                time.sleep(backoff_time)
                backoff_time = min(backoff_time * 2, 60)  # Exponential backoff with a cap at 60 seconds
        if points_fetched >= max_points:
            break

    if not all_data:
        # pd.concat raises on an empty list; hand back an empty, typed frame instead.
        return pd.DataFrame({
            'date': pd.Series(dtype='datetime64[ns]'),
            keyword: pd.Series(dtype='float64'),
            'year': pd.Series(dtype='int64'),
            'month': pd.Series(dtype='int64'),
        })
    # NOTE(review): values from separate requests may each be scaled within their
    # own timeframe -- confirm they are comparable across months before averaging.
    return pd.concat(all_data, ignore_index=True)

# Monthly average search interest for "flood"
flood_trends_df = fetch_google_trends_data('flood', 2010, 2020)
flood_trends_df['date'] = flood_trends_df['date'].dt.to_period('M')
monthly_interest = flood_trends_df.groupby(['date'])['flood'].mean().reset_index()

# Load USGS flood data
usgs_flood_df = pd.read_json('/content/usgs_flood_data_cleaned.json', lines=True)
usgs_flood_df['event_start_date'] = pd.to_datetime(usgs_flood_df['event_start_date'])
usgs_flood_df['year'] = usgs_flood_df['event_start_date'].dt.year
usgs_flood_df['month'] = usgs_flood_df['event_start_date'].dt.month
# Monthly period of the event start: the same kind of key as monthly_interest['date']
usgs_flood_df['date'] = usgs_flood_df['event_start_date'].dt.to_period('M')

# Merge datasets on the shared monthly period.
# (Two left keys -- year, month -- cannot be matched against the single
# right key 'date'; pandas requires equally many keys on both sides.)
merged_df = pd.merge(usgs_flood_df, monthly_interest, on='date')

# Compare Trends and Events
def compare_trends_and_events(df):
    """Average the 'flood' interest per state.

    Each event's 'state' entry may be a list of states. Such rows are expanded
    to one row per state first, because list values are unhashable and cannot
    serve as group keys; an event therefore contributes to every state it lists.

    Args:
        df: DataFrame with 'state' (a string or a list of strings per row)
            and a numeric 'flood' column.

    Returns:
        DataFrame with the columns 'state' and 'flood' (mean interest).
    """
    per_state = df.explode('state')
    comparison_result = per_state.groupby(['state'])['flood'].mean().reset_index()
    return comparison_result

comparison_result = compare_trends_and_events(merged_df)
print(comparison_result)

# Store Data in MongoDB
try:
    db = client['CS210Data']
    trends_collection = db['google_trends']
    # pandas Period values cannot be encoded as BSON, so a Period 'date'
    # column is stored as text (e.g. '2017-08').
    storable_df = merged_df
    if 'date' in storable_df.columns and isinstance(storable_df['date'].dtype, pd.PeriodDtype):
        storable_df = storable_df.assign(date=storable_df['date'].astype(str))
    trends_data = storable_df.to_dict('records')
    if trends_data:
        trends_collection.insert_many(trends_data)
        print("Trends data inserted into MongoDB successfully.")
    else:
        print("No valid trend data to insert into MongoDB.")
except Exception as e:
    print("Could not connect to MongoDB:", e)

# Visualization
# NOTE(review): plotting pandas Period values directly is presumed to fail in
# seaborn/matplotlib; a Period 'date' column is converted to timestamps (start
# of each month) for the plot only. monthly_interest itself is left unchanged.
plot_interest = monthly_interest
if isinstance(monthly_interest['date'].dtype, pd.PeriodDtype):
    plot_interest = monthly_interest.assign(date=monthly_interest['date'].dt.to_timestamp())

plt.figure(figsize=(12, 6))
sns.lineplot(data=plot_interest, x='date', y='flood')
plt.title('Google Trends Interest Over Time for "Flood" (2010-2020)')
plt.xlabel('Date')
plt.ylabel('Interest')
plt.show()

# Average interest per state, as computed by compare_trends_and_events
plt.figure(figsize=(12, 6))
sns.barplot(data=comparison_result, x='state', y='flood')
plt.title('Average Google Trends Interest by State')
plt.xlabel('State')
plt.ylabel('Average Interest')
plt.xticks(rotation=90)
plt.show()

