In [1]:
import matplotlib.pyplot as plt
from countryinfo import CountryInfo
import psycopg2
import pandas as pd
import logging
import json
import os
from datetime import datetime, timedelta
from dotenv import load_dotenv
import pycountry
import time

# Load the variables declared in the local .env file before reading them below.
load_dotenv('.env')

# Connection settings; each one is None when the variable is not set.
# NOTE(review): presumably the PostgreSQL credentials for psycopg2 — they are
# not used by any of the cells visible here; confirm against later cells.
user, password, host, port, database = (
    os.environ.get(key)
    for key in ("user", "password", "host", "port", "database")
)

# Filesystem locations read by later cells: SAVE_PATH is the directory holding
# the country lookup JSON, E5_BASE_V2_DATA is the parquet file loaded below.
SAVE_PATH = os.environ.get("SAVE_PATH")
E5_BASE_V2_DATA = os.environ.get("E5_BASE_V2_DATA")


In [None]:
def filter_last_two_weeks(df:pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose ``timestamp`` falls within the last 14 days.

    The comparison is made on calendar dates (time of day is ignored) and is
    inclusive of the cutoff day, which is today's local date minus 14 days.

    Args:
        df: Frame with a datetime-like ``timestamp`` column (the ``.dt``
            accessor is used on it).

    Returns:
        The subset of ``df`` dated on or after the cutoff; ``df`` itself is
        not modified.
    """
    cutoff = datetime.now().date() - timedelta(days=14)

    # Reduce each timestamp to its calendar date before comparing with the cutoff.
    is_recent = df["timestamp"].dt.date >= cutoff

    return df[is_recent]

In [None]:
# Parquet file to analyse, taken from the E5_BASE_V2_DATA setting.
embeddings_path = E5_BASE_V2_DATA

# Complete dataset exactly as stored on disk.
df_unfiltered = pd.read_parquet(embeddings_path)

# Working frame: only the rows kept by filter_last_two_weeks.
df = df_unfiltered.pipe(filter_last_two_weeks)

In [None]:
def filter_df_per_country(df: pd.DataFrame, country_name:str, countries_path=None) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``location`` mentions ``country_name``.

    A row matches when its lowercased ``location`` contains, as a plain
    substring, any of these lowercased terms taken from the lookup file for
    the requested country: the country name, its country code, its English
    capital name, and the code and name of each of its subdivisions.

    The lookup file is JSON shaped as
    ``{continent: {"Countries": [{"country_name", "country_code",
    "capital_english", "subdivisions": [{"subdivisions_code",
    "subdivisions_name"}, ...]}, ...]}}``.

    Args:
        df: Frame with a text ``location`` column. It is NOT modified.
        country_name: Country to look up; compared for exact equality with
            the ``country_name`` entries of the lookup file.
        countries_path: Optional path to the lookup JSON. When omitted, the
            file ``continent_countries_with_capitals.json`` inside
            ``SAVE_PATH`` is used.

    Returns:
        A new frame with the matching rows, whose ``location`` column is
        lowercased. Empty when the country is not found or nothing matches.

    Raises:
        TypeError: if ``countries_path`` is omitted and ``SAVE_PATH`` is unset.
        OSError: if the lookup file cannot be opened.
    """
    if countries_path is None:
        countries_path = os.path.join(SAVE_PATH, 'continent_countries_with_capitals.json')

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(countries_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Gather every raw term recorded for the requested country.
    raw_terms = []
    for details in data.values():
        for country in details['Countries']:
            if country['country_name'] != country_name:
                continue
            raw_terms.extend([
                country['country_name'],
                country['country_code'],
                country['capital_english'],
            ])
            for subdivision in country['subdivisions']:
                raw_terms.extend([
                    subdivision['subdivisions_code'],
                    subdivision['subdivisions_name'],
                ])

    # Missing (None) entries cannot be lowercased, and a blank term would match
    # every row, so both are dropped; the set removes duplicate terms.
    search_terms = {
        term.lower()
        for term in raw_terms
        if isinstance(term, str) and term.strip()
    }

    # Lowercase into a separate Series so the caller's frame is left untouched.
    locations = df['location'].str.lower()

    mask = pd.Series(False, index=df.index)
    for term in search_terms:
        # regex=False: terms are literal text, so characters such as "(" or "."
        # must not be interpreted as regular-expression syntax.
        mask |= locations.str.contains(term, na=False, regex=False)

    # Filter first, then attach the lowercased column to the (smaller) result.
    return df[mask].assign(location=locations[mask])

In [None]:
result_df = filter_df_per_country(df, "Mexico")

# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; passing it to print() appended a stray "None" after the frame.
result_df.info()
print(result_df)

In [None]:
# look for the values in every column

# `country_values` only ever existed as a local variable inside
# filter_df_per_country, so referencing it here raised NameError on a fresh
# kernel. It is rebuilt from the same lookup file instead.
# NOTE(review): "Mexico" mirrors the country used in the previous cell — confirm
# this is the intended country for the all-column search.
country_name = "Mexico"
with open(os.path.join(SAVE_PATH, 'continent_countries_with_capitals.json'), 'r', encoding='utf-8') as f:
    country_data = json.load(f)

country_values = []
for details in country_data.values():
    for country in details['Countries']:
        if country['country_name'] != country_name:
            continue
        country_values += [
            country['country_name'],
            country['country_code'],
            country['capital_english'],
        ]
        for subdivision in country['subdivisions']:
            country_values += [
                subdivision['subdivisions_code'],
                subdivision['subdivisions_name'],
            ]

# Convert all country values to lowercase. Missing (None) values cannot be
# lowercased and blank values would match every cell, so both are skipped.
country_values = [
    value.lower()
    for value in country_values
    if isinstance(value, str) and value.strip()
]

# Lowercase every string cell. DataFrame.applymap is deprecated in favour of
# DataFrame.map (pandas >= 2.1); fall back to applymap on older versions.
lower_cells = df.map if hasattr(pd.DataFrame, "map") else df.applymap
df = lower_cells(lambda s: s.lower() if isinstance(s, str) else s)


def _mentions_country(cell):
    """Return True when ``cell`` is a string containing any country value."""
    # Plain substring test: the values are literal text, not regex patterns,
    # and non-string cells (numbers, arrays, missing values) never match.
    return isinstance(cell, str) and any(value in cell for value in country_values)


# Filter DataFrame
# Create a mask with all False
mask = pd.Series(False, index=df.index)

# Update the mask if any object-dtype column contains any of the country values
for column in df.select_dtypes(include=[object]).columns:
    mask |= df[column].map(_mentions_country).astype(bool)

# Filter DataFrame
filtered_df = df[mask]