In [2]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os 
import psycopg2 
import json
import re
import matplotlib.pyplot as plt
from collections import Counter
from typing import TypedDict
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Connection string for the jobs database; it must come from the environment,
# never from the notebook itself.
URL_DB = os.environ.get("URL_DB")
if not URL_DB:
	# Fail fast with an actionable message instead of handing None to psycopg2.
	raise RuntimeError("URL_DB is not set; define it in the environment or in a .env file")

# Module-level connection and cursor shared by the query cells below.
CONN = psycopg2.connect(URL_DB)
CURSOR = CONN.cursor()

In [None]:


# Only rows newer than this timestamp are fetched.
timestamp = "2024-05-01 00:00:00.000000"

# Pass the value as a bound parameter instead of formatting it into the SQL
# text, so quoting/escaping is handled by the driver.
CURSOR.execute(
	"SELECT * FROM main_jobs WHERE timestamp > %s",
	(timestamp,),
)
new_data = CURSOR.fetchall()

# fetchall() returns bare tuples; take the column names from the cursor so the
# frame can be addressed by name (e.g. df["location"]) instead of by position.
column_names = [column[0] for column in CURSOR.description]
df = pd.DataFrame(new_data, columns=column_names)


df.head(25)

In [6]:
def load_json_file(file_path: str):
	"""Read a JSON document from disk and return the decoded Python object.

	Args:
		file_path: Path of the JSON file to read.

	Returns:
		Whatever ``json.load`` produces for the file (dict, list, ...).

	Raises:
		OSError: If the file cannot be opened.
		json.JSONDecodeError: If the content is not valid JSON.
	"""
	# Decode explicitly as UTF-8 rather than relying on the platform default
	# encoding, so non-ASCII place names load identically on every machine.
	with open(file_path, 'r', encoding='utf-8') as file:
		return json.load(file)

def save_json_file(data: dict, file_path: str) -> None:
	"""Serialize ``data`` as JSON (4-space indentation) into ``file_path``.

	The target file is created or overwritten.
	"""
	with open(file_path, 'w') as handle:
		json.dump(data, fp=handle, indent=4)

In [None]:


def transform_data(input_data: dict) -> dict[str, dict[str, list[dict[str, list[str]]]]]:
	"""Simplify the data structure of 'all_locations.json' for better and easier mapping.

	Each country is reduced to a single-key dict of the form
	``{COUNTRY_NAME: [COUNTRY_CODE, CAPITAL, *SUBDIVISIONS]}`` with every string
	upper-cased. A capital or subdivision carrying the "NaN" marker (or any
	non-string value) is left out of the list instead of being emitted.

	Args:
		input_data: Mapping of continent -> ``{"Countries": [country, ...]}``
			where each country has the keys ``country_name``, ``country_code``,
			``capital_english`` and ``subdivisions``.

	Returns:
		Mapping of continent -> ``{"Countries": [{COUNTRY_NAME: [aliases...]}, ...]}``.

	Raises:
		KeyError: If a continent or country entry lacks one of the keys above.
	"""
	def _present(value) -> bool:
		# Missing values are compared against the literal string "NaN".
		return isinstance(value, str) and value != "NaN"

	result = {}
	for continent, data in input_data.items():
		countries = []
		for country in data["Countries"]:
			# Build the alias list fresh for every country so that a missing
			# capital can neither reuse the previous country's capital nor
			# reference an unassigned variable.
			aliases = [country["country_code"].upper()]

			capital = country["capital_english"]
			if _present(capital):
				aliases.append(capital.upper())

			subdivisions = country["subdivisions"]
			if isinstance(subdivisions, list):
				aliases.extend(sub["subdivisions_name"].upper() for sub in subdivisions)
			elif _present(subdivisions):
				aliases.append(subdivisions.upper())

			countries.append({country["country_name"].upper(): aliases})
		result[continent] = {"Countries": countries}

	return result

#Example usage:

"""
input_file = "/root/JobsCrawler/src/notebooks/all_locations.json"
output_file = "/root/JobsCrawler/src/notebooks/all_locations_transformed.json"

input_data = load_json_file(input_file)
transformed_data = transform_data(input_data)
save_json_file(transformed_data, output_file)

print("Data transformation complete. Result saved to", output_file)
"""

In [9]:
# Small hand-written fixture that replaces the database result so the mapping
# cells below can be exercised without a live connection. Each "location"
# value holds several space-separated place names.
data = {
    "title": ["Breaking News: Local Event", "Weather Update for the Week", "New Restaurant Opens Downtown"],
    "link": ["https://example.com/news1", "https://example.com/weather", "https://example.com/restaurant"],
    "description": ["A significant local event occurred today.", "Expect sunny skies and mild temperatures.", "Grand opening of a new Italian eatery."],
    "pubdate": ["2023-06-01", "2023-06-02", "2023-06-03"],
    "location": ["United States POLAND", "MEXICO INDIA", "GERMANY USA"],
    "timestamp": ["2023-06-01 09:00:00", "2023-06-02 08:30:00", "2023-06-03 12:00:00"],
}

# NOTE: this rebinds `df`, discarding any frame loaded by an earlier cell.
df = pd.DataFrame(data)

df

Unnamed: 0,title,link,description,pubdate,location,timestamp
0,Breaking News: Local Event,https://example.com/news1,A significant local event occurred today.,2023-06-01,United States POLAND,2023-06-01 09:00:00
1,Weather Update for the Week,https://example.com/weather,Expect sunny skies and mild temperatures.,2023-06-02,MEXICO INDIA,2023-06-02 08:30:00
2,New Restaurant Opens Downtown,https://example.com/restaurant,Grand opening of a new Italian eatery.,2023-06-03,GERMANY USA,2023-06-03 12:00:00


In [10]:



# Remember which source row every word will come from. The guard matters when
# the cell is re-run: after the explode below, df.index is the per-word index,
# and overwriting original_index with it would put every word in its own group.
if 'original_index' not in df.columns:
    df['original_index'] = df.index

# Normalize the location text: drop commas and parentheses, turn "|" into a
# separator, then split on whitespace into a list of words per row.
df["location"] = (
    df["location"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace("|", " ", regex=False)
    .str.strip()
    .str.split()
)

# One row per word; the other columns are repeated for every word.
df = df.explode("location").reset_index(drop=True)

df.head(25)

#df.to_csv("/root/JobsCrawler/src/notebooks/all_location_words.csv")


Unnamed: 0,title,link,description,pubdate,location,timestamp,original_index
0,Breaking News: Local Event,https://example.com/news1,A significant local event occurred today.,2023-06-01,United,2023-06-01 09:00:00,0
1,Breaking News: Local Event,https://example.com/news1,A significant local event occurred today.,2023-06-01,States,2023-06-01 09:00:00,0
2,Breaking News: Local Event,https://example.com/news1,A significant local event occurred today.,2023-06-01,POLAND,2023-06-01 09:00:00,0
3,Weather Update for the Week,https://example.com/weather,Expect sunny skies and mild temperatures.,2023-06-02,MEXICO,2023-06-02 08:30:00,1
4,Weather Update for the Week,https://example.com/weather,Expect sunny skies and mild temperatures.,2023-06-02,INDIA,2023-06-02 08:30:00,1
5,New Restaurant Opens Downtown,https://example.com/restaurant,Grand opening of a new Italian eatery.,2023-06-03,GERMANY,2023-06-03 12:00:00,2
6,New Restaurant Opens Downtown,https://example.com/restaurant,Grand opening of a new Italian eatery.,2023-06-03,USA,2023-06-03 12:00:00,2


In [30]:
# Get all unique words
unique_words = df["location"].unique()

# Create a new DataFrame with these unique words. This must be a live
# statement: the cell that calls simple_add_location_tags(unique_words_df, ...)
# further down raises NameError on a fresh kernel otherwise.
unique_words_df = pd.DataFrame({"unique_word": unique_words})


#unique_words_df.to_csv("/root/JobsCrawler/src/notebooks/unique_words.csv")



## Sliding-window approach

In [12]:

class Countries(TypedDict):
	"""Typed shape for one country entry: a country name and its location strings."""
	# Name of the country.
	country_name: str
	# Location strings associated with the country.
	locations: list[str]

class WorldLocations(TypedDict):
	"""Typed shape for one continent entry: its name, areas and countries.

	NOTE(review): the location data consumed by find_tag_in_location_data is
	iterated with ``.items()`` and read through the ``'Zones'`` and
	``'Countries'`` keys, which does not match the fields declared here -
	confirm which shape is intended.
	"""
	# Name of the continent.
	continent: str
	# Area names belonging to the continent.
	areas: list[str]
	# Country entries belonging to the continent.
	countries: list[Countries]




def find_tag_in_location_data(word: str, location_data: dict[str, dict[str, list]]) -> str:
	"""Map a word (or phrase) to the tag of the location it names.

	The lookup is case-insensitive on ``word``. For each continent, in the
	iteration order of ``location_data``, the word is compared against the
	continent name, then its zones, then every country name and country alias.

	Args:
		word: Candidate location text, e.g. ``"usa"`` or ``"United States"``.
		location_data: Mapping of continent name -> payload, where the payload
			may hold ``"Zones"`` (list of zone names) and ``"Countries"`` (list
			of single-key dicts ``{COUNTRY_NAME: [alias, ...]}``). Either key
			may be absent.

	Returns:
		The upper-cased word when it names a continent or zone, the country
		name as stored in ``location_data`` when it names a country or one of
		its aliases, and ``""`` when nothing matches.
	"""
	word_upper = word.upper()
	for continent, payload in location_data.items():
		if word_upper == continent.upper():
			return word_upper

		# "Zones" is optional: files written by transform_data only carry
		# "Countries", so a plain payload['Zones'] would raise KeyError.
		for zone in payload.get('Zones', []):
			if word_upper == zone.upper():
				return word_upper

		for country in payload.get('Countries', []):
			for country_name, locations in country.items():
				if word_upper == country_name or word_upper in locations:
					return country_name
	return ""

def get_location_tags(df: pd.DataFrame, json_file_path: str, max_words: int = 2) -> pd.DataFrame:
	"""Tag every word of an exploded ``location`` column with a known location.

	Words are scanned in row order. Each word is first looked up on its own; if
	that fails, it is joined with the following word(s) of the same
	``original_index`` (up to ``max_words`` words in total) and looked up again,
	so multi-word names such as "United States" resolve. Every row that took
	part in a successful multi-word match receives the same tag; rows that
	cannot be matched receive ``np.nan``.

	Args:
		df: Frame with one word per row in ``location`` and the source row id in
			``original_index``. It is modified in place: a ``location_tags``
			column is added or overwritten.
		json_file_path: Path of the location JSON file passed to
			``load_json_file``.
		max_words: Largest number of consecutive words tried as one phrase.
			The default of 2 reproduces the single-word / word-pair behavior.

	Returns:
		The same ``df`` object, now carrying the ``location_tags`` column.

	Raises:
		ValueError: If ``max_words`` is smaller than 1.
	"""
	if max_words < 1:
		raise ValueError("max_words must be at least 1")

	location_data = load_json_file(json_file_path)

	# Pull both columns into plain lists once. This keeps the scan purely
	# positional (no mixing of .iloc positions with .loc labels, which breaks
	# on any index other than 0..n-1) and avoids a slow .iloc lookup per row.
	words = df["location"].astype(str).tolist()
	groups = df["original_index"].tolist()
	total = len(words)

	result = []
	i = 0
	while i < total:
		tag = ""
		consumed = 1
		for size in range(1, max_words + 1):
			end = i + size
			# Never build a phrase that runs past the data or crosses into
			# another source row.
			if size > 1 and (end > total or groups[end - 1] != groups[i]):
				break
			tag = find_tag_in_location_data(" ".join(words[i:end]), location_data)
			if tag:
				consumed = size
				break

		if tag:
			# Every word of the matched phrase gets the same tag.
			result.extend([tag] * consumed)
		else:
			result.append(np.nan)
		i += consumed

	df['location_tags'] = result
	return df

# Transformed location file (the output of transform_data, see the example usage above).
json_file_path = '/root/JobsCrawler/src/notebooks/all_locations_transformed.json'
result_df = get_location_tags(df, json_file_path)

#result_df.to_csv("/root/JobsCrawler/src/notebooks/country_mapping1.csv")

# Show a bounded, richly rendered preview instead of print()-ing the whole frame.
# The per-column NaN count is computed and printed in the next cell.
result_df.head(25)


current_word United
next_word States
compound_word United States
current_word POLAND
current_word MEXICO
current_word INDIA
current_word GERMANY
current_word USA
                           title                            link  \
0     Breaking News: Local Event       https://example.com/news1   
1     Breaking News: Local Event       https://example.com/news1   
2     Breaking News: Local Event       https://example.com/news1   
3    Weather Update for the Week     https://example.com/weather   
4    Weather Update for the Week     https://example.com/weather   
5  New Restaurant Opens Downtown  https://example.com/restaurant   
6  New Restaurant Opens Downtown  https://example.com/restaurant   

                                 description     pubdate location  \
0  A significant local event occurred today.  2023-06-01   United   
1  A significant local event occurred today.  2023-06-01   States   
2  A significant local event occurred today.  2023-06-01   POLAND   
3  Expect sunny s

In [13]:

# Number of missing values in each column of the tagged frame; the
# location_tags entry is the number of words that could not be mapped.
nan_count_per_column = result_df.isna().sum()

print(nan_count_per_column)


title             0
link              0
description       0
pubdate           0
location          0
timestamp         0
original_index    0
location_tags     0
dtype: int64


### Group the words back into one row per original record

In [15]:

def clean_and_split(s):
	"""Return every single-quoted substring of ``s``, in order of appearance.

	For example ``"['UNITED STATES' 'POLAND']"`` yields
	``['UNITED STATES', 'POLAND']``. Text outside single quotes is ignored.
	"""
	quoted = re.compile(r"'([^']*)'")
	return [match.group(1) for match in quoted.finditer(s)]

# Rebuild one row per original record: the words are joined back into the
# location text and the distinct tags are collected, in order of appearance,
# into a list. Unmapped words show up as the tag 'NaN'.
#
# The tags are aggregated straight into Python lists. Formatting the array
# with str() and re-parsing it with a regex corrupts any tag that contains an
# apostrophe and depends on numpy's print formatting.
grouped = (
    result_df
    .assign(
        location=result_df['location'].astype(str),
        location_tags=result_df['location_tags'].fillna('NaN'),
    )
    # groupby sorts by original_index, which preserves the original row order.
    .groupby('original_index')
    .agg({
        'location': lambda words: ' '.join(words),
        'location_tags': lambda tags: list(tags.unique()),
    })
    # original_index is only needed for grouping; drop it from the result.
    .reset_index(drop=True)
)

# Clean up the location column (strip stray brackets and single quotes).
grouped['location'] = grouped['location'].apply(lambda x: re.sub(r"[\[\]']", "", x))

grouped.head(40)

Unnamed: 0,location,location_tags
0,United States POLAND,"[UNITED STATES, POLAND]"
1,MEXICO INDIA,"[MEXICO, INDIA]"
2,GERMANY USA,"[GERMANY, UNITED STATES]"


#### Plot it 

In [None]:

# Flatten the per-row tag lists into one flat list of tags.
all_tags = grouped['location_tags'].explode().tolist()

print(all_tags)

# Tally the tags.
tag_counts = Counter(all_tags)

# (tag, count) pairs ordered by count, highest first.
sorted_tags = tag_counts.most_common()

# Unzip the pairs into parallel sequences for plotting.
tags, counts = zip(*sorted_tags)

# Bar chart of how often each tag occurs.
fig, ax = plt.subplots(figsize=(55, 12))
ax.bar(tags, counts)
ax.set_title('Tag Counts')
ax.set_xlabel('Tags')
ax.set_ylabel('Count')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()

# Render the figure.
plt.show()


## Do the same for the unique words, to see all the words that are not currently mapped

In [None]:

def simple_add_location_tags(df: pd.DataFrame, json_file_path: str) -> pd.DataFrame:
	"""Tag each value of the first column of ``df`` independently.

	Every value is converted to ``str`` and looked up on its own (no
	multi-word matching). The tag, or ``np.nan`` when there is no match, is
	written to a ``location_tags`` column on ``df`` itself, and that same
	frame is returned.
	"""
	location_data = load_json_file(json_file_path)

	tags = []
	for row in range(len(df)):
		found = find_tag_in_location_data(str(df.iloc[row, 0]), location_data)
		# An empty string means "no match"; record it as a missing value.
		tags.append(found if found else np.nan)

	df['location_tags'] = tags
	return df

# Same transformed location file that the sliding-window cell uses.
json_file_path = '/root/JobsCrawler/src/notebooks/all_locations_transformed.json'


# Tag every unique word on its own; simple_add_location_tags adds the
# location_tags column to unique_words_df in place and returns that frame.
unique_mapped_words_df = simple_add_location_tags(unique_words_df, json_file_path)

# Persist the word -> tag mapping for inspection outside the notebook.
unique_mapped_words_df.to_csv("/root/JobsCrawler/src/notebooks/unique_word_mapping.csv")


unique_mapped_words_df


In [None]:
# Display the mapped unique words again (same frame the previous cell ends with).
unique_mapped_words_df


In [None]:
# Number of missing values per column; the location_tags entry is the number
# of unique words that are not mapped to any location yet.
nan_count_per_column = unique_mapped_words_df.isna().sum()

print(nan_count_per_column)