In [24]:
from pymongo import MongoClient
import pandas as pd
import time
import requests
import json
from getpass import getpass
import os
import folium
from folium import Choropleth, Circle, Marker, Icon, Map
from folium.plugins import HeatMap, MarkerCluster
import geopandas as gpd
from cartoframes.viz import Map, Layer, popup_element
from dotenv import load_dotenv

In [25]:
# Load the .env file before reading the variable: on a fresh kernel, calling
# os.getenv first returns None for anything that is only defined in .env
# (presumably where the token lives -- confirm). Calling load_dotenv() again
# in a later cell is harmless.
load_dotenv()
token = os.getenv("token")

In [26]:
# Load variables from a .env file into the process environment; as the last
# expression of the cell, its return value (True in the recorded run) is displayed.
load_dotenv()

True

In [27]:
# Open a client against the MongoDB server on localhost:27017, then grab the
# "companies" collection of the "Ironhack" database. `c` is the handle the
# query helper below reads from.
client = MongoClient("localhost:27017")
db = client.get_database("Ironhack")
c = db["companies"]

In [28]:
def companies_data(collection=None):
    """Build one row per funding round for the selected companies.

    Selects companies whose ``category_code`` is ``"games_video"`` and whose
    ``number_of_employees`` lies between 77 and 100 (both inclusive). For each
    match, the location is taken from the FIRST entry of ``offices`` only;
    companies with no offices, or whose first office lacks latitude or
    longitude, are skipped. One row is emitted per funding round that has a
    non-missing ``raised_amount``.

    Parameters
    ----------
    collection : optional
        Any object exposing ``find(filter, projection)`` that yields
        dict-like documents. Defaults to the module-level collection ``c``.

    Returns
    -------
    pandas.DataFrame
        Columns: name, funding_rounds.raised_amount, number_of_employees,
        category_code, latitude, longitude, country_code, city. The columns
        are present even when nothing matches, so downstream cells that
        index ``df['city']`` do not fail on an empty result.
    """
    if collection is None:
        collection = c

    # Equivalent to the former {"$and": [{"$or": [<category>]}, <employees>]}:
    # a single-element $or and an $and over distinct keys collapse to a plain filter.
    query = {
        "category_code": "games_video",
        "number_of_employees": {"$gte": 77, "$lte": 100},
    }
    projection = {"_id": 0, "name": 1, "number_of_employees": 1, "category_code": 1, "offices": 1, "funding_rounds.raised_amount": 1}

    columns = [
        "name",
        "funding_rounds.raised_amount",
        "number_of_employees",
        "category_code",
        "latitude",
        "longitude",
        "country_code",
        "city",
    ]

    processed_data = []
    for doc in collection.find(query, projection):
        # `or []` also covers documents where the field is present but null.
        offices = doc.get("offices") or []
        if not offices:
            continue

        office = offices[0]
        latitude = office.get("latitude")
        longitude = office.get("longitude")
        if latitude is None or longitude is None:
            continue

        for funding_round in doc.get("funding_rounds") or []:
            raised_amount = funding_round.get("raised_amount")
            if pd.isna(raised_amount):
                # Rounds without a disclosed amount carry no information here.
                continue
            processed_data.append({
                "name": doc.get("name"),
                "funding_rounds.raised_amount": raised_amount,
                "number_of_employees": doc.get("number_of_employees"),
                "category_code": doc.get("category_code"),
                "latitude": latitude,
                "longitude": longitude,
                "country_code": office.get("country_code"),
                "city": office.get("city"),
            })

    # Passing `columns` keeps the schema stable when `processed_data` is empty.
    return pd.DataFrame(processed_data, columns=columns)

# Run the query and keep the result as the working DataFrame; the bare `df`
# on the last line makes the notebook render it as the cell output.
df = companies_data()
df

Unnamed: 0,name,funding_rounds.raised_amount,number_of_employees,category_code,latitude,longitude,country_code,city
0,hi5,20000000,100,games_video,37.788668,-122.400558,USA,San Francisco
1,hi5,15000000,100,games_video,37.788668,-122.400558,USA,San Francisco
2,hi5,3000000,100,games_video,37.788668,-122.400558,USA,San Francisco
3,hi5,14000000,100,games_video,37.788668,-122.400558,USA,San Francisco
4,mig33,10000000,100,games_video,37.580304,-122.343679,SGP,Singapore
5,mig33,13500000,100,games_video,37.580304,-122.343679,SGP,Singapore
6,mig33,8900000,100,games_video,37.580304,-122.343679,SGP,Singapore
7,Social Gaming Network,15000000,100,games_video,37.446823,-122.161523,USA,Los Angeles
8,Social Gaming Network,100000,100,games_video,37.446823,-122.161523,USA,Los Angeles
9,Social Gaming Network,2000000,100,games_video,37.446823,-122.161523,USA,Los Angeles


### Find the city with the most matches

In [29]:
# Tally rows per city, then read off the top label together with its count.
city_counts = df["city"].value_counts()
most_common_city, most_common_count = city_counts.idxmax(), city_counts.max()

print(f"The city with more coincidences is'{most_common_city}' with {most_common_count} matches.")

The city with more coincidences is'San Francisco' with 4 matches.


### Filter the DataFrame down to the city with the most matches

In [30]:
# mode() lists the most frequent value(s) of the column; take the first one.
most_common_city = df["city"].mode()[0]

# Keep only that city's rows and make the result the working frame.
df = filtered_df = df.loc[df["city"].eq(most_common_city)]
df

Unnamed: 0,name,funding_rounds.raised_amount,number_of_employees,category_code,latitude,longitude,country_code,city
0,hi5,20000000,100,games_video,37.788668,-122.400558,USA,San Francisco
1,hi5,15000000,100,games_video,37.788668,-122.400558,USA,San Francisco
2,hi5,3000000,100,games_video,37.788668,-122.400558,USA,San Francisco
3,hi5,14000000,100,games_video,37.788668,-122.400558,USA,San Francisco


### We exclude the company Zinch: although the data lists it in San Francisco, its latitude and longitude place it in a different city very far away, so we remove it.

In [31]:
# Drop every row whose company name is Zinch.
filtered_df = df.loc[df["name"].ne("Zinch")]

# Replace the working DataFrame with the filtered one.
df = filtered_df
df

Unnamed: 0,name,funding_rounds.raised_amount,number_of_employees,category_code,latitude,longitude,country_code,city
0,hi5,20000000,100,games_video,37.788668,-122.400558,USA,San Francisco
1,hi5,15000000,100,games_video,37.788668,-122.400558,USA,San Francisco
2,hi5,3000000,100,games_video,37.788668,-122.400558,USA,San Francisco
3,hi5,14000000,100,games_video,37.788668,-122.400558,USA,San Francisco


In [32]:
def export_to_json_city_max_count(df, output_directory, output_file):
    """Write the rows of the most frequent city in ``df`` to a JSON file.

    The rows whose ``city`` equals the most common city are serialised as a
    list of records (one object per row, indented by 4 spaces) into
    ``output_directory/output_file``. The directory is created if missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that should contain a ``city`` column.
    output_directory : str
        Directory that will hold the exported file.
    output_file : str
        File name of the export inside ``output_directory``.

    Returns
    -------
    bool
        True when the file was written. False when there is no city to
        select (empty frame, missing or all-null ``city`` column) or when
        writing fails; in both cases a message is printed instead of raising.
    """
    def _json_safe(value):
        # NaN is the only float that differs from itself. json.dump would emit
        # it as the bare token NaN, which strict JSON parsers reject, so it is
        # written as null instead.
        if isinstance(value, float) and value != value:
            return None
        return value

    # Find the city that appears the most times; bail out cleanly when there
    # is none, so the True/False contract also covers empty input.
    city_modes = df['city'].mode() if 'city' in df.columns else None
    if city_modes is None or len(city_modes) == 0:
        print("Error: no city values available to export")
        return False
    most_common_city = city_modes.iloc[0]

    # Filter the DataFrame to include only the rows with the most common city
    filtered_df = df[df['city'] == most_common_city]
    records = [
        {key: _json_safe(value) for key, value in row.items()}
        for row in filtered_df.to_dict(orient="records")
    ]

    try:
        full_file_path = os.path.join(output_directory, output_file)
        os.makedirs(output_directory, exist_ok=True)
        # Explicit encoding so the file does not depend on the platform default.
        with open(full_file_path, 'w', encoding="utf-8") as json_file:
            json.dump(records, json_file, indent=4)
        print(f"Data exported to '{full_file_path}'")
        return True
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure
        # through the return value rather than interrupting the notebook.
        print(f"Error: {e}")
        return False

# The export location can be overridden through the COMPANY_DATA_DIR
# environment variable (e.g. from .env), so the notebook is not tied to one
# machine's folder layout. Without the variable, the original path is used.
output_directory = os.getenv(
    "COMPANY_DATA_DIR",
    r"C:\Users\photo\Desktop\Ironhack\projects\project-III-geolocation\data",
)
output_file = "company_data.json"

export_to_json_city_max_count(df, output_directory, output_file)

Data exported to 'C:\Users\photo\Desktop\Ironhack\projects\project-III-geolocation\data\company_data.json'


True