In [1]:
from pymongo import MongoClient
import pandas as pd
import time
import requests
import json
from getpass import getpass
import os
import folium
from folium import Choropleth, Circle, Marker, Icon, Map
from folium.plugins import HeatMap, MarkerCluster
import geopandas as gpd
from cartoframes.viz import Map, Layer, popup_element
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file before reading them; otherwise this
# lookup returns None on a fresh kernel unless "token" is already exported in
# the process environment.
load_dotenv()
token = os.getenv("token")

In [3]:
# Load key=value pairs from a .env file into the process environment.
load_dotenv()

True

In [4]:
# Create a client for the local MongoDB server, then pick the database and the
# collection that the query function below reads from.
client = MongoClient("localhost:27017")
db = client.get_database("Ironhack")
c = db["companies"]

In [17]:
def companies_data(category_code="games_video", collection=None):
    """Build a DataFrame with one row per funding round of matching companies.

    Queries the companies collection for documents whose ``category_code``
    field equals ``category_code`` and flattens each document: the first
    entry of ``offices`` supplies the location columns, and every funding
    round with a non-missing ``raised_amount`` produces one output row.

    Companies without offices, or whose first office lacks a latitude or a
    longitude, are skipped, as are funding rounds without a raised amount.
    No filter on employee count or raised amount is applied.

    Args:
        category_code: Value matched against the ``category_code`` field.
        collection: Object exposing ``find(query, projection)``. Defaults to
            the notebook-level collection ``c``.

    Returns:
        pandas.DataFrame with the columns ``name``,
        ``funding_rounds.raised_amount``, ``number_of_employees``,
        ``category_code``, ``latitude``, ``longitude``, ``country_code`` and
        ``city``. The columns are present even when nothing matches.
    """
    columns = [
        "name",
        "funding_rounds.raised_amount",
        "number_of_employees",
        "category_code",
        "latitude",
        "longitude",
        "country_code",
        "city",
    ]

    # Compare against None explicitly: pymongo collections do not support
    # truth-value testing.
    if collection is None:
        collection = c

    projection = {"_id": 0, "name": 1, "number_of_employees": 1, "category_code": 1, "offices": 1, "funding_rounds.raised_amount": 1}

    rows = []
    for doc in collection.find({"category_code": category_code}, projection):
        # `or []` covers fields stored as null as well as missing ones.
        offices = doc.get("offices") or []
        if not offices:
            continue

        # Only the first listed office is used to locate the company.
        office = offices[0]
        latitude = office.get("latitude")
        longitude = office.get("longitude")
        if latitude is None or longitude is None:
            continue

        for funding_round in doc.get("funding_rounds") or []:
            raised_amount = funding_round.get("raised_amount")
            # Skip rounds whose amount is missing (None or NaN).
            if pd.notna(raised_amount):
                rows.append({
                    "name": doc.get("name"),
                    "funding_rounds.raised_amount": raised_amount,
                    "number_of_employees": doc.get("number_of_employees"),
                    "category_code": doc.get("category_code"),
                    "latitude": latitude,
                    "longitude": longitude,
                    "country_code": office.get("country_code"),
                    "city": office.get("city"),
                })

    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Run the query and keep the flattened result (one row per funding round)
# as the working DataFrame `df`.
df = companies_data()
df

Unnamed: 0,name,funding_rounds.raised_amount,number_of_employees,category_code,latitude,longitude,country_code,city
0,Lala,5700000,,games_video,37.451151,-122.154369,USA,Palo Alto
1,Lala,9400000,,games_video,37.451151,-122.154369,USA,Palo Alto
2,Lala,20000000,,games_video,37.451151,-122.154369,USA,Palo Alto
3,Lala,9050000,,games_video,37.451151,-122.154369,USA,Palo Alto
4,Joost,45000000,0.0,games_video,40.746497,-74.009447,USA,New York
...,...,...,...,...,...,...,...,...
483,Exent,3000000,100.0,games_video,40.752380,-74.005568,USA,New York
484,PopCap Games,22500000,,games_video,47.616169,-122.345190,USA,Seattle
485,Tiny Speck,1500000,45.0,games_video,49.282455,-123.109217,CAN,Vancouver
486,Tiny Speck,5000000,45.0,games_video,49.282455,-123.109217,CAN,Vancouver


### Find the city with the most matches

In [18]:
# Number of rows recorded for each city.
city_counts = df["city"].value_counts()

# Label and size of the largest group.
most_common_city, most_common_count = city_counts.idxmax(), city_counts.max()

print(f"The city with more coincidences is'{most_common_city}' with {most_common_count} matches.")

The city with more coincidences is'San Francisco' with 79 matches.


### Show the rows for the city with the most matches

In [19]:
# Most frequent city value; mode() returns a Series, so take its first entry.
most_common_city = df["city"].mode()[0]

# Keep only the rows for that city and rebind `df` to the subset.
filtered_df = df.loc[df["city"].eq(most_common_city)]
df = filtered_df
df

Unnamed: 0,name,funding_rounds.raised_amount,number_of_employees,category_code,latitude,longitude,country_code,city
6,Kyte,2250000,40.0,games_video,37.788482,-122.409173,USA,San Francisco
7,Kyte,15000000,40.0,games_video,37.788482,-122.409173,USA,San Francisco
8,Kyte,6100000,40.0,games_video,37.788482,-122.409173,USA,San Francisco
20,Ustream,1700000,250.0,games_video,37.392936,-122.079480,USA,San Francisco
21,Ustream,11800000,250.0,games_video,37.392936,-122.079480,USA,San Francisco
...,...,...,...,...,...,...,...,...
412,Heyzap,3000000,,games_video,37.790554,-122.404149,USA,San Francisco
413,Heyzap,4300000,,games_video,37.790554,-122.404149,USA,San Francisco
442,Unity Technologies,5500000,200.0,games_video,55.692461,12.530107,USA,San Francisco
443,Unity Technologies,12000000,200.0,games_video,55.692461,12.530107,USA,San Francisco


### We remove Zinch: its record lists San Francisco as the city, but its latitude and longitude point to a different city very far away.

In [20]:
# Drop every row that belongs to the company named "Zinch".
filtered_df = df.loc[df["name"].ne("Zinch")]

# Replace the working DataFrame with the filtered one.
df = filtered_df
df

Unnamed: 0,name,funding_rounds.raised_amount,number_of_employees,category_code,latitude,longitude,country_code,city
6,Kyte,2250000,40.0,games_video,37.788482,-122.409173,USA,San Francisco
7,Kyte,15000000,40.0,games_video,37.788482,-122.409173,USA,San Francisco
8,Kyte,6100000,40.0,games_video,37.788482,-122.409173,USA,San Francisco
20,Ustream,1700000,250.0,games_video,37.392936,-122.079480,USA,San Francisco
21,Ustream,11800000,250.0,games_video,37.392936,-122.079480,USA,San Francisco
...,...,...,...,...,...,...,...,...
412,Heyzap,3000000,,games_video,37.790554,-122.404149,USA,San Francisco
413,Heyzap,4300000,,games_video,37.790554,-122.404149,USA,San Francisco
442,Unity Technologies,5500000,200.0,games_video,55.692461,12.530107,USA,San Francisco
443,Unity Technologies,12000000,200.0,games_video,55.692461,12.530107,USA,San Francisco


In [21]:
def export_to_json_city_max_count(df, output_directory, output_file):
    """Write the rows of the most frequent city in ``df`` to a JSON file.

    The city that appears most often in the ``city`` column is selected, the
    rows for that city are converted to a list of records, and the list is
    written to ``output_directory/output_file``. The directory is created if
    it does not exist.

    Missing values (NaN/None) are written as JSON ``null``; ``json.dump``
    would otherwise emit the bare token ``NaN``, which is not valid JSON.

    Args:
        df: pandas.DataFrame containing a ``city`` column.
        output_directory: Directory that receives the file.
        output_file: File name inside ``output_directory``.

    Returns:
        True when the file was written. False when ``df`` holds no city
        values or when serialization or writing fails; the reason is printed.
    """
    # mode() yields an empty Series when there are no non-null city values.
    city_modes = df['city'].mode()
    if city_modes.empty:
        print("Error: the DataFrame contains no city values to export")
        return False

    most_common_city = city_modes.iloc[0]
    filtered_df = df[df['city'] == most_common_city]

    try:
        records = [
            {
                key: (None if pd.api.types.is_scalar(value) and pd.isna(value) else value)
                for key, value in record.items()
            }
            for record in filtered_df.to_dict(orient="records")
        ]
        # Serialize before opening the file so a failure here cannot leave a
        # truncated file behind.
        payload = json.dumps(records, indent=4)

        full_file_path = os.path.join(output_directory, output_file)
        os.makedirs(output_directory, exist_ok=True)
        with open(full_file_path, 'w', encoding='utf-8') as json_file:
            json_file.write(payload)
        print(f"Data exported to '{full_file_path}'")
        return True
    except Exception as e:
        print(f"Error: {e}")
        return False

# Destination of the export. Set GEOLOCATION_DATA_DIR (for example in the .env
# file) to write somewhere else; the fallback is the original local folder.
output_directory = os.getenv(
    "GEOLOCATION_DATA_DIR",
    r"C:\Users\photo\Desktop\Ironhack\projects\project-III-geolocation\data",
)
output_file = "company_data.json"

export_to_json_city_max_count(df, output_directory, output_file)

Data exported to 'C:\Users\photo\Desktop\Ironhack\projects\project-III-geolocation\data\company_data.json'


True