In [137]:
import pandas as pd
from pymongo import MongoClient
from bson.json_util import dumps
import re

In [138]:
client = MongoClient("mongodb://localhost/datamad0320")
db = client.get_database()

In [139]:
# Voy a limpiar la base de datos companies
all_companies = db.companies.find({},{"name":1,"offices":1,"total_money_raised":1,"number_of_employees":1,
                                   "founded_year":1,"category_code":1})

In [140]:
df = pd.DataFrame(all_companies)
display(df.head(),df.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297d8b,AdventNet,enterprise,600.0,1996.0,$0,"[{'description': 'Headquarters', 'address1': '..."
1,52cdef7c4bab8bd675297d92,Flektor,games_video,,,$0,"[{'description': None, 'address1': '8536 Natio..."
2,52cdef7c4bab8bd675297d8c,Zoho,software,1600.0,2005.0,$0,"[{'description': 'Headquarters', 'address1': '..."
3,52cdef7c4bab8bd675297d8a,Wetpaint,web,47.0,2005.0,$39.8M,"[{'description': '', 'address1': '710 - 2nd Av..."
4,52cdef7c4bab8bd675297d93,Fox Interactive Media,web,0.0,1979.0,$0,"[{'description': '', 'address1': '407 N Maple ..."


(18801, 7)

In [141]:
# convierto los id en formato json para que al guardarlos en formato json no me den problema después
for i in range(len(df["_id"])):
    df["_id"].iloc[i] = dumps(df["_id"].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [142]:
# separar los objetos de las listas offices para obtener info de lat, long y cities
df = df.explode("offices")
display(df.head(),df.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4..."
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation..."
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4..."
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave..."
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette..."


(21762, 7)

In [143]:
#función para obtener los datos de ciudad donde se encuentran las oficinas
def getCity(row):
    of=row.offices
    if type(of) == dict:
        if "city" in of:
            if of["city"]:
                return of["city"]
            else:
                return "NoCity"
        else:
            return None
    else:
        return "NoOffice"

city = pd.DataFrame(df.apply(getCity, axis=1))
city.columns=["city"]
city.head()

Unnamed: 0,city
0,Pleasanton
1,Culver City
2,Pleasanton
3,Seattle
3,New York


In [144]:
city_processed = pd.concat([df,city], axis=1)
display(city_processed.head(),city_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices,city
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation...",Culver City
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave...",Seattle
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette...",New York


(21762, 8)

In [145]:
# transform office object into GeoPoint for office
def officeToGeoPoint(row):
    office = row.offices
    if type(office) == dict:
        if 'latitude' in office and 'longitude' in office:
            if office["latitude"] and office["longitude"]:
                return ({
                    "type":"Point",
                    "coordinates":[float(office["longitude"]),float(office["latitude"])]
                },"success")
            else:
                return(None,"Invalid lat and long")
        else:
            return (None,"No lat and long keys in office dict")
    return (None,"No office")

In [146]:
cleaned_offices = city_processed.apply(officeToGeoPoint, axis=1, result_type="expand")
cleaned_offices.columns = ["office","state"]
display(cleaned_offices.head(),cleaned_offices.shape)

Unnamed: 0,office,state
0,"{'type': 'Point', 'coordinates': [-121.904945,...",success
1,"{'type': 'Point', 'coordinates': [-118.379768,...",success
2,"{'type': 'Point', 'coordinates': [-121.904945,...",success
3,"{'type': 'Point', 'coordinates': [-122.333253,...",success
3,"{'type': 'Point', 'coordinates': [-73.9964312,...",success


(21762, 2)

In [147]:
company_processed = pd.concat([city_processed,cleaned_offices], axis=1)
display(company_processed.head(),company_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices,city,office,state
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",success
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation...",Culver City,"{'type': 'Point', 'coordinates': [-118.379768,...",success
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",success
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave...",Seattle,"{'type': 'Point', 'coordinates': [-122.333253,...",success
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette...",New York,"{'type': 'Point', 'coordinates': [-73.9964312,...",success


(21762, 10)

In [148]:
# voy a limpiar el dato de total_money_raised
#print(list(company_processed["total_money_raised"].value_counts().index))
#El valor de "total_money_raised" está expresado en diferentes monedas y unidades. Voy a ver qué tengo antes de limpiar
def getTypeMoney(value):
    if re.search(r"\d+\.*\d*",value):
        return "".join(value.split((re.search(r"\d+\.*\d*",value)).group()))
company_processed["total_money_raised_cleaning"] = company_processed["total_money_raised"].apply(getTypeMoney)

In [149]:
company_processed["total_money_raised_cleaning"].value_counts()

$      15527
$M      5059
$k       705
€M       201
€k       104
£M        80
£k        37
C$M       19
$B        15
C$k        9
¥M         3
¥B         2
krM        1
Name: total_money_raised_cleaning, dtype: int64

In [150]:
# voy a transformar estos tipos de moneda a $ y todo a la misma unidad (k)
# todas las empresas con moneda ¥ son japonesas, así que se refiere a yenes
def getMoney(row):
    dic_money = {r"^C\$\d+":["C$",0.71],r"^\$\d+":["$",1],r"^\£\d+":["£",1.25],r"^\€\d+":["€",1.09],r"^kr\d+":["kr",0.1],
                r"^\¥\d+":["¥",0.0093]}
    value = row["total_money_raised"]
    if value:
        for key in dic_money.keys():
            if re.search(key,value):
                value = value.split(dic_money[key][0])[1]
                if "k" in value:
                    return (round(float(value.split("k")[0])*(dic_money[key][1]),4),"success")
                elif "M" in value:
                    return (round(float(value.split("M")[0])*(dic_money[key][1])*1000,4),"success")
                elif "B" in value:
                    return (round(float(value.split("B")[0])*(dic_money[key][1])*1000000,4),"success")
                else:
                    return (round(float(value)*(dic_money[key][1])/1000,4),"success2")
    else:
        return (None, "fail")

    
money_conversion = company_processed.apply(getMoney,axis=1,result_type="expand")
money_conversion.columns=["total_money_raised ($k)","money_state"]
money_conversion.head()

Unnamed: 0,total_money_raised ($k),money_state
0,0.0,success2
1,0.0,success2
2,0.0,success2
3,39800.0,success
3,39800.0,success


In [151]:
money_conversion.shape

(21762, 2)

In [152]:
money_conversion.money_state.value_counts()

success2    15527
success      6235
Name: money_state, dtype: int64

In [153]:
company_processed = pd.concat([company_processed,money_conversion], axis=1)
display(company_processed.head(),company_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices,city,office,state,total_money_raised_cleaning,total_money_raised ($k),money_state
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",success,$,0.0,success2
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation...",Culver City,"{'type': 'Point', 'coordinates': [-118.379768,...",success,$,0.0,success2
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",success,$,0.0,success2
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave...",Seattle,"{'type': 'Point', 'coordinates': [-122.333253,...",success,$M,39800.0,success
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette...",New York,"{'type': 'Point', 'coordinates': [-73.9964312,...",success,$M,39800.0,success


(21762, 13)

In [154]:
company_processed = company_processed.drop(columns=["total_money_raised_cleaning","offices","total_money_raised"])


In [155]:
display(company_processed.head(),company_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,city,office,state,total_money_raised ($k),money_state
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",success,0.0,success2
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,Culver City,"{'type': 'Point', 'coordinates': [-118.379768,...",success,0.0,success2
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",success,0.0,success2
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,Seattle,"{'type': 'Point', 'coordinates': [-122.333253,...",success,39800.0,success
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,New York,"{'type': 'Point', 'coordinates': [-73.9964312,...",success,39800.0,success


(21762, 10)

In [156]:
# para empezar a filtrar voy a centrarme en los datos de tener cerca un compañía dedicada a design y tener cerca startups in tech
company_processed["city"][company_processed["category_code"]=="design"].value_counts()

London           1
Berlin           1
San Francisco    1
Brooklyn         1
Ellensburg       1
Collingwood      1
Name: city, dtype: int64

In [157]:
# teniendo en cuenta que el mayor valor de founded_year es 2013, para seleccionar las startups voy a suponer que este analisis 
# se está realizando en 2014. Por lo tanto, selecciono solo aquellas cuyo founded_year sea >= 2010 (3 años antes de 2013)
# y las startups deben ser de technologies y haber alcanzado un valor de 1M$ o más

company_processed["city"][((company_processed["category_code"]=="web")|(company_processed["category_code"]== "software")|
                                      (company_processed["category_code"]=="games_video")|(company_processed["category_code"]=="mobile")|
                                      (company_processed["category_code"]=="network_hosting")|(company_processed["category_code"]=="cleantech")|
                                      (company_processed["category_code"]=="biotech")|(company_processed["category_code"]=="photo_video")|
                                      (company_processed["category_code"]=="hardware")|(company_processed["category_code"]=="messaging"))
                         & (company_processed["founded_year"]>=2010) & (company_processed["total_money_raised ($k)"]>=float(1000))].value_counts()

San Francisco          3
Denver                 2
Scottsdale             2
NoOffice               2
Hopkinton              2
Los Angeles            1
Tel Aviv               1
South San Francisco    1
Waterloo, ON           1
Beverly Hills          1
Fremont                1
NoCity                 1
Santa Clara            1
Santa Monica           1
San Jose               1
London                 1
San Mateo              1
Palo Alto              1
Los Altos              1
New York               1
SOUTH BOSTON           1
Bangalore              1
Name: city, dtype: int64

Parece que San Francisco es la ciudad que reune los requisitos: startups tech con éxito y una empresa dedicada a design.
Guardo el json solo con las office de San Francisco.

In [158]:
company_processed["city"].value_counts()

NoOffice           5057
San Francisco       906
New York            837
NoCity              746
London              616
                   ... 
Lyngby                1
Firenze               1
Stavanger             1
Mangalore             1
Reuil-Malmaison       1
Name: city, Length: 3126, dtype: int64

In [159]:
companiesSF = company_processed[company_processed["city"]=="San Francisco"]
display(companiesSF.head(),companiesSF.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,city,office,state,total_money_raised ($k),money_state
6,"{""$oid"": ""52cdef7c4bab8bd675297d8d""}",Digg,news,60.0,2004.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.394523,...",success,45000.0,success
8,"{""$oid"": ""52cdef7c4bab8bd675297d97""}",Scribd,news,50.0,2007.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.404052,...",success,25800.0,success
14,"{""$oid"": ""52cdef7c4bab8bd675297d95""}",StumbleUpon,web,,2002.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.419204,...",success,18500.0,success
22,"{""$oid"": ""52cdef7c4bab8bd675297d94""}",Twitter,social,1300.0,2006.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.4169244...",success,1160000.0,success
24,"{""$oid"": ""52cdef7c4bab8bd675297da4""}",Powerset,search,60.0,2006.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.395289,...",success,22500.0,success


(906, 10)

In [160]:
#borro el indice porque mongo detecta indices iguales y elimina esos documentos del dataset
companiesSF_noIndex = companiesSF.drop(columns="_id")

In [161]:
companiesSF_noIndex

Unnamed: 0,name,category_code,number_of_employees,founded_year,city,office,state,total_money_raised ($k),money_state
6,Digg,news,60.0,2004.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.394523,...",success,45000.0,success
8,Scribd,news,50.0,2007.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.404052,...",success,25800.0,success
14,StumbleUpon,web,,2002.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.419204,...",success,18500.0,success
22,Twitter,social,1300.0,2006.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.4169244...",success,1160000.0,success
24,Powerset,search,60.0,2006.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.395289,...",success,22500.0,success
...,...,...,...,...,...,...,...,...,...
18396,Grassroots,public_relations,,,San Francisco,"{'type': 'Point', 'coordinates': [-122.40185, ...",success,0.0,success2
18523,DJ Nitrogen,games_video,8.0,2007.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.4494082...",success,0.0,success2
18526,G5 Entertainment,games_video,107.0,2006.0,San Francisco,,Invalid lat and long,0.0,success2
18587,Embarcadero Technologies,software,450.0,1993.0,San Francisco,"{'type': 'Point', 'coordinates': [-122.398062,...",success,0.0,success2


In [162]:
#voy a gusradra este dataFrame en json, ya que una vez que elija la ciudad empezaré a hacer las queries con ella
companiesSF_noIndex.to_json("../output/companiesSF.json",orient="records")

In [163]:
companiesSF_noIndex.to_csv("../output/dfcompaniesSF.csv")