In [1]:
import pandas as pd
from pymongo import MongoClient
from bson.json_util import dumps
import re

In [2]:
client = MongoClient("mongodb://localhost/datamad0320")
db = client.get_database()

In [3]:
# Voy a limpiar la base de datos companies
all_companies = db.companies.find({},{"name":1,"offices":1,"total_money_raised":1,"number_of_employees":1,
                                   "founded_year":1,"category_code":1})

In [4]:
df = pd.DataFrame(all_companies)
display(df.head(),df.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297d8b,AdventNet,enterprise,600.0,1996.0,$0,"[{'description': 'Headquarters', 'address1': '..."
1,52cdef7c4bab8bd675297d92,Flektor,games_video,,,$0,"[{'description': None, 'address1': '8536 Natio..."
2,52cdef7c4bab8bd675297d8c,Zoho,software,1600.0,2005.0,$0,"[{'description': 'Headquarters', 'address1': '..."
3,52cdef7c4bab8bd675297d8a,Wetpaint,web,47.0,2005.0,$39.8M,"[{'description': '', 'address1': '710 - 2nd Av..."
4,52cdef7c4bab8bd675297d93,Fox Interactive Media,web,0.0,1979.0,$0,"[{'description': '', 'address1': '407 N Maple ..."


(18801, 7)

In [5]:
# convierto los id en formato json para que al guardarlos en formato json no me den problema después
for i in range(len(df["_id"])):
    df["_id"].iloc[i] = dumps(df["_id"].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
# separar los objetos de las listas offices para obtener info de lat, long y cities
df = df.explode("offices")
display(df.head(),df.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4..."
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation..."
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4..."
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave..."
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette..."


(21762, 7)

In [8]:
#función para obtener los datos de ciudad donde se encuentran las oficinas
def getCity(row):
    of=row.offices
    if type(of) == dict:
        if "city" in of:
            if of["city"]:
                return of["city"]
            else:
                return "NoCity"
        else:
            return None
    else:
        return "NoOffice"

city = pd.DataFrame(df.apply(getCity, axis=1))
city.columns=["city"]
city.head()

Unnamed: 0,city
0,Pleasanton
1,Culver City
2,Pleasanton
3,Seattle
3,New York


In [9]:
city_processed = pd.concat([df,city], axis=1)
display(city_processed.head(),city_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices,city
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation...",Culver City
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave...",Seattle
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette...",New York


(21762, 8)

In [10]:
# transform office object into GeoPoint for office
def officeToGeoPoint(row):
    office = row.offices
    if type(office) == dict:
        if 'latitude' in office and 'longitude' in office:
            if office["latitude"] and office["longitude"]:
                return ({
                    "type":"Point",
                    "coordinates":[float(office["longitude"]),float(office["latitude"])]
                },"success")
            else:
                return(None,"Invalid lat and long")
        else:
            return (None,"No lat and long keys in office dict")
    return (None,"No office")

In [11]:
cleaned_offices = city_processed.apply(officeToGeoPoint, axis=1, result_type="expand")
cleaned_offices.columns = ["office","state"]
display(cleaned_offices.head(),cleaned_offices.shape)

Unnamed: 0,office,state
0,"{'type': 'Point', 'coordinates': [-121.904945,...",success
1,"{'type': 'Point', 'coordinates': [-118.379768,...",success
2,"{'type': 'Point', 'coordinates': [-121.904945,...",success
3,"{'type': 'Point', 'coordinates': [-122.333253,...",success
3,"{'type': 'Point', 'coordinates': [-73.9964312,...",success


(21762, 2)

In [12]:
company_processed = pd.concat([city_processed,cleaned_offices], axis=1)
display(company_processed.head(),company_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices,city,office,state
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",success
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation...",Culver City,"{'type': 'Point', 'coordinates': [-118.379768,...",success
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4...",Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",success
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave...",Seattle,"{'type': 'Point', 'coordinates': [-122.333253,...",success
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette...",New York,"{'type': 'Point', 'coordinates': [-73.9964312,...",success


(21762, 10)

In [30]:
# voy a limpiar el dato de total_money_raised
#print(list(company_processed["total_money_raised"].value_counts().index))
#El valor de "total_money_raised" está expresado en diferentes monedas y unidades. Voy a ver qué tengo antes de limpiar
def getTypeMoney(value):
    if re.search(r"\d+\.*\d*",value):
        return "".join(value.split((re.search(r"\d+\.*\d*",value)).group()))
company_processed["total_money_raised_cleaning"] = company_processed["total_money_raised"].apply(getTypeMoney)

In [31]:
company_processed["total_money_raised_cleaning"].value_counts()

$      15527
$M      5059
$k       705
€M       201
€k       104
£M        80
£k        37
C$M       19
$B        15
C$k        9
¥M         3
¥B         2
krM        1
Name: total_money_raised_cleaning, dtype: int64