# MapReduce Publicaciones

In [2]:
###############################################################
#!/usr/bin/python
# Purpose: Run MapReduce-style aggregation reports over the `publicaciones` collection in MongoDB
# Dependencies : python-pip , pymongo , mysqldb
# apt-get install python-pip ; pip install pymongo;
# apt-get install python-MySQLdb


import pymongo
import datetime
import enum
from pprint import pprint

In [3]:
########################################################################################
# Connection settings: server URI, database name, and a collection name
# (`n_collection` is not referenced by the lines of this cell).
mongodb_host = "mongodb://localhost:27017"
mongodb_dbname = "migrate_bda"
n_collection = "publicaciones"

# Create the client for the configured URI and select the working database by name.
mclient = pymongo.MongoClient(mongodb_host)
mdb = mclient.get_database(mongodb_dbname)

In [4]:
# Fetch the names of the collections in the selected database and print them.
# NOTE(review): `collections` is also the name of a standard-library module; the
# import cell does not import it, but keep the overlap in mind if that changes.
collections = mdb.list_collection_names()
print(collections)

['graduaciones', 'publicaciones', 'out_num_publicaciones_x_institucion', 'out_num_general_publications']


In [5]:
# Bind the publications collection through the `n_collection` setting from the
# configuration cell instead of repeating the literal name here.
publicaciones = mdb[n_collection]

# Preview a single document. `find_one()` returns None when the collection is
# empty, whereas indexing the cursor with `find()[0]` raises IndexError.
pprint(publicaciones.find_one())

{'_id': ObjectId('662355df7b6690d646518e76'),
 'date_id': '1994-10-01',
 'egresado_id': {'egresado_id': 60, 'sede_nombre': 'ORINOQUIA'},
 'institucion_id': None,
 'publicacion_id': {'ano_publiacion': '1994-10-01',
                    'descripcion': 'Investment specific general thousand child '
                                   'hundred group only. Certainly wrong trip '
                                   'forward college. Different couple door '
                                   'total yeah it break remember. Within poor '
                                   'care interview describe.',
                    'publicacion_id': 489,
                    'tipo_publicacion': 'Recently.'},
 'registration_id': 1,
 'revista_id': {'revista_id': 1342, 'revista_nombre': 'Attention.'}}


In [6]:
# General publication counts.
# Groups the `publicaciones` documents by their `publicacion_id` field, counts the
# documents in each group and sends the result to `out_num_general_publications`.
# NOTE(review): the original heading mentions publication *types*, but the grouping
# key is the whole `publicacion_id` value, not `publicacion_id.tipo_publicacion`
# -- confirm which one is intended.

stage_group_general_publications = {
    "$group": {
        "_id": "$publicacion_id",
        "count_general_publicacions": {"$sum": 1},
    },
}
stage_out_general_publications = {"$out": "out_num_general_publications"}

pipeline = [stage_group_general_publications, stage_out_general_publications]
results = publicaciones.aggregate(pipeline)

# Read the output collection back; its name is taken from the `$out` stage so the
# write target and the read target cannot drift apart.
general_publications_out = mdb[stage_out_general_publications["$out"]]
for general_publication_count in general_publications_out.find():
    print(general_publication_count)

{'_id': {'publicacion_id': 978, 'tipo_publicacion': 'Director.', 'ano_publiacion': '2007-08-06', 'descripcion': 'Shake act the position beat. Produce civil since dog goal skill side. Fill after participant brother catch carry statement red. If unit window note phone.'}, 'count_general_publicacions': 14}
{'_id': {'publicacion_id': 44, 'tipo_publicacion': 'Know put.', 'ano_publiacion': '1992-04-08', 'descripcion': 'Even nearly magazine new sit pressure suffer. Eye whose interest man best. Heart hear body everybody expert husband. Marriage base expert enter. Technology toward federal floor one bag.'}, 'count_general_publicacions': 12}
{'_id': {'publicacion_id': 800, 'tipo_publicacion': 'Better.', 'ano_publiacion': '1975-05-22', 'descripcion': 'Mention fire probably. Today then value idea. Among seven against different medical international seem. Thank table suffer surface special share peace. Treatment oil family.'}, 'count_general_publicacions': 5}
{'_id': {'publicacion_id': 660, 'tipo_p

In [7]:
# Number of publications per institution.
# Groups the `publicaciones` documents by `institucion_id.institucion_nombre`,
# counts the documents in each group and sends the result to
# `out_num_publicaciones_x_institucion`.
# NOTE(review): the sample document printed earlier has `institucion_id` set to
# None; such documents are presumably counted under a null `_id` -- confirm
# whether they should be excluded.

stage_group_institution_number_publications = {
    "$group": {
        "_id": "$institucion_id.institucion_nombre",
        "count_publicaciones_x_institucion": {"$sum": 1},
    },
}
stage_out_institution_number_publications = {
    "$out": "out_num_publicaciones_x_institucion",
}

pipeline = [
    stage_group_institution_number_publications,
    stage_out_institution_number_publications,
]
results = publicaciones.aggregate(pipeline)

# Read the output collection back, using the name stored in the `$out` stage.
institution_counts_out = mdb[stage_out_institution_number_publications["$out"]]
for institution_count in institution_counts_out.find():
    print(institution_count)

{'_id': 'Lucas-Hill', 'count_publicaciones_x_institucion': 4}
{'_id': 'Miller, Valenzuela and Rogers', 'count_publicaciones_x_institucion': 1}
{'_id': 'Gonzalez LLC', 'count_publicaciones_x_institucion': 1}
{'_id': 'Ward, Smith and Moreno', 'count_publicaciones_x_institucion': 2}
{'_id': 'Dixon Inc', 'count_publicaciones_x_institucion': 1}
{'_id': 'Valencia, Wilson and Moran', 'count_publicaciones_x_institucion': 1}
{'_id': 'Cuevas and Sons', 'count_publicaciones_x_institucion': 1}
{'_id': 'Beard Group', 'count_publicaciones_x_institucion': 1}
{'_id': 'Boyd Group', 'count_publicaciones_x_institucion': 1}
{'_id': 'Leonard, Mitchell and Henderson', 'count_publicaciones_x_institucion': 1}
{'_id': 'Olson, Walters and Ray', 'count_publicaciones_x_institucion': 2}
{'_id': 'Wilkins-Johnson', 'count_publicaciones_x_institucion': 3}
{'_id': 'Williams, Foster and Williams', 'count_publicaciones_x_institucion': 1}
{'_id': 'Harper Ltd', 'count_publicaciones_x_institucion': 2}
{'_id': 'Davidson-Lee

In [8]:
# Publications per campus ("sede").
# Groups the `publicaciones` documents by `egresado_id.sede_nombre`, counts the
# documents in each group and sends the result to `out_num_publications_by_sede`.

stage_group_publication_by_sede = {
    "$group": {
        "_id": "$egresado_id.sede_nombre",
        "count_publications_by_sedes": {"$sum": 1},
    },
}
stage_out_publications_by_sede = {"$out": "out_num_publications_by_sede"}

pipeline = [stage_group_publication_by_sede, stage_out_publications_by_sede]
results = publicaciones.aggregate(pipeline)

# Read the output collection back, using the name stored in the `$out` stage.
sede_counts_out = mdb[stage_out_publications_by_sede["$out"]]
for sede_count in sede_counts_out.find():
    print(sede_count)

{'_id': 'PALMIRA', 'count_publications_by_sedes': 216}
{'_id': 'TUMACO Y DE LA PAZ', 'count_publications_by_sedes': 199}
{'_id': 'MANIZALES', 'count_publications_by_sedes': 200}
{'_id': 'ORINOQUIA', 'count_publications_by_sedes': 190}
{'_id': 'MEDELLÍN', 'count_publications_by_sedes': 193}
{'_id': 'AMAZONIA', 'count_publications_by_sedes': 200}
{'_id': 'CARIBE', 'count_publications_by_sedes': 209}
{'_id': 'BOGOTÁ', 'count_publications_by_sedes': 182}


In [9]:
# Publications per year.
# Converts `date_id` with `$toDate`, extracts its year with `$year`, groups the
# `publicaciones` documents by that year, counts the documents in each group and
# sends the result to `out_num_publications_by_year`.

stage_group_year = {
    "$group": {
        "_id": {"$year": {"$toDate": "$date_id"}},
        "count_publications_by_year": {"$sum": 1},
    },
}
stage_out_year = {"$out": "out_num_publications_by_year"}

pipeline = [stage_group_year, stage_out_year]
results = publicaciones.aggregate(pipeline)

# Read the output collection back, using the name stored in the `$out` stage.
year_counts_out = mdb[stage_out_year["$out"]]
for year_count in year_counts_out.find():
    print(year_count)

{'_id': 2022, 'count_egresado': 6}
{'_id': 1972, 'count_egresado': 33}
{'_id': 2011, 'count_egresado': 16}
{'_id': 2000, 'count_egresado': 8}
{'_id': 1973, 'count_egresado': 4}
{'_id': 1988, 'count_egresado': 48}
{'_id': 2021, 'count_egresado': 35}
{'_id': 1976, 'count_egresado': 10}
{'_id': 2019, 'count_egresado': 46}
{'_id': 2006, 'count_egresado': 26}
{'_id': 2017, 'count_egresado': 74}
{'_id': 1995, 'count_egresado': 12}
{'_id': 2008, 'count_egresado': 28}
{'_id': 1986, 'count_egresado': 19}
{'_id': 2003, 'count_egresado': 33}
{'_id': 1979, 'count_egresado': 32}
{'_id': 2007, 'count_egresado': 41}
{'_id': 1987, 'count_egresado': 20}
{'_id': 1974, 'count_egresado': 46}
{'_id': 1989, 'count_egresado': 26}
{'_id': 1978, 'count_egresado': 30}
{'_id': 1981, 'count_egresado': 21}
{'_id': 1994, 'count_egresado': 18}
{'_id': 1984, 'count_egresado': 49}
{'_id': 1991, 'count_egresado': 12}
{'_id': 2014, 'count_egresado': 52}
{'_id': 1992, 'count_egresado': 37}
{'_id': 1999, 'count_egresado':