# MongoDB sample

In this notebook we take a sample of our preprocessed joint dataset and load it into a MongoDB database through a Spark job.

In [None]:
import findspark

findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql import functions as sf
import pymongo

In [None]:
# Obtain the Spark entry points through getOrCreate(): calling the SparkContext
# constructor directly raises "Cannot run multiple SparkContexts at once" when
# this cell is executed again in a kernel that already has a running context.
spark = SparkSession.builder.appName("MongoDB injection").getOrCreate()
# Keep exposing the underlying SparkContext under the name the last cell stops.
sc = spark.sparkContext

## Load the dataset in Spark

After loading the dataset we sample it in order to save it in MongoDB.

In [None]:
# HDFS directory holding the joint dataset
hdfs_path = "hdfs://localhost:54310/user/ubuntu/map_reduce/joint_dataset/"
# Explicit schema: every column is read as a plain string and cast later on
schema = "Title string, Counter string, Authors string, Avg_Rating string, Num_Pages string, Ratings_Count string, Text_Reviews_Count string, Publisher string, Publication_Date string"

# The files are tab-separated and have no header row. The quote and escape
# characters are set to empty strings (presumably to disable quote handling,
# so that quotes are kept as data and removed by the cleanup step below).
df = spark.read.csv(
    hdfs_path,
    schema=schema,
    sep="\t",
    header=False,
    inferSchema=False,
    quote='',
    escape='',
)

# Preprocessing: remove backslashes, square brackets and double quotes from
# every column, keeping the original column names
cleaned_columns = []
for column_name in df.columns:
    cleaned_column = sf.regexp_replace(column_name, r'\\|\[|\]|\"', '')
    cleaned_columns.append(cleaned_column.alias(column_name))
df = df.select(cleaned_columns)

In [None]:
# Fraction of the rows to keep (1.0 keeps the whole dataset)
sample_rate = 1.0
# Fixed seed so that a sample_rate below 1.0 selects the same rows on every run
sample_seed = 42

# Sampling the dataset and collecting the result as a pandas DataFrame
df_pandas = df.sample(fraction=sample_rate, seed=sample_seed).toPandas()

# Further preprocessing: strip leading/trailing whitespace from the text columns
df_obj = df_pandas.select_dtypes('object')
df_pandas[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

# Replacing voids: a missing value or an empty string cannot be cast to a
# number, so both are replaced with '0' in every column that is cast to
# int/float below (not only in Avg_Rating and Num_Pages)
numeric_columns = ["Counter", "Avg_Rating", "Num_Pages", "Ratings_Count", "Text_Reviews_Count"]
df_pandas[numeric_columns] = df_pandas[numeric_columns].fillna('0').replace('', '0')

# Casting each column to its final type
df_pandas = df_pandas.astype({
    "Title": 'string',
    "Counter": 'int',
    "Authors": 'string',
    "Avg_Rating": 'float',
    "Num_Pages": 'int',
    "Ratings_Count": 'int',
    "Text_Reviews_Count": 'int',
    "Publisher": 'string',
    "Publication_Date": 'string',
})

print(df_pandas.dtypes)

df_pandas.head()

## Connection and insertion of the documents

In this section we connect to the MongoDB instance and then insert the documents, i.e. the books.

In [None]:
import reprlib
from pprint import pprint
import json

# Size-limited repr, used to preview the (potentially very large) payload
# without flooding the cell output
r=reprlib.Repr()
r.maxlist=100
r.maxstring=10000
r.maxdict=1000

# Creation of the collection
my_client = pymongo.MongoClient("mongodb://localhost:27017/")
my_db = my_client["goodreads_db"]
my_collection = my_db["books"]

# Converting the pandas DataFrame to a list of dictionaries, one per row
my_dictionary = df_pandas.to_dict(orient='records')

# We convert the dictionary into a json string to avoid the insertion of escape characters by the insert_many function
json_str = json.dumps(my_dictionary, ensure_ascii=False)
pprint(r.repr(json_str))

json_obj = json.loads(json_str)
pprint(r.repr(json_obj))

# Inserting the documents. insert_many raises a TypeError when it is given an
# empty list, so the insertion is skipped when the sample contains no rows.
if json_obj:
    result = my_collection.insert_many(json_obj)
else:
    result = None
    print("No documents to insert: the sample is empty.")

In [None]:
# Stop the SparkContext held in `sc` now that the documents have been inserted
sc.stop()