In [17]:
import findspark

findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql import functions as sf
import pymongo

In [18]:
# Obtain the Spark session through the builder so this cell is safe to re-run:
# constructing pyspark.SparkContext(...) directly raises
# "Cannot run multiple SparkContexts at once" when a context is already live,
# whereas getOrCreate() reuses the running one (or starts it on a fresh kernel).
spark = SparkSession.builder.appName("MongoDB injection").getOrCreate()

# Keep the context available under its original name; the final cell uses it
# to shut Spark down.
sc = spark.sparkContext

In [19]:
# HDFS directory that holds the tab-separated input files.
hdfs_path = "hdfs://localhost:54310/user/ubuntu/map_reduce/joint_dataset/"

# DDL-style schema: every column is read as a plain string at this stage.
schema = (
    "Title string, Counter string, Authors string, Avg_Rating string, "
    "ISBN13 string, Num_Pages string, Ratings_Count string, "
    "Text_Reviews_Count string, Publisher string, Publication_Date string"
)

# Headerless, tab-delimited read with the explicit schema above; quote and
# escape are both passed as empty strings and no type inference is requested.
df = spark.read.csv(
    hdfs_path,
    schema=schema,
    sep="\t",
    quote='',
    escape='',
    header=False,
    inferSchema=False,
)

# Remove backslashes, square brackets and double quotes from every column,
# keeping each column under its original name.
df = df.select(
    *(sf.regexp_replace(name, r'\\|\[|\]|\"', '').alias(name) for name in df.columns)
)

In [20]:
# Collect the Spark frame on the driver as a pandas DataFrame.
# The former `df.sample(1.0)` was dropped: a fraction of 1.0 without
# replacement keeps each row with probability 1, so it only added a no-op
# sampling stage in front of the collect.
df_pandas = df.toPandas()

# Trim leading/trailing whitespace in every object-dtype (text) column.
df_obj = df_pandas.select_dtypes('object')
df_pandas[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

# Target dtypes, split by kind so the numeric columns can be pre-cleaned together.
text_dtypes = {"Title": 'string', "Authors": 'string', "ISBN13": 'string', "Publisher": 'string', "Publication_Date": 'string'}
numeric_dtypes = {"Counter": 'int', "Avg_Rating": 'float', "Num_Pages": 'int', "Ratings_Count": 'int', "Text_Reviews_Count": 'int'}

# A blank string cannot be cast to int/float, so map it to '0' first.
# Previously only Avg_Rating and Num_Pages were guarded this way; a blank in
# Counter, Ratings_Count or Text_Reviews_Count would make astype() raise.
numeric_cols = list(numeric_dtypes)
df_pandas[numeric_cols] = df_pandas[numeric_cols].replace('', '0')

# Cast every column to its final dtype in one pass.
df_pandas = df_pandas.astype({**text_dtypes, **numeric_dtypes})

print(df_pandas.dtypes)

df_pandas.head()

Title                 string[python]
Counter                        int64
Authors               string[python]
Avg_Rating                   float64
ISBN13                string[python]
Num_Pages                      int64
Ratings_Count                  int64
Text_Reviews_Count             int64
Publisher             string[python]
Publication_Date      string[python]
dtype: object


Unnamed: 0,Title,Counter,Authors,Avg_Rating,ISBN13,Num_Pages,Ratings_Count,Text_Reviews_Count,Publisher,Publication_Date
0,$30 film school: how to write direct produce...,1,Michael W. Dean,3.49,9781592000678,528,30,4,Cengage Learning,2003-05-13
1,1 000 places to see before you die,1,Patricia Schultz,3.85,9780761104841,992,36303,439,Workman Publishing Company,2003-05-22
2,10 lb penalty,1,Dick Francis,3.9,9780425197455,320,3490,177,G.P. Putnam's Sons,2004-08-03
3,100 great fantasy short short stories,1,Isaac Asimov/Terry Carr/Martin H. Greenberg/Ja...,3.91,9780380699179,395,204,19,Avon,1985-08-01
4,100 love sonnets,1,Pablo Neruda/Stephen Tapscott,4.39,9780292760288,232,12613,392,University of Texas Press,1986-01-01


In [21]:
import reprlib
from pprint import pprint
import json

# Size-limited repr, used only to keep the two previews below bounded.
r = reprlib.Repr()
r.maxlist = 100
r.maxstring = 10000
r.maxdict = 1000

# Target: collection "books" in database "goodreads_db" on the local MongoDB server.
my_client = pymongo.MongoClient("mongodb://localhost:27017/")
my_db = my_client["goodreads_db"]
my_collection = my_db["books"]

# One dict per DataFrame row, keyed by column name.
my_dictionary = df_pandas.to_dict(orient='records')

# Serialize the records to JSON text (non-ASCII characters are kept as-is)
# and show a truncated preview of the string.
json_str = json.dumps(my_dictionary, ensure_ascii=False)
pprint(r.repr(json_str))

# Parse the text back into a list of plain dicts and preview that as well.
json_obj = json.loads(json_str)
pprint(r.repr(json_obj))

# insert_many() rejects an empty document list with a TypeError, so only
# write when there is at least one record; `result` is None otherwise.
# NOTE(review): the documents carry no _id, so re-running this cell appends
# the same records again — confirm whether that is intended.
if json_obj:
    result = my_collection.insert_many(json_obj)
else:
    result = None
    print("No records to insert; collection left unchanged.")

('\'[{"Title": "$30 film school: how to write  direct  produce  shoot  edit  '
 'distribute  tour with  and sell your own no-budget digital movie", '
 '"Counter": 1, "Authors": "Michael W. Dean", "Avg_Rating": 3.49, "ISBN13": '
 '"9781592000678", "Num_Pages": 528, "Ratings_Count": 30, '
 '"Text_Reviews_Count": 4, "Publisher": "Cengage Learning", '
 '"Publication_Date": "2003-05-13"}, {"Title": "1 000 places to see before you '
 'die", "Counter": 1, "Authors": "Patricia Schultz", "Avg_Rating": 3.85, '
 '"ISBN13": "9780761104841", "Num_Pages": 992, "Ratings_Count": 36303, '
 '"Text_Reviews_Count": 439, "Publisher": "Workman Publishing Company", '
 '"Publication_Date": "2003-05-22"}, {"Title": "10 lb penalty", "Counter": 1, '
 '"Authors": "Dick Francis", "Avg_Rating": 3.9, "ISBN13": "9780425197455", '
 '"Num_Pages": 320, "Ratings_Count": 3490, "Text_Reviews_Count": 177, '
 '"Publisher": "G.P. Putnam\\\'s Sons", "Publication_Date": "2004-08-03"}, '
 '{"Title": "100 great fantasy short shor

In [22]:
# Release the MongoDB client opened for the load (it was previously left open
# for the lifetime of the kernel), and always stop the Spark context
# afterwards, even if closing the client fails.
try:
    my_client.close()
finally:
    sc.stop()