In [1]:
import pyspark

In [2]:
import re
import pandas as pd

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import from_json

In [4]:
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType

In [5]:
from pyspark.sql.functions import udf, col

In [6]:
pd.set_option('display.max_columns', None)

In [7]:
spark = SparkSession \
    .builder \
    .appName("Spark_data_clean") \
    .config("spark.master", "spark://rayiMac.modem:7077") \
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:3.2.2_0.18.0") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.num", "1") \
    .getOrCreate()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/ray/.ivy2/cache
The jars for the packages stored in: /Users/ray/.ivy2/jars
com.crealytics#spark-excel_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9ba8bc30-4758-4ae5-a35c-36ae518a7927;1.0
	confs: [default]
	found com.crealytics#spark-excel_2.12;3.2.2_0.18.0 in central
	found org.apache.poi#poi;5.2.2 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.commons#commons-collections4;4.4 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found commons-io#commons-io;2.11.0 in central
	found com.zaxxer#SparseBitSet;1.2 in central
	found org.apache.logging.log4j#log4j-api;2.17.2 in central
	found org.apache.poi#poi-ooxml;5.2.2 in central
	found org.apache.poi#poi-ooxml-lite;5.2.2 in central
	found org.apache.xmlbeans#xmlbeans;5.0.3 in central
	found org.apache.commons#commons-compress;1.21 in central
	found com.github.virtuald#curvesapi;1.07 in central
	found com.norbi

In [8]:
df_keywords = spark.read.csv("keywords.csv", header=True, inferSchema=True)

                                                                                

In [9]:
df_keywords.show()

[Stage 2:>                                                          (0 + 1) / 1]

+-----+--------------------+
|   id|            keywords|
+-----+--------------------+
|  862|[{'id': 931, 'nam...|
| 8844|"[{'id': 10090, '...|
|15602|[{'id': 1495, 'na...|
|31357|[{'id': 818, 'nam...|
|11862|[{'id': 1009, 'na...|
|  949|[{'id': 642, 'nam...|
|11860|[{'id': 90, 'name...|
|45325|                  []|
| 9091|[{'id': 949, 'nam...|
|  710|[{'id': 701, 'nam...|
| 9087|[{'id': 833, 'nam...|
|12110|[{'id': 3633, 'na...|
|21032|[{'id': 1994, 'na...|
|10858|[{'id': 840, 'nam...|
| 1408|[{'id': 911, 'nam...|
|  524|[{'id': 383, 'nam...|
| 4584|[{'id': 420, 'nam...|
|    5|"[{'id': 612, 'na...|
| 9273|[{'id': 409, 'nam...|
|11517|[{'id': 380, 'nam...|
+-----+--------------------+
only showing top 20 rows



                                                                                

In [10]:
# Complex Schema
keywords_schema = ArrayType(
    StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True)
    ])
)

In [11]:
# convert keywords
df_keywords = df_keywords.withColumn("keywords", from_json(col("keywords"), keywords_schema))

In [12]:
df_keywords.show()

[Stage 3:>                                                          (0 + 1) / 1]

+-----+--------------------+
|   id|            keywords|
+-----+--------------------+
|  862|[{931, jealousy},...|
| 8844|                null|
|15602|[{1495, fishing},...|
|31357|[{818, based on n...|
|11862|[{1009, baby}, {1...|
|  949|[{642, robbery}, ...|
|11860|[{90, paris}, {38...|
|45325|                  []|
| 9091|[{949, terrorist}...|
|  710|[{701, cuba}, {76...|
| 9087|[{833, white hous...|
|12110|[{3633, dracula},...|
|21032|[{1994, wolf}, {6...|
|10858|[{840, usa presid...|
| 1408|[{911, exotic isl...|
|  524|[{383, poker}, {7...|
| 4584|[{420, bowling}, ...|
|    5|                null|
| 9273|[{409, africa}, {...|
|11517|[{380, brother br...|
+-----+--------------------+
only showing top 20 rows



                                                                                

In [23]:
df_exploded = df_keywords.select("id",explode(col("keywords")).alias("keyword_data"))

In [24]:
df_exploded.show()

+-----+--------------------+
|   id|        keyword_data|
+-----+--------------------+
|  862|     {931, jealousy}|
|  862|         {4290, toy}|
|  862|         {5202, boy}|
|  862|  {6054, friendship}|
|  862|     {9713, friends}|
|  862|     {9823, rivalry}|
|  862|{165503, boy next...|
|  862|   {170722, new toy}|
|  862|{187065, toy come...|
|15602|     {1495, fishing}|
|15602|{12392, best friend}|
|15602|{179431, duringcr...|
|15602|   {208510, old men}|
|31357|{818, based on no...|
|31357|{10131, interraci...|
|31357|{14768, single mo...|
|31357|    {15160, divorce}|
|31357|{33455, chick flick}|
|11862|        {1009, baby}|
|11862|{1599, midlife cr...|
+-----+--------------------+
only showing top 20 rows



In [29]:
df_final = df_exploded.select("id",col("keyword_data.id").alias("keywords_id"), col("keyword_data.name").alias("keywords_name"))

In [30]:
df_final.show()

+-----+-----------+--------------------+
|   id|keywords_id|       keywords_name|
+-----+-----------+--------------------+
|  862|        931|            jealousy|
|  862|       4290|                 toy|
|  862|       5202|                 boy|
|  862|       6054|          friendship|
|  862|       9713|             friends|
|  862|       9823|             rivalry|
|  862|     165503|       boy next door|
|  862|     170722|             new toy|
|  862|     187065|   toy comes to life|
|15602|       1495|             fishing|
|15602|      12392|         best friend|
|15602|     179431|duringcreditsstinger|
|15602|     208510|             old men|
|31357|        818|      based on novel|
|31357|      10131|interracial relat...|
|31357|      14768|       single mother|
|31357|      15160|             divorce|
|31357|      33455|         chick flick|
|11862|       1009|                baby|
|11862|       1599|      midlife crisis|
+-----+-----------+--------------------+
only showing top