In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.functions as func
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import mean, min, max


In [2]:
conf = SparkConf().set("spark.ui.port", "4050")
spark = SparkSession.builder.getOrCreate()



In [3]:
spark



In [4]:
df_movies = spark.read.load('/home/peder/dev/big-data/movie-recommendation-system/data/movies_metadata.csv',
                           format="csv",
                           sep=",",
                           inferSchema="true",
                           header="true")

In [5]:
df_movies.printSchema()


root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nu

In [6]:
print("The shape of the dataset is {:d} rows by {:d} columns".format(df_movies.count(), len(df_movies.columns)))

The shape of the dataset is 45572 rows by 24 columns


In [7]:
print(df_movies.filter(col("vote_average").isNull()).count())
print(df_movies.filter(col("vote_count").isNull()).count())
print(df_movies.filter(col("overview").isNull()).count())

498
389
985


In [8]:
df_movies = df_movies.na.drop(subset=["vote_average"])
df_movies = df_movies.na.drop(subset=["vote_count"])
df_movies = df_movies.na.drop(subset=["overview"])


In [9]:
df_movies = df_movies.withColumn("vote_average", df_movies["vote_average"].cast("double"))
df_movies = df_movies.withColumn("vote_count", df_movies["vote_count"].cast("int"))

In [10]:
df_movies = df_movies.filter((df_movies.vote_average >=0) & (df_movies.vote_average<=10))

In [11]:
df_movies.select(['vote_average']).describe().show()


+-------+------------------+
|summary|      vote_average|
+-------+------------------+
|  count|             40786|
|   mean| 5.612511975530867|
| stddev|1.9231620784205472|
|    min|               0.0|
|    max|              10.0|
+-------+------------------+



In [12]:
df_movies.select(['vote_count']).describe().show()


+-------+-----------------+
|summary|       vote_count|
+-------+-----------------+
|  count|            40760|
|   mean|112.1123405299313|
| stddev|490.1629388596058|
|    min|                0|
|    max|            12269|
+-------+-----------------+



In [13]:
df_movies.select(['overview']).show(truncate=False, n=1)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|overview                                                                                                                                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Led by Woody, Andy's toys live happily in his room until Andy's birthday brings B

In [14]:
def clean_text(df, column_name="content"):
    """
    This fucntion takes the raw text data and apply a standard NLP preprocessing pipeline consisting of the following steps:
      - Text cleaning
      - Tokenization
      - Stopwords removal
      - Stemming (Snowball stemmer)

    parameter: dataframe
    returns: the input dataframe along with the `cleaned_content` column as the results of the NLP preprocessing pipeline

    """
    from pyspark.sql.functions import udf, col, lower, trim, regexp_replace, concat_ws
    from pyspark.ml.feature import Tokenizer, StopWordsRemover
    from nltk.stem.snowball import SnowballStemmer

    # Text preprocessing pipeline
    print("***** Text Preprocessing Pipeline *****\n")

    # 1. Text cleaning
    print("# 1. Text Cleaning\n")
    # 1.a Case normalization
    print("1.a Case normalization:")
    lower_case_news_df = df.select("id", lower(col(column_name)).alias(column_name))
    lower_case_news_df.show(10)
    # 1.b Trimming
    print("1.b Trimming:")
    trimmed_news_df = lower_case_news_df.select("id", trim(col(column_name)).alias(column_name))
    trimmed_news_df.show(10)
    # 1.c Filter out punctuation symbols
    print("1.c Filter out punctuation:")
    no_punct_news_df = trimmed_news_df.select("id", (regexp_replace(col(column_name), "[^a-zA-Z\\s]", "")).alias(column_name))
    no_punct_news_df.show(10)
    # 1.d Filter out any internal extra whitespace
    print("1.d Filter out extra whitespaces:")
    cleaned_news_df = no_punct_news_df.select("id", trim(regexp_replace(col(column_name), " +", " ")).alias(column_name))

    # 2. Tokenization (split text into tokens)
    print("# 2. Tokenization:")
    tokenizer = Tokenizer(inputCol=column_name, outputCol="tokens")
    tokens_df = tokenizer.transform(cleaned_news_df)

    print("# 3. Stopwords removal:")
    stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="terms")
    terms_df = stopwords_remover.transform(tokens_df)

    print("# 4. Stemming:")
    stemmer = SnowballStemmer(language="english")
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
    terms_stemmed_df = terms_df.withColumn("terms_stemmed", stemmer_udf("terms"))
    
    print("# 5. untokenize")
    terms_joined_df = terms_stemmed_df.withColumn("terms_join", concat_ws(" ", "terms_stemmed"))
    return terms_joined_df

In [15]:
df_clean = clean_text(df_movies, 'overview')

***** Text Preprocessing Pipeline *****

# 1. Text Cleaning

1.a Case normalization:
+-----+--------------------+
|   id|            overview|
+-----+--------------------+
|  862|led by woody, and...|
| 8844|when siblings jud...|
|15602|a family wedding ...|
|11862|just when george ...|
|  949|obsessive master ...|
|11860|an ugly duckling ...|
|45325|a mischievous you...|
| 9091|international act...|
|  710|james bond must u...|
| 9087|widowed u.s. pres...|
+-----+--------------------+
only showing top 10 rows

1.b Trimming:
+-----+--------------------+
|   id|            overview|
+-----+--------------------+
|  862|led by woody, and...|
| 8844|when siblings jud...|
|15602|a family wedding ...|
|11862|just when george ...|
|  949|obsessive master ...|
|11860|an ugly duckling ...|
|45325|a mischievous you...|
| 9091|international act...|
|  710|james bond must u...|
| 9087|widowed u.s. pres...|
+-----+--------------------+
only showing top 10 rows

1.c Filter out punctuation:
+-----+--

In [16]:
df_clean.select('terms_stemmed').show(truncate=False, n=1)


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|terms_stemmed                                                                                                                                                                                                                     |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[led, woodi, andi, toy, live, happili, room, andi, birthday, bring, buzz, lightyear, onto, scene, afraid, lose, place, andi, heart, woodi, plot, buzz, circumst, separ, buzz, woodi, owner, duo, eventu, learn, put, asid, differ]|
+-----------------------------------------------------------------------------------

In [17]:
df_clean.select('terms_join').show(truncate=False, n=2)



+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|terms_join                                                                                                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|led woodi andi toy live happili room andi birthday bring buzz lightyear onto scene afraid lose place andi heart woodi plot buzz circumst separ buzz woodi owner duo eventu learn put asid differ                                |
|sibl judi peter discov enchant board game open door magic world unwit invit alan adult whos

In [18]:
from pyspark.ml.feature import HashingTF, CountVectorizer, IDF
from pyspark.ml import Pipeline

cv = CountVectorizer(inputCol="terms_stemmed", outputCol="tf_features", vocabSize=2000, minDF=10)
idf = IDF(inputCol="tf_features", outputCol="features")

pipeline = Pipeline(stages=[cv, idf])
features = pipeline.fit(df_clean)
tf_idf_features_df = features.transform(df_clean)

In [19]:
@udf("long")
def num_nonzeros(v):
    return v.numNonzeros()

In [20]:
print("Total n. of zero-length vectors: {:d}".
      format(tf_idf_features_df.where(num_nonzeros("features") == 0).count()))

Total n. of zero-length vectors: 38


In [21]:
tf_idf_features_df = tf_idf_features_df.where(num_nonzeros("features") > 0)

In [22]:
print("Total n. of zero-length vectors (after removal): {:d}".
      format(tf_idf_features_df.where(num_nonzeros("features") == 0).count()))

Total n. of zero-length vectors (after removal): 0


In [23]:
tf_idf_features_df.select('features').count()

40748

In [24]:
df_a = tf_idf_features_df.select(col("features")).first()
df_b = tf_idf_features_df.select(col("features")).take(2)[1]

In [25]:
def cosine_similarity_test(a, b):
    return 1 - a.dot(b)/(a.norm(2)*b.norm(2))
cosine_similarity_test(df_a.features, df_b.features)

0.9666635763568375