In [220]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.functions as func
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import mean, min, max


In [221]:
conf = SparkConf().set("spark.ui.port", "4050")
spark = SparkSession.builder.getOrCreate()



In [222]:
spark



In [223]:
df_movies = spark.read.load('/home/peder/dev/big-data/movie-recommendation-system/data/movies_metadata.csv',
                           format="csv",
                           sep=",",
                           inferSchema="true",
                           header="true")

In [224]:
df_movies.printSchema()


root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nu

In [225]:
print("The shape of the dataset is {:d} rows by {:d} columns".format(df_movies.count(), len(df_movies.columns)))

The shape of the dataset is 45572 rows by 24 columns


In [226]:
print(df_movies.filter(col("vote_average").isNull()).count())
print(df_movies.filter(col("vote_count").isNull()).count())
print(df_movies.filter(col("overview").isNull()).count())

498
389
985


In [227]:
df_movies = df_movies.na.drop(subset=["vote_average"])
df_movies = df_movies.na.drop(subset=["vote_count"])
df_movies = df_movies.na.drop(subset=["overview"])


In [228]:
df_movies = df_movies.withColumn("vote_average", df_movies["vote_average"].cast("double"))
df_movies = df_movies.withColumn("vote_count", df_movies["vote_count"].cast("int"))

In [229]:
df_movies = df_movies.filter((df_movies.vote_average >=0) & (df_movies.vote_average<=10))

In [230]:
df_movies.select(['vote_average']).describe().show()


+-------+------------------+
|summary|      vote_average|
+-------+------------------+
|  count|             40786|
|   mean| 5.612511975530867|
| stddev|1.9231620784205472|
|    min|               0.0|
|    max|              10.0|
+-------+------------------+



In [231]:
df_movies.select(['vote_count']).describe().show()


+-------+-----------------+
|summary|       vote_count|
+-------+-----------------+
|  count|            40760|
|   mean|112.1123405299313|
| stddev|490.1629388596058|
|    min|                0|
|    max|            12269|
+-------+-----------------+



In [232]:
df_movies.select(['overview']).show(truncate=False, n=1)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|overview                                                                                                                                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Led by Woody, Andy's toys live happily in his room until Andy's birthday brings B

In [233]:
def clean_text(df, column_name="content"):
    """
    This fucntion takes the raw text data and apply a standard NLP preprocessing pipeline consisting of the following steps:
      - Text cleaning
      - Tokenization
      - Stopwords removal
      - Stemming (Snowball stemmer)

    parameter: dataframe
    returns: the input dataframe along with the `cleaned_content` column as the results of the NLP preprocessing pipeline

    """
    from pyspark.sql.functions import udf, col, lower, trim, regexp_replace, concat_ws
    from pyspark.ml.feature import Tokenizer, StopWordsRemover
    from nltk.stem.snowball import SnowballStemmer

    # Text preprocessing pipeline
    print("***** Text Preprocessing Pipeline *****\n")

    # 1. Text cleaning
    print("# 1. Text Cleaning\n")
    # 1.a Case normalization
    print("1.a Case normalization:")
    lower_case_news_df = df.select("id", lower(col(column_name)).alias(column_name))
    lower_case_news_df.show(10)
    # 1.b Trimming
    print("1.b Trimming:")
    trimmed_news_df = lower_case_news_df.select("id", trim(col(column_name)).alias(column_name))
    trimmed_news_df.show(10)
    # 1.c Filter out punctuation symbols
    print("1.c Filter out punctuation:")
    no_punct_news_df = trimmed_news_df.select("id", (regexp_replace(col(column_name), "[^a-zA-Z\\s]", "")).alias(column_name))
    no_punct_news_df.show(10)
    # 1.d Filter out any internal extra whitespace
    print("1.d Filter out extra whitespaces:")
    cleaned_news_df = no_punct_news_df.select("id", trim(regexp_replace(col(column_name), " +", " ")).alias(column_name))

    # 2. Tokenization (split text into tokens)
    print("# 2. Tokenization:")
    tokenizer = Tokenizer(inputCol=column_name, outputCol="tokens")
    tokens_df = tokenizer.transform(cleaned_news_df)

    print("# 3. Stopwords removal:")
    stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="terms")
    terms_df = stopwords_remover.transform(tokens_df)

    print("# 4. Stemming:")
    stemmer = SnowballStemmer(language="english")
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
    terms_stemmed_df = terms_df.withColumn("terms_stemmed", stemmer_udf("terms"))
    
    print("# 5. untokenize")
    terms_joined_df = terms_stemmed_df.withColumn("terms_join", concat_ws(" ", "terms_stemmed"))
    return terms_joined_df

In [234]:
from pyspark.sql.functions import udf, col, lower, trim, regexp_replace, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer

def cleanse_text(df):    
    lower_df = df.withColumn("overview_lower", lower(col("overview")))
    trim_df = lower_df.withColumn("overview_trim", trim(col("overview_lower")))
    no_punct_df = trim_df.withColumn("overview_no_punct", regexp_replace(col("overview_trim"), "[^a-zA-Z\\s]", ""))
    no_whitespace_df = no_punct_df.withColumn("overview_cleansed", trim(regexp_replace(col("overview_no_punct"), " +", " ")))
    return no_whitespace_df, "overview_cleansed"                  
    
def tokenize_text(df, column_name):
    tokenizer = Tokenizer(inputCol=column_name, outputCol="tokens")
    return tokenizer.transform(df), "tokens"

def stem_tokens(df, column_name):
    stemmer = SnowballStemmer(language="english")
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
    return df.withColumn("terms_stemmed", stemmer_udf(column_name)), "terms_stemmed"

def remove_stop_words(df, column_name):
    stopwords_remover = StopWordsRemover(inputCol=column_name, outputCol="terms")
    return stopwords_remover.transform(df), 'terms'

def concat_arr(df, column_name):
    return  df.withColumn("corpus", concat_ws(" ", column_name))

def cleaning_pipe(df):
    df, column_name = cleanse_text(df)
    df, column_name = tokenize_text(df, column_name)
    df, column_name = remove_stop_words(df, column_name)
    df, column_name = stem_tokens(df, column_name)
    return df, column_name

In [235]:
#df_clean = clean_text(df_movies, 'overview')
#df_clean
#from pyspark.ml.feature import Tokenizer
#tokenizer = Tokenizer(inputCol="overview", outputCol="terms_stemmed")
#tokens_df = tokenizer.transform(df_movies)
#df_clean= tokens_df

In [264]:
df_clean, column_name = cleaning_pipe(df_movies)

df_pandas = concat_arr(df_clean, column_name).toPandas()

In [265]:
df_clean.select(column_name).show()


+--------------------+
|       terms_stemmed|
+--------------------+
|[led, woodi, andi...|
|[sibl, judi, pete...|
|[famili, wed, rei...|
|[georg, bank, rec...|
|[obsess, master, ...|
|[ugli, duckl, und...|
|[mischiev, young,...|
|[intern, action, ...|
|[jame, bond, must...|
|[widow, us, presi...|
|[lawyer, show, va...|
|[outcast, halfwol...|
|[allstar, cast, p...|
|[morgan, adam, sl...|
|[life, gambl, par...|
|[rich, mr, dashwo...|
|[ted, bellhop, fi...|
|[summon, ashram, ...|
|[veng, new, york,...|
|[agoraphob, psych...|
+--------------------+
only showing top 20 rows



In [266]:
#df_clean.select('terms_join').show(truncate=False, n=2)



In [267]:
def tf_idf_features(df):
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(df['corpus'])

def cosine_similarity(df, index):
    from sklearn.metrics.pairwise import cosine_similarity
    return cosine_similarity(df[index], df)[0] # score item in dataframe against all other records

In [268]:
tf_idf_matrix = tf_idf_features(df_pandas)
df_pandas['tfidf'] = tf_idf_matrix

In [269]:
df_index = pd.Series(df_pandas.index, index=df_pandas['title']).drop_duplicates() # create index

def sort_cos_score(matrix, df):
    arr = []
    for index, row in df.iterrows():
        arr.append(matrix[index])
    df['cosine_score'] = arr
    return df.sort_values(by=['cosine_score'], ascending=False)

def get_similar(title):
    index = df_index[title]
    cos_sim = cosine_similarity(tf_idf_matrix, index)
    return sort_cos_score(cos_sim, df_pandas)['original_title'].head(10)
    return False

In [270]:
get_similar('Batman Begins')

9120                                         Batman Begins
19169    Batman Unmasked: The Psychology of the Dark Kn...
17892              Batman: The Dark Knight Returns, Part 1
32412                                    Batman: Bad Blood
16292                                     Batman: Year One
14023                           Batman: Under the Red Hood
2771                          Batman: Mask of the Phantasm
40352                             Batman Beyond: The Movie
39300                                        Batman & Bill
1197                                        Batman Returns
Name: original_title, dtype: object

In [271]:
get_similar('Memento')

3679                                               Memento
30153    The True Meaning of Pictures: Shelby Lee Adams...
7275                                                  Novo
14879                                              Ghajini
28773                        Hunt for the Labyrinth Killer
23311                                            Implanted
29256                                             Amnesiac
35342                                       Without Memory
40657                                  La sangre iluminada
20066                                           Open Grave
Name: original_title, dtype: object

In [272]:
get_similar('The Avengers')


16100                       The Avengers
23078       Kingsman: The Secret Service
11856    When Willie Comes Marching Home
28516                               Waar
24006            Avengers: Age of Ultron
22022                            Plastic
12218                 Echelon Conspiracy
6217         A Woman Under the Influence
29129             Drums Across the River
4751                         Bad Company
Name: original_title, dtype: object

In [273]:
get_similar('Toy Story')


0                                           Toy Story
13874                                     Toy Story 3
2683                                      Toy Story 2
22181                                       Small Fry
9287                           The 40 Year Old Virgin
21564                     Andy Hardy's Blonde Trouble
26377                                      Hot Splash
34590    Superstar: The Life and Times of Andy Warhol
38380    Andy Peters: Exclamation Mark Question Point
1823                                   Pretty in Pink
Name: original_title, dtype: object

In [274]:
get_similar('Ice Age')


4569                            Ice Age
12505    Ice Age: Dawn of the Dinosaurs
9766              Ice Age: The Meltdown
40467                     Surviving Sid
17296        Ice Age: Continental Drift
13310                          The Thaw
22812      Ice Age: A Mammoth Christmas
11235                         10,000 BC
40465               Мама для мамонтёнка
18059                              龍在天涯
Name: original_title, dtype: object

In [246]:
# same in pyspark but get different scores since tf_idf is calculated differently
from pyspark.ml.feature import HashingTF, CountVectorizer, IDF
from pyspark.ml import Pipeline

cv = CountVectorizer(inputCol=column_name, outputCol="tf_features", vocabSize=20000, minDF=.01, maxDF=.90)
idf = IDF(inputCol="tf_features", outputCol="features")

pipeline = Pipeline(stages=[cv, idf])
features = pipeline.fit(df_clean)
tf_idf_df = features.transform(df_clean)

In [247]:
@udf("long")
def num_nonzeros(v):
    return v.numNonzeros()

In [248]:
print("Total n. of zero-length vectors: {:d}".
      format(tf_idf_df.where(num_nonzeros("features") == 0).count()))

Total n. of zero-length vectors: 155


In [249]:
tf_idf_df = tf_idf_df.where(num_nonzeros("features") > 0)

In [250]:
print("Total n. of zero-length vectors (after removal): {:d}".
      format(tf_idf_df.where(num_nonzeros("features") == 0).count()))

Total n. of zero-length vectors (after removal): 0


In [251]:
tf_idf_df.select('features').count()

40631

In [252]:
tf_idf_df.printSchema()


root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (n

In [253]:
def cosine_similarity_vec(a, b):
    return 1 - a.dot(b)/(a.norm(2)*b.norm(2))

def cosine_similarity(id1, id2):
    df_1 = tf_idf_df.select("features").where(tf_idf_df.id == id1).first()
    df_2 = tf_idf_df.select("features").where(tf_idf_df.id == id2).first()
    return cosine_similarity_vec(df_1.features, df_2.features)

cosine_similarity("238","240") # toy story 1 and 2

0.5721506761565958

In [254]:
tf_idf_df.select(["id","title"]).filter(lower(tf_idf_df.title).like('%toy%')).show(truncate=False,n=3)

+-----+----------------+
|id   |title           |
+-----+----------------+
|862  |Toy Story       |
|11597|Toys            |
|25898|Babes in Toyland|
+-----+----------------+
only showing top 3 rows



In [255]:
tf_idf_df.select(["id","title"]).filter(lower(tf_idf_df.title).like('%the prestige%')).show(truncate=False, n=5)

+----+------------+
|id  |title       |
+----+------------+
|1124|The Prestige|
+----+------------+



In [256]:
def cos_sim_dataset(movie_vec):
    udf_func = udf(lambda df:str(cosine_similarity_vec(movie_vec.features, df)))
    return tf_idf_df.withColumn("cosine_similarity", udf_func("features"))

In [257]:
df_vec_batman_forever = tf_idf_df.select("features").where(tf_idf_df.id == "414").first()

df_similarity = cos_sim_dataset(df_vec_batman_forever)
    
df_similarity.select(["cosine_similarity", "id"]).show(truncate=False, n=5)

+------------------+-----+
|cosine_similarity |id   |
+------------------+-----+
|1.0               |862  |
|1.0               |8844 |
|0.9759698416687279|15602|
|1.0               |11862|
|1.0               |949  |
+------------------+-----+
only showing top 5 rows



In [258]:
df_similarity.select(["title","id", "cosine_similarity"]).where(df_movies.id == "8844").first()

Row(title='Jumanji', id='8844', cosine_similarity='1.0')

In [259]:
#df_movies.select(["id","title"]).filter(lower(df_movies.title).like('%the godfather%')).show()


In [260]:
df_similarity = df_similarity.withColumn("cosine_similarity", df_similarity["cosine_similarity"].cast("double"))


In [261]:
df_similarity.orderBy('cosine_similarity', ascending=False).select(["cosine_similarity", "id", "title"]).show()

+-----------------+------+--------------------+
|cosine_similarity|    id|               title|
+-----------------+------+--------------------+
|              1.0| 60410|      Remote Control|
|              1.0|379291|Justice League vs...|
|              1.0|157005|Google and the Wo...|
|              1.0| 75027|          Checkpoint|
|              1.0| 36818|         Smorgasbord|
|              1.0|250852|The Notorious Mr....|
|              1.0|242631|     So This Is Love|
|              1.0|154786|          Wavemakers|
|              1.0|207178|The Lion and the ...|
|              1.0| 34158|    The People Speak|
|              1.0| 85231|The Lone Wolf Mee...|
|              1.0|122544|   The Perfect House|
|              1.0| 66060|            Vacation|
|              1.0|257044|   Game For Vultures|
|              1.0|372691|       White Bondage|
|              1.0|254952|          Me Him Her|
|              1.0|178447|           Wide Open|
|              1.0|347848|  Last Girl St

In [262]:
df_similarity.groupby(["cosine_similarity"]).count().sort("cosine_similarity", ascending=False).show(10)


+------------------+-----+
| cosine_similarity|count|
+------------------+-----+
|               1.0|19642|
|0.9942500815212627|    1|
| 0.994166116296929|    1|
|0.9938847858651231|    1|
|0.9935367136502573|    1|
|0.9931548768957004|    1|
|0.9930463355319291|    1|
|0.9930395897125676|    1|
|0.9928902739532759|    1|
|0.9925960510257771|    1|
+------------------+-----+
only showing top 10 rows

