In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.functions as func
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import mean, min, max


In [2]:
conf = SparkConf().set("spark.ui.port", "4050")
spark = SparkSession.builder.getOrCreate()



In [3]:
spark



In [4]:
df_movies = spark.read.load('/home/peder/dev/big-data/movie-recommendation-system/data/movies_metadata.csv',
                           format="csv",
                           sep=",",
                           inferSchema="true",
                           header="true")

In [5]:
df_movies.printSchema()


root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nu

In [6]:
print("The shape of the dataset is {:d} rows by {:d} columns".format(df_movies.count(), len(df_movies.columns)))

The shape of the dataset is 45572 rows by 24 columns


In [7]:
print(df_movies.filter(col("vote_average").isNull()).count())
print(df_movies.filter(col("vote_count").isNull()).count())
print(df_movies.filter(col("overview").isNull()).count())

498
389
985


In [8]:
df_movies = df_movies.na.drop(subset=["vote_average"])
df_movies = df_movies.na.drop(subset=["vote_count"])
df_movies = df_movies.na.drop(subset=["overview"])


In [9]:
df_movies = df_movies.withColumn("vote_average", df_movies["vote_average"].cast("double"))
df_movies = df_movies.withColumn("vote_count", df_movies["vote_count"].cast("int"))

In [10]:
df_movies = df_movies.filter((df_movies.vote_average >=0) & (df_movies.vote_average<=10))

In [11]:
df_movies.select(['vote_average']).describe().show()


+-------+------------------+
|summary|      vote_average|
+-------+------------------+
|  count|             40786|
|   mean| 5.612511975530867|
| stddev|1.9231620784205472|
|    min|               0.0|
|    max|              10.0|
+-------+------------------+



In [12]:
df_movies.select(['vote_count']).describe().show()


+-------+-----------------+
|summary|       vote_count|
+-------+-----------------+
|  count|            40760|
|   mean|112.1123405299313|
| stddev|490.1629388596058|
|    min|                0|
|    max|            12269|
+-------+-----------------+



In [13]:
df_movies.select(['overview']).show(truncate=False, n=1)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|overview                                                                                                                                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Led by Woody, Andy's toys live happily in his room until Andy's birthday brings B

In [14]:
def clean_text(df, column_name="content"):
    """
    This fucntion takes the raw text data and apply a standard NLP preprocessing pipeline consisting of the following steps:
      - Text cleaning
      - Tokenization
      - Stopwords removal
      - Stemming (Snowball stemmer)

    parameter: dataframe
    returns: the input dataframe along with the `cleaned_content` column as the results of the NLP preprocessing pipeline

    """
    from pyspark.sql.functions import udf, col, lower, trim, regexp_replace, concat_ws
    from pyspark.ml.feature import Tokenizer, StopWordsRemover
    from nltk.stem.snowball import SnowballStemmer

    # Text preprocessing pipeline
    print("***** Text Preprocessing Pipeline *****\n")

    # 1. Text cleaning
    print("# 1. Text Cleaning\n")
    # 1.a Case normalization
    print("1.a Case normalization:")
    lower_case_news_df = df.select("id", lower(col(column_name)).alias(column_name))
    lower_case_news_df.show(10)
    # 1.b Trimming
    print("1.b Trimming:")
    trimmed_news_df = lower_case_news_df.select("id", trim(col(column_name)).alias(column_name))
    trimmed_news_df.show(10)
    # 1.c Filter out punctuation symbols
    print("1.c Filter out punctuation:")
    no_punct_news_df = trimmed_news_df.select("id", (regexp_replace(col(column_name), "[^a-zA-Z\\s]", "")).alias(column_name))
    no_punct_news_df.show(10)
    # 1.d Filter out any internal extra whitespace
    print("1.d Filter out extra whitespaces:")
    cleaned_news_df = no_punct_news_df.select("id", trim(regexp_replace(col(column_name), " +", " ")).alias(column_name))

    # 2. Tokenization (split text into tokens)
    print("# 2. Tokenization:")
    tokenizer = Tokenizer(inputCol=column_name, outputCol="tokens")
    tokens_df = tokenizer.transform(cleaned_news_df)

    print("# 3. Stopwords removal:")
    stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="terms")
    terms_df = stopwords_remover.transform(tokens_df)

    print("# 4. Stemming:")
    stemmer = SnowballStemmer(language="english")
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
    terms_stemmed_df = terms_df.withColumn("terms_stemmed", stemmer_udf("terms"))
    
    print("# 5. untokenize")
    terms_joined_df = terms_stemmed_df.withColumn("terms_join", concat_ws(" ", "terms_stemmed"))
    return terms_joined_df

In [15]:
from pyspark.sql.functions import udf, col, lower, trim, regexp_replace, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer

def cleanse_text(df):    
    lower_df = df.withColumn("overview_lower", lower(col("overview")))
    trim_df = lower_df.withColumn("overview_trim", trim(col("overview_lower")))
    no_punct_df = trim_df.withColumn("overview_no_punct", regexp_replace(col("overview_trim"), "[^a-zA-Z\\s]", ""))
    no_whitespace_df = no_punct_df.withColumn("overview_cleansed", trim(regexp_replace(col("overview_no_punct"), " +", " ")))
    return no_whitespace_df, "overview_cleansed"                  
    
def tokenize_text(df, column_name):
    tokenizer = Tokenizer(inputCol=column_name, outputCol="tokens")
    return tokenizer.transform(df), "tokens"

def stem_tokens(df, column_name):
    stemmer = SnowballStemmer(language="english")
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
    return df.withColumn("terms_stemmed", stemmer_udf(column_name)), "terms_stemmed"

def remove_stop_words(df, column_name):
    stopwords_remover = StopWordsRemover(inputCol=column_name, outputCol="terms")
    return stopwords_remover.transform(df)

def concat_arr(df, column_name):
    return  df.withColumn("corpus", concat_ws(" ", column_name))

def cleaning_pipe(df):
    df, column_name = cleanse_text(df)
    df, column_name = tokenize_text(df, column_name)
    df, column_name = stem_tokens(df, column_name)
    return df, column_name

In [16]:
#df_clean = clean_text(df_movies, 'overview')
#df_clean
#from pyspark.ml.feature import Tokenizer
#tokenizer = Tokenizer(inputCol="overview", outputCol="terms_stemmed")
#tokens_df = tokenizer.transform(df_movies)
#df_clean= tokens_df

In [17]:
df_clean, column_name = cleaning_pipe(df_movies)

df_pandas = concat_arr(df_clean, column_name).toPandas()

In [18]:
df_clean.select(column_name).show()


+--------------------+
|       terms_stemmed|
+--------------------+
|[led, by, woodi, ...|
|[when, sibl, judi...|
|[a, famili, wed, ...|
|[just, when, geor...|
|[obsess, master, ...|
|[an, ugli, duckl,...|
|[a, mischiev, you...|
|[intern, action, ...|
|[jame, bond, must...|
|[widow, us, presi...|
|[when, a, lawyer,...|
|[an, outcast, hal...|
|[an, allstar, cas...|
|[morgan, adam, an...|
|[the, life, of, t...|
|[rich, mr, dashwo...|
|[it, ted, the, be...|
|[summon, from, an...|
|[a, veng, new, yo...|
|[an, agoraphob, p...|
+--------------------+
only showing top 20 rows



In [19]:
#df_clean.select('terms_join').show(truncate=False, n=2)



In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_pandas['corpus'])

<40786x60074 sparse matrix of type '<class 'numpy.float64'>'
	with 1622711 stored elements in Compressed Sparse Row format>

In [21]:
df_pandas['tfidf'] = X

In [22]:
batman_begins = df_pandas.loc[df_pandas['id'] == '272'].index.item()
batman_dark_knight = df_pandas.loc[df_pandas['id'] == '862'].index.item()

In [23]:
batman_beings_cosine = cosine_similarity(X[batman_begins], X)[0]
print(batman_beings_cosine[batman_dark_knight])


0.022528706587039055
0.022528706587039055


In [24]:
len(batman_beings_cosine)

40786

In [25]:
df_pandas.shape

(40786, 32)

In [45]:
def sort_cos_score(matrix, df):
    arr = []
    for index, row in df.iterrows():
        arr.append(matrix[index])
    df['cosine_score'] = arr

sort_cos_score(batman_beings_cosine, df_pandas)
df_pandas.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [49]:
sorted_df = df_pandas.sort_values(by=['cosine_score'], ascending=False)
sorted_df['original_title'].head(20)

9120                                         Batman Begins
19169    Batman Unmasked: The Psychology of the Dark Kn...
17892              Batman: The Dark Knight Returns, Part 1
32412                                    Batman: Bad Blood
16292                                     Batman: Year One
39300                                        Batman & Bill
2771                          Batman: Mask of the Phantasm
11275                                      The Dark Knight
14023                           Batman: Under the Red Hood
40352                             Batman Beyond: The Movie
1197                                        Batman Returns
534                                                 Batman
16491                                The Dark Knight Rises
135                                         Batman Forever
3237                                               Serpico
18290              Batman: The Dark Knight Returns, Part 2
19808                                      Pillow of Dea

In [27]:
from pyspark.ml.feature import HashingTF, CountVectorizer, IDF
from pyspark.ml import Pipeline

cv = CountVectorizer(inputCol=column_name, outputCol="tf_features", vocabSize=20000, minDF=.01, maxDF=.90)
idf = IDF(inputCol="tf_features", outputCol="features")

pipeline = Pipeline(stages=[cv, idf])
features = pipeline.fit(df_clean)
tf_idf_df = features.transform(df_clean)

In [28]:
@udf("long")
def num_nonzeros(v):
    return v.numNonzeros()

In [29]:
print("Total n. of zero-length vectors: {:d}".
      format(tf_idf_df.where(num_nonzeros("features") == 0).count()))

Total n. of zero-length vectors: 42


In [30]:
tf_idf_df = tf_idf_df.where(num_nonzeros("features") > 0)

In [31]:
print("Total n. of zero-length vectors (after removal): {:d}".
      format(tf_idf_df.where(num_nonzeros("features") == 0).count()))

Total n. of zero-length vectors (after removal): 0


In [32]:
tf_idf_df.select('features').count()

40744

In [33]:
tf_idf_df.printSchema()


root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (n

In [34]:
def cosine_similarity_vec(a, b):
    return 1 - a.dot(b)/(a.norm(2)*b.norm(2))

def cosine_similarity(id1, id2):
    df_1 = tf_idf_df.select("features").where(tf_idf_df.id == id1).first()
    df_2 = tf_idf_df.select("features").where(tf_idf_df.id == id2).first()
    return cosine_similarity_vec(df_1.features, df_2.features)

cosine_similarity("238","240") # toy story 1 and 2

0.6498177078750322

In [35]:
tf_idf_df.select(["id","title"]).filter(lower(tf_idf_df.title).like('%toy%')).show(truncate=False,n=3)

+-----+----------------+
|id   |title           |
+-----+----------------+
|862  |Toy Story       |
|11597|Toys            |
|25898|Babes in Toyland|
+-----+----------------+
only showing top 3 rows



In [36]:
tf_idf_df.select(["id","title"]).filter(lower(tf_idf_df.title).like('%the dark knight%')).show(truncate=False, n=75)


+------+--------------------------------------------------+
|id    |title                                             |
+------+--------------------------------------------------+
|155   |The Dark Knight                                   |
|49026 |The Dark Knight Rises                             |
|123025|Batman: The Dark Knight Returns, Part 1           |
|142061|Batman: The Dark Knight Returns, Part 2           |
|29751 |Batman Unmasked: The Psychology of the Dark Knight|
|72003 |The Dark Knight                                   |
+------+--------------------------------------------------+



In [37]:
def cos_sim_dataset(movie_vec):
    udf_func = udf(lambda df:str(cosine_similarity_vec(movie_vec.features, df)))
    return tf_idf_df.withColumn("cosine_similarity", udf_func("features"))

In [38]:
df_vec_batman_forever = tf_idf_df.select("features").where(tf_idf_df.id == "414").first()

df_similarity = cos_sim_dataset(df_vec_batman_forever)
    
df_similarity.select(["cosine_similarity", "id"]).show(truncate=False, n=5)

+------------------+-----+
|cosine_similarity |id   |
+------------------+-----+
|0.9840198875239855|862  |
|0.9686641779645443|8844 |
|0.9707876211721896|15602|
|0.9833849583903742|11862|
|0.9750759689823176|949  |
+------------------+-----+
only showing top 5 rows



In [39]:
df_similarity.select(["title","id", "cosine_similarity"]).where(df_movies.id == "8844").first()

Row(title='Jumanji', id='8844', cosine_similarity='0.9686641779645443')

In [40]:
#df_movies.select(["id","title"]).filter(lower(df_movies.title).like('%the godfather%')).show()


In [41]:
df_similarity = df_similarity.withColumn("cosine_similarity", df_similarity["cosine_similarity"].cast("double"))


In [42]:
df_similarity.orderBy('cosine_similarity', ascending=False).select(["cosine_similarity", "id", "title"]).show()

+-----------------+------+--------------------+
|cosine_similarity|    id|               title|
+-----------------+------+--------------------+
|              1.0| 52279|         The Dentist|
|              1.0|211222|The Wildman of Ke...|
|              1.0|  9657|    Otto - The Movie|
|              1.0|130957|The Sons of Great...|
|              1.0|  1659|Texas - Doc Snyde...|
|              1.0|365566|Small People with...|
|              1.0|  4948|          Womanlight|
|              1.0|  1655|Praxis Dr. Hasenbein|
|              1.0|  3102|My Step Brother F...|
|              1.0|121170|          Simply Raw|
|              1.0|113660|              Rumble|
|              1.0|  4290|          Supergator|
|              1.0|  9379|Otto - The Alien ...|
|              1.0|342695|        Cannibal Fog|
|              1.0| 12696|            Hierankl|
|              1.0| 16884|Anders Matthesen:...|
|              1.0|191427|The House of Dark...|
|              1.0|270650|Oy Vey! My Son

In [43]:
df_similarity.groupby(["cosine_similarity"]).count().sort("cosine_similarity", ascending=False).show(10)


+------------------+-----+
| cosine_similarity|count|
+------------------+-----+
|               1.0|  326|
|0.9999213347225796|    1|
| 0.999893976010199|    1|
|0.9998904979463532|    1|
|0.9998851437309468|    1|
|0.9998822752801851|    1|
|0.9998798629071319|    1|
|0.9998766481345249|    1|
|0.9998701623832511|    1|
|0.9998689641002112|    1|
+------------------+-----+
only showing top 10 rows

