In [None]:
import socket
# Look up this machine's host name and report it.
myHostName = socket.gethostname()

print("Name of the localhost is {}".format(myHostName))
# Resolve that host name to an IP address.
myIP = socket.gethostbyname(myHostName)

print("IP address of the localhost is {}".format(myIP))

In [None]:
%matplotlib inline
from IPython.display import Image, HTML
import json
import datetime
import ast
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

In [None]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
# Build (or reuse) a local Spark session. Constructing SparkContext('local')
# directly fails with "Cannot run multiple SparkContexts at once" when this
# cell is executed a second time in the same kernel; getOrCreate() makes the
# cell safe to re-run.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

In [None]:
# Explicit schema for ratings.csv so Spark does not have to infer column types.
data_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ('userId', IntegerType()),
        ('movieId', IntegerType()),
        ('rating', FloatType()),
        ('timestamp', IntegerType()),
    )
])

# Full ratings table, cached after the first action that touches it.
final_stat = spark.read.csv(
    'C:/Users/catay/Desktop/Big Data/datasets/ratings.csv',
    header=True,
    schema=data_schema,
).cache()

# Keep only the (user, movie, rating) triple; the timestamp is left out.
ratings = final_stat.select('userId', 'movieId', 'rating').cache()

In [None]:
# Load Movies Metadata
# low_memory=False makes pandas read the file in one pass instead of in chunks.
# NOTE(review): hard-coded absolute local path; adjust for other machines.
df = pd.read_csv('C:/Users/catay/Desktop/Big Data/datasets/movies_metadata.csv', low_memory=False)

In [None]:
# Show the first rows transposed, so each column name becomes a row.
df.head().transpose()

In [None]:
###################################################################################################################
###- VISUALIZATION -###
###################################################################################################################

In [None]:
#Understand the Data
# List every column name in the metadata frame.
df.columns

In [None]:
# (rows, columns) of the metadata frame.
df.shape

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
#Start by removing a feature that is not useful to us: the IMDB identifier.
df = df.drop(columns=['imdb_id'])

In [None]:
# Peek at movies whose original title differs from the title column.
df[df['original_title'] != df['title']][['title', 'original_title']].head()

In [None]:
# Only the `title` column is used from here on, so discard `original_title`.
df = df.drop(columns='original_title')

In [None]:
#(Refers to the previous cell) We use the translated, Anglicized name in this analysis and hence dropped the original titles altogether.
#Count the rows whose revenue is recorded as 0.
df[df['revenue'] == 0].shape

In [None]:
# Replace a revenue of 0 with NaN so those rows count as missing.
df['revenue'] = df['revenue'].replace(0, np.nan)

In [None]:
#The budget feature has some unclean values that makes Pandas assign it as a generic object.
# Coerce to numeric (unparseable entries become NaN), then treat a budget of 0 as missing.
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['budget'] = df['budget'].replace(0, np.nan)
# Shape of the subset of rows that have no usable budget.
df[df['budget'].isnull()].shape

In [None]:
# Return is the ratio of revenue to budget; it is NaN whenever either side is NaN.
df['return'] = df['revenue'] / df['budget']
# Shape of the subset of rows for which no return could be computed.
df[df['return'].isnull()].shape

In [None]:
# Extract the release year (as a string) from release_date.
# Unparseable or missing dates become NaT after to_datetime. The previous test
# `x != np.nan` is always True (NaN never compares equal to anything), so those
# rows ended up with the literal string 'NaT' as their year. pd.notnull detects
# NaT/NaN correctly and leaves a real missing value instead.
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)

In [None]:
# Frequency of each distinct value in the `adult` column.
df['adult'].value_counts()

In [None]:
#Almost no movie in this dataset is flagged as adult, so the adult feature
#is of little use to us and can be safely dropped.
df = df.drop(columns='adult')

In [None]:
#More Analysis

#Production Countries
# Each cell holds a stringified list of country records: parse it, then keep
# only each record's 'name'. Anything that does not parse to a list becomes [].
df['production_countries'] = (
    df['production_countries']
    .fillna('[]')
    .apply(ast.literal_eval)
    .apply(lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else [])
)

In [None]:
# One row per (movie, production country), indexed by the movie's row label.
# explode() expands each list into its own rows and dropna() removes the
# placeholder left by empty lists. This gives the same result as building a
# pd.Series per row and stacking, without constructing one Series per movie.
s = df['production_countries'].explode().dropna()
s.name = 'countries'

In [None]:
# Count how many movies list each production country.
con_df = df.drop('production_countries', axis=1).join(s)
con_df = pd.DataFrame(con_df['countries'].value_counts())
con_df['country'] = con_df.index
con_df.columns = ['num_movies', 'country']
# Discard the country-labelled index directly. reset_index().drop('index', ...)
# only works when the index is unnamed; newer pandas names the value_counts
# index after the source column, which made that drop raise KeyError.
con_df = con_df.reset_index(drop=True)
con_df.head(10)

In [None]:
# Movies that belong to a collection (franchise).
# .copy() gives df_fran its own data so the column assignment below does not
# write into a slice of df (which triggers SettingWithCopyWarning).
df_fran = df[df['belongs_to_collection'].notnull()].copy()
# Parse the stringified record and keep the collection name; anything that
# does not parse to a dict becomes NaN and is filtered out on the next line.
df_fran['belongs_to_collection'] = df_fran['belongs_to_collection'].apply(ast.literal_eval).apply(lambda x: x['name'] if isinstance(x, dict) else np.nan)
df_fran = df_fran[df_fran['belongs_to_collection'].notnull()]

In [None]:
# Aggregate revenue per franchise: mean, sum and count, with the franchise name restored as a column.
fran_pivot = df_fran.pivot_table(index='belongs_to_collection', values='revenue', aggfunc={'revenue': ['mean', 'sum', 'count']}).reset_index()

In [None]:
#Highest Grossing Movie Franchises
# Top 10 franchises by total revenue.
fran_pivot.sort_values('sum', ascending=False).head(10)

In [None]:
#Most Successful Movie Franchises (by Average Gross)
# Top 10 franchises by mean revenue per movie.
fran_pivot.sort_values('mean', ascending=False).head(10)

In [None]:
#Longest Running Franchises
# Top 10 franchises by the 'count' aggregate of revenue.
# NOTE(review): presumably movies with missing revenue are not counted here, so
# this may understate the number of films in a franchise — confirm.
fran_pivot.sort_values('count', ascending=False).head(10)

In [None]:
#Original Language
# Number of distinct original-language values (a missing value counts as one).
df['original_language'].nunique(dropna=False)

In [None]:
# Movies per original language, most frequent first.
lang_df = pd.DataFrame(df['original_language'].value_counts())
# Copy the language codes from the index into a regular column for plotting.
lang_df['language'] = lang_df.index
lang_df.columns = ['number', 'language']
lang_df.head(20)

In [None]:
# Bar chart of rows 1-10 of lang_df, i.e. skipping the first row.
plt.figure(figsize=(12,5))
sns.barplot(x='language', y='number', data=lang_df.iloc[1:11])
plt.show()

In [None]:
#Popularity, Vote Average and Vote Count
def clean_numeric(x):
    """Convert ``x`` to ``float``, returning NaN when it cannot be converted.

    Parameters
    ----------
    x : object
        Any value; typically a string or number read from the CSV.

    Returns
    -------
    float
        ``float(x)``, or ``np.nan`` if the conversion fails.
    """
    try:
        return float(x)
    # Only swallow genuine conversion failures; a bare `except` would also
    # hide KeyboardInterrupt and unrelated programming errors.
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [None]:
# Force the three score columns to float; values that cannot be parsed become NaN.
for score_col in ('popularity', 'vote_count', 'vote_average'):
    df[score_col] = df[score_col].apply(clean_numeric).astype('float')

In [None]:
# Summary statistics of the popularity score.
df['popularity'].describe()

In [None]:
# Distribution of popularity, with missing values filled by the median.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot/displot
# are the replacements) — confirm against the installed seaborn version.
sns.distplot(df['popularity'].fillna(df['popularity'].median()))
plt.show()

In [None]:
# Histogram of popularity with a logarithmic y-axis.
df['popularity'].plot(logy=True, kind='hist')

In [None]:
#Most Popular Movies by Popularity Score
# Ten highest popularity scores, with title and year.
df[['title', 'popularity', 'year']].sort_values('popularity', ascending=False).head(10)

In [None]:
# Summary statistics of the number of votes per movie.
df['vote_count'].describe()

In [None]:
# Ten movies with the most votes.
df[['title', 'vote_count', 'year']].sort_values('vote_count', ascending=False).head(10)

In [None]:
#(Refers to the vote-count ranking in the previous cell) Inception and The Dark Knight, two critically acclaimed and commercially successful Christopher Nolan movies, figure at the top of our chart.
#Treat a vote_average of 0 as missing, then summarise the column.
df['vote_average'] = df['vote_average'].replace(0, np.nan)
df['vote_average'].describe()

In [None]:
# Distribution of vote_average, with missing values filled by the median.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# against the installed seaborn version.
sns.distplot(df['vote_average'].fillna(df['vote_average'].median()))

In [None]:
#Most Critically Acclaimed Movies
# Restrict to movies with more than 2000 votes, then rank by average vote.
df[df['vote_count'] > 2000][['title', 'vote_average', 'vote_count' ,'year']].sort_values('vote_average', ascending=False).head(10)

In [None]:
#Movie Release Dates
# Month labels indexed by (month number - 1).
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Weekday labels indexed by datetime.date.weekday() (0 = Monday).
day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

In [None]:
def get_month(x):
    """Return the three-letter month label for a 'YYYY-MM-DD' style value.

    Parameters
    ----------
    x : object
        Release date; it is converted with ``str`` before parsing, so NaN and
        other non-string values are accepted.

    Returns
    -------
    str or float
        The matching entry of ``month_order``, or ``np.nan`` when no valid
        month number (1..12) can be extracted.
    """
    try:
        month = int(str(x).split('-')[1])
    # IndexError: no '-' separated month part; ValueError: part is not an integer.
    except (IndexError, ValueError):
        return np.nan
    # Reject out-of-range months explicitly: a month of 0 would otherwise
    # index month_order[-1] and be silently reported as 'Dec'.
    if not 1 <= month <= len(month_order):
        return np.nan
    return month_order[month - 1]

In [None]:
def get_day(x):
    """Return the three-letter weekday label for a 'YYYY-MM-DD' string.

    Parameters
    ----------
    x : object
        Release date string. Non-string values (e.g. NaN) are tolerated.

    Returns
    -------
    str or float
        The matching entry of ``day_order`` (Monday first), or ``np.nan`` when
        ``x`` is not a valid calendar date in that format.
    """
    try:
        year, month, day = (int(i) for i in x.split('-'))
        answer = datetime.date(year, month, day).weekday()
    # AttributeError/TypeError: x is not a string (e.g. float NaN);
    # ValueError: wrong number of parts, non-integer part, or impossible date;
    # OverflowError: a component too large for datetime.date.
    except (AttributeError, TypeError, ValueError, OverflowError):
        return np.nan
    return day_order[answer]

In [None]:
# Derive weekday and month labels from the raw release_date values.
df['day'] = df['release_date'].apply(get_day)
df['month'] = df['release_date'].apply(get_month)

In [None]:
# Count of movies per release month, with bars in calendar order.
plt.figure(figsize=(12,6))
plt.title("Number of Movies released in a particular month.")
sns.countplot(x='month', data=df, order=month_order)

In [None]:
#Number of Movies by the year
# Count non-null titles per value of the year column and plot the trend.
year_count = df.groupby('year')['title'].count()
plt.figure(figsize=(18,5))
year_count.plot()

In [None]:
#Most Expensive Movies of all Time
# Ten largest budgets among movies with a known budget.
df[df['budget'].notnull()][['title', 'budget', 'revenue', 'return', 'year']].sort_values('budget', ascending=False).head(10)

In [None]:
#Most Successful Movies
# Ten highest returns among movies with a known return and a budget above 5e6.
df[(df['return'].notnull()) & (df['budget'] > 5e6)][['title', 'budget', 'revenue', 'return', 'year']].sort_values('return', ascending=False).head(10)

In [None]:
# Enlarge fonts for the plots that follow.
sns.set(font_scale=1.25)

In [None]:
#Genres
# Parse the stringified list of genre records and keep only each record's name;
# anything that does not parse to a list becomes an empty list.
df['genres'] = df['genres'].fillna('[]').apply(ast.literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
# One row per (movie, genre), indexed by the movie's row label.
# explode() expands each list into its own rows and dropna() removes the
# placeholder left by empty lists. This gives the same result as building a
# pd.Series per row and stacking, without constructing one Series per movie.
s = df['genres'].explode().dropna()
s.name = 'genre'

In [None]:
# Long-format frame: each movie row is repeated once per genre in s (joined on the row index).
gen_df = df.drop('genres', axis=1).join(s)

In [None]:
# Number of distinct genres (missing values are not counted).
gen_df['genre'].nunique()

In [None]:
# Movies per genre, most frequent first, as a two-column frame.
pop_gen = pd.DataFrame(gen_df['genre'].value_counts()).reset_index()
pop_gen.columns = ['genre', 'movies']
pop_gen.head(10)

In [None]:
# Bar chart of the first 15 rows of pop_gen.
plt.figure(figsize=(18,8))
sns.barplot(x='genre', y='movies', data=pop_gen.head(15))
plt.show()

In [None]:
# First ten overview entries.
df['overview'].head(10)

In [None]:
###################################################################################################################
####-- ALS REC --####
###################################################################################################################


In [None]:
# Randomly split the ratings with weights 0.7 / 0.3; the seed fixes the random split.
(training, test) = ratings.randomSplit([0.7, 0.3], seed=5047)

In [None]:
# Preview five training rows.
training.show(5)

In [None]:
# Preview five test rows.
test.show(5)

In [None]:
# Directory Spark uses for checkpoints (presumably needed by the long ALS runs below — TODO confirm).
# NOTE(review): hard-coded absolute Windows path, and the "Ä±" in the directory
# name looks like a mis-encoded character — confirm the intended folder name.
spark.sparkContext.setCheckpointDir("C:/Users/catay/Desktop/calÄ±san_01")

In [None]:
from pyspark.ml.recommendation import ALS    
import numpy as np
def trainModelAndCalcMseValue(rank,iter_num,alpha_val):
    """Fit an ALS recommender on ``training`` and print its MSE on ``test``.

    Parameters
    ----------
    rank : int
        Number of latent factors.
    iter_num : int
        Maximum number of ALS iterations.
    alpha_val : float
        Regularization strength (the "lambda" of the printed label).

    Notes
    -----
    ``alpha_val`` is passed as ``regParam``. ALS's own ``alpha`` parameter only
    applies to implicit-preference models, so passing the value there left the
    fitted model identical for every ``alpha_val``.
    Reads the module-level ``training`` and ``test`` DataFrames. Prints the
    parameters and the MSE; returns nothing.
    """
    als = ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        rank=rank,
        maxIter=iter_num,
        regParam=alpha_val,
        seed=5047,
    )

    model = als.fit(training)

    # transform() keeps the true rating next to the prediction, so no join back
    # onto the test set is needed. Users/movies unseen in training get a NaN
    # prediction; drop those rows before scoring.
    predictions = model.transform(test).na.drop()

    # Let Spark compute the metric instead of collecting every prediction to
    # the driver as NumPy arrays.
    mse = RegressionEvaluator(
        metricName="mse", labelCol="rating", predictionCol="prediction"
    ).evaluate(predictions)

    print(f"Rank : {rank} - Iterations : {iter_num} - Alpha(Lambda) : {alpha_val} ")
    print("MSE : ", mse)

In [None]:
# Hyper-parameter sweep: every combination of rank x iterations for each value
# of the third argument, in the same order as the former one-call-per-cell
# layout (all 0.1 runs first, then all 0.01 runs).
for alpha_val in (0.1, 0.01):
    for rank in (10, 50, 200):
        for iter_num in (10, 50, 200):
            trainModelAndCalcMseValue(rank, iter_num, alpha_val)