<a href="https://colab.research.google.com/github/CagataySencan/Hybrid-Movie-Recommender-System/blob/main/HybridMovieRecommanderEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Genel Hazırlıklar

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from string import punctuation
from gensim.parsing.preprocessing import remove_stopwords
from collections import Counter
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
!pip install scikit-surprise
from scipy.sparse import csr_matrix
from surprise import Reader,Dataset,SVD
from surprise.model_selection import cross_validate
from sklearn.neighbors import NearestNeighbors
!pip install fuzzywuzzy
from fuzzywuzzy import process

In [None]:
# Install the Kaggle command-line client used by the dataset download cell.
! pip install kaggle

In [3]:
! mkdir ~/.kaggle

In [4]:
# Copy kaggle.json (the Kaggle API credentials file) from /content into the config directory.
!cp /content/kaggle.json ~/.kaggle/kaggle.json

In [5]:
# Restrict the credentials file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the two dataset archives with the Kaggle CLI.
! kaggle datasets download rounakbanik/the-movies-dataset
! kaggle datasets download tmdb/tmdb-movie-metadata

In [None]:
# Extract both downloaded archives into the current working directory.
! unzip /content/the-movies-dataset.zip
! unzip /content/tmdb-movie-metadata.zip

In [8]:
# read_csv already returns a DataFrame, so no extra wrapping is needed.
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

# Content Based Filtering


In [None]:
# Preparation for content-based filtering
movies_metadata_tmdb = pd.DataFrame(pd.read_csv('tmdb_5000_movies.csv'))
movies_tmdb_credits = pd.DataFrame(pd.read_csv('tmdb_5000_credits.csv'))
content_based_movies = movies_metadata_tmdb[['overview','title','genres', 'keywords']]
content_based_credits = movies_tmdb_credits[['cast','crew','movie_id']]
# NOTE(review): join() without `on` aligns the two frames by row index, not by movie_id;
# this assumes both CSV files list the movies in the same row order — TODO confirm.
content_based = content_based_movies.join(content_based_credits)
# There is enough data, so rows with missing values are simply dropped
content_based = content_based.dropna()

# To get the most out of this filter it is split into two sub-filters:
# the first recommends by overview, the second by cast, crew, keywords and genres.
# The TMDB 5000 Movie Dataset is used for this filter because the cosine similarity
# scores could not be computed otherwise due to insufficient memory.
content_based.head()

### Overview Bazlı Filtre

In [None]:
# Overview-based filter

overview_based = content_based[['title','overview']]
print(overview_based.isnull().sum())
print(overview_based.head())

# Vectorise the overview texts

tf_idf = TfidfVectorizer(stop_words='english')
tfidf_matrix_overview = tf_idf.fit_transform(overview_based['overview'])


# Score the similarity between movies numerically. TfidfVectorizer L2-normalises
# its rows by default, so the linear kernel yields cosine similarity scores.
similarity_overview = linear_kernel(tfidf_matrix_overview,tfidf_matrix_overview)
overview_based = overview_based.reset_index()
# Map each title to its row position. De-duplicate on the *title* labels:
# drop_duplicates() compares the values (row positions, which are all unique),
# so it never removed anything and a repeated title made index[title] a Series.
index = pd.Series(overview_based.index, index=overview_based['title'])
index = index[~index.index.duplicated(keep='first')]

In [11]:
# Recommend movies for a given title based on overview similarity
def recommend_by_overview(title, sim = similarity_overview) :
  """Return the movies whose overviews are most similar to ``title``.

  Args:
    title: Movie title, looked up in the module-level ``index`` Series.
    sim: Square similarity matrix; row ``i`` holds the scores of movie ``i``
      against every row of ``overview_based``.

  Returns:
    dict mapping up to 10 recommended titles to their similarity score,
    inserted from most to least similar.

  Raises:
    KeyError: if ``title`` is not a label of ``index``.
  """
  idx = index[title]
  # A title shared by several movies makes the lookup return a Series;
  # fall back to the first match so sim[idx] stays a single row.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]
  ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)
  # Exclude the query movie by position rather than assuming it is ranked first
  # (score ties can push it down), then keep at most ten candidates.
  top = [pair for pair in ranked if pair[0] != idx][:10]
  titles = overview_based['title']
  return {titles.iloc[pos]: score for pos, score in top}

### Cast, Crew, Keyword, Genre Bazlı Filtre

In [None]:
features = ['cast', 'crew', 'keywords', 'genres']
# Work on an explicit copy: the column assignments in this section would otherwise
# write into a slice of content_based and trigger SettingWithCopyWarning.
feature_based = content_based[['title','cast', 'crew', 'keywords', 'genres']].copy()

# These columns hold Python-literal strings; parse them into real objects.
for feature in features :
  feature_based[feature] = feature_based[feature].apply(literal_eval)

# Helper that pulls the director out of the crew column
def add_director(j) :
  """Return the name of the first crew entry whose job is 'Director', else NaN.

  Args:
    j: Iterable of crew dicts, each providing 'job' and 'name' keys.
  """
  directors = (member['name'] for member in j if member['job'] == 'Director')
  return next(directors, np.nan)

# Helper that keeps the first 5 entries of the other list columns
def add_list(j) :
  """Return the 'name' of the first five entries of ``j``.

  Args:
    j: List of dicts with a 'name' key; any non-list value yields ``[]``.
  """
  if not isinstance(j, list):
    return []
  # Collect every name first, then truncate to five.
  every_name = [entry['name'] for entry in j]
  return every_name[:5]

# Bring the data into a usable form: extract the director from crew,
# then reduce each list column to the names of its first entries.
feature_based['director'] = feature_based['crew'].apply(add_director)
features = ['cast', 'keywords', 'genres']

for feature in features:
    feature_based[feature] = feature_based[feature].apply(add_list)

# crew is dropped once the director has been extracted from it
feature_based = feature_based.drop(columns = ['crew'])

In [13]:
# Lower-case every value and strip the spaces inside it
def clean(j):
    """Normalise a feature value.

    A list is normalised element by element, a string directly; any other
    value (e.g. NaN for a missing director) becomes the empty string.
    """
    if isinstance(j, list):
        return [item.replace(" ", "").lower() for item in j]
    if isinstance(j, str):
        return j.replace(" ", "").lower()
    return ''

# Columns to normalise with clean(); it handles both list and string values.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    feature_based[feature] = feature_based[feature].apply(clean)

# Build a single text column holding all features so they can be vectorised in one pass
def all_data(x):
    """Concatenate keywords, cast, director and genres of row ``x`` into one space-separated string."""
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)


feature_based['all'] = feature_based.apply(all_data, axis=1)

tf_idf = TfidfVectorizer(stop_words='english')
tfidf_matrix_feature = tf_idf.fit_transform(feature_based['all'])

# Score the similarity between movies numerically. TfidfVectorizer L2-normalises
# its rows by default, so the linear kernel yields cosine similarity scores.
similarity_feature = linear_kernel(tfidf_matrix_feature,tfidf_matrix_feature)
feature_based = feature_based.reset_index()
# Map each title to its row position. De-duplicate on the *title* labels:
# drop_duplicates() compares the values (row positions, which are all unique),
# so it never removed anything and a repeated title made index[title] a Series.
index = pd.Series(feature_based.index, index=feature_based['title'])
index = index[~index.index.duplicated(keep='first')]

In [14]:
def recommend_by_feature(title, sim = similarity_feature) :
  """Return the movies most similar to ``title`` by cast, director, keywords and genres.

  Args:
    title: Movie title, looked up in the module-level ``index`` Series.
    sim: Square similarity matrix; row ``i`` holds the scores of movie ``i``
      against every row of ``feature_based``.

  Returns:
    dict mapping up to 10 recommended titles to their similarity score,
    inserted from most to least similar.

  Raises:
    KeyError: if ``title`` is not a label of ``index``.
  """
  idx = index[title]
  # A title shared by several movies makes the lookup return a Series;
  # fall back to the first match so sim[idx] stays a single row.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]
  ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)
  # Exclude the query movie by position rather than assuming it is ranked first
  # (score ties can push it down), then keep at most ten candidates.
  top = [pair for pair in ranked if pair[0] != idx][:10]
  titles = feature_based['title']
  return {titles.iloc[pos]: score for pos, score in top}

In [None]:
# Both filters return dictionaries, so a helper that finds a key from its value is needed
def get_key(d, val):
    """Return the first key of ``d`` whose value equals ``val``, or None when there is none."""
    return next((key for key, value in d.items() if value == val), None)

# Merge the similarity scores coming from both content filters,
# rank them from highest to lowest and recommend the 10 best movies
def final_recommendation_content_based(title) :
  """Combine the overview-based and feature-based recommendations for ``title``.

  Args:
    title: Movie title passed to both content filters.

  Returns:
    list of at most 10 distinct titles ordered by descending similarity score;
    a title returned by both filters is ranked by its higher score.
  """
  overview_scores = recommend_by_overview(title)
  feature_scores = recommend_by_feature(title)
  # Rank (title, score) pairs directly: looking titles up by score loses movies
  # that share a score, and a fixed 20-step loop fails when fewer scores exist.
  candidates = list(overview_scores.items()) + list(feature_scores.items())
  # The sort is stable, so on ties the overview-based results keep precedence.
  candidates.sort(key=lambda pair: pair[1], reverse=True)
  recommend_list = []
  for name, _ in candidates:
    if name not in recommend_list:
      recommend_list.append(name)
  return recommend_list[0:10]

final_recommendation_content_based('The Godfather')

# Collaborative Filtering

### User-Based Collaborative Filtering

In [None]:
# Data preparation for the filter
movies = pd.read_csv('movies_metadata.csv')
ratings = pd.read_csv('ratings_uploaded.csv')
# Keep only the title and the id, exposing the id as 'movieId'. Renaming in one
# step avoids assigning a new column into a slice (SettingWithCopyWarning) and
# yields the same columns as before: ['title', 'movieId'].
movies = movies[['title','id']].rename(columns={'id': 'movieId'})

In [88]:
# Use SVD to estimate the rating a given user would give to a given movie.
# The fitted model is cached here so that repeated calls (one per candidate
# movie) do not retrain it; it is refitted when `ratings` is rebound.
_collab_svd_cache = {}

def collaborative_filter(movie_id,user_id) : 
  """Estimate the rating ``user_id`` would give ``movie_id`` with an SVD model.

  Args:
    movie_id: Raw item id, as found in ``ratings['movieId']``.
    user_id: Raw user id, as found in ``ratings['userId']``.

  Returns:
    str: the estimated rating formatted with two decimals (kept as a string
    because existing callers receive a string).
  """
  if _collab_svd_cache.get('ratings') is not ratings:
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], Reader())
    svd = SVD()
    # The cross-validation pass is omitted: its result was discarded and it only
    # added extra training rounds to every prediction.
    svd.fit(data.build_full_trainset())
    _collab_svd_cache['ratings'] = ratings
    _collab_svd_cache['model'] = svd

  # Predict for the requested user (not a hard-coded id) and read the estimate
  # from the Prediction's `est` field rather than parsing its string form.
  prediction = _collab_svd_cache['model'].predict(user_id, movie_id)
  return '{:.2f}'.format(prediction.est)

# Demographic Filtering

In [None]:
# This kind of filtering plays only a minor role in the project; it works purely on popularity.
movies_metadata = pd.read_csv('movies_metadata.csv')
# Mean vote_average over all movies; default for weighted_rating's vote_average parameter
vote_average= movies_metadata['vote_average'].mean()
# 90th percentile of vote_count; also the minimum vote count for a movie to be kept below
vote_count= movies_metadata['vote_count'].quantile(0.9)

demo_movies = movies_metadata.copy().loc[movies_metadata['vote_count'] >= vote_count]

# Apply the formula IMDB uses to rate movies
def weighted_rating(x, vote_count=vote_count, vote_average=vote_average):
    """Return the weighted rating of row ``x``.

    Args:
        x: Row providing 'vote_count' and 'vote_average'.
        vote_count: Vote-count weight given to the global average.
        vote_average: Global average the score is pulled towards.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + vote_count
    # The movie's own average and the global average, each weighted by its share of the votes
    return (votes / total * rating) + (vote_count / total * vote_average)

def demographic_recommender(demo_movies = demo_movies, top_n = 3) : 
  """Return the titles of the best-scoring movies by weighted rating.

  Args:
    demo_movies: DataFrame with 'title', 'vote_count' and 'vote_average' columns.
    top_n: Number of titles to return (defaults to 3).

  Returns:
    list of up to ``top_n`` titles ordered by descending weighted rating.
  """
  # assign() builds a scored copy, so the caller's frame (by default the shared
  # module-level one) is not modified by the call.
  scored = demo_movies.assign(score=demo_movies.apply(weighted_rating, axis=1))
  ranked = scored.sort_values('score', ascending=False)
  return ranked['title'].head(top_n).tolist()

# Review Based Filtering

In [None]:
reviews = pd.read_csv('reviews.csv')
reviews = reviews.drop('id', axis = 1)

# Text cleaning

nltk.download('wordnet')
nltk.download('vader_lexicon')
print(reviews['sentiment'].value_counts())

# Lower-case each review, strip stop words, then remove punctuation. One apply()
# replaces the per-row chained assignment reviews['review'][i] = text, which
# raises SettingWithCopyWarning and may fail to write back.
punctuation_table = str.maketrans('', '', punctuation)
reviews['review'] = reviews['review'].astype(str).apply(
    lambda text: remove_stopwords(str(text).lower()).translate(punctuation_table)
)

# Remove the most common words
cnt = Counter(
    word
    for text in reviews['review'].values
    for word in text.split()
)


# The ten most frequent words across all reviews
freq = {word for word, _ in cnt.most_common(10)}
def removeFreqwords(text):
    """Return ``text`` without the words contained in ``freq``, re-joined with single spaces."""
    kept = (word for word in str(text).split() if word not in freq)
    return " ".join(kept)

reviews['review'] = reviews['review'].apply(removeFreqwords)
reviews.head()

# Lemmatization
lm = WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize every whitespace-separated word of ``text`` and re-join with single spaces."""
    return " ".join(map(lm.lemmatize, text.split()))

reviews['review'] = reviews['review'].apply(lemmatizer)

sia = SentimentIntensityAnalyzer()
reviews['scores'] = reviews['review'].apply(lambda review: sia.polarity_scores(review))

# Extract the 'compound' value of each polarity-score dict into its own column
reviews['compound'] = reviews['scores'].apply(lambda comp: comp['compound'])

# Re-label the sentiment from the polarity score: 1 when compound >= 0, else 0
reviews['comp_score'] = reviews['compound'].apply(lambda c: 1 if c >= 0 else 0)
# head must be called; printing the bare attribute shows the bound method instead of the first rows
print(reviews.head())
reviews = reviews.drop(['review','sentiment','scores','compound'],axis = 1)
print(reviews['comp_score'].value_counts())

In [None]:
def review_based_recommender(movie_name):
  """Summarise the share of positive reviews for ``movie_name``.

  Args:
    movie_name: Title matched exactly against ``reviews['title']``.

  Returns:
    str: "%<n> of reviews are positive", where n is the rounded percentage of
    rows with comp_score == 1, or "There is not any review for this movie"
    when the title has no scored reviews.
  """
  mov_values = list(reviews.loc[reviews['title'] == movie_name,'comp_score'].values)
  positive_count = mov_values.count(1)
  negative_count = mov_values.count(0)
  total_count = positive_count + negative_count
  # Without this guard a title with no reviews divides by zero.
  if total_count == 0:
    return "There is not any review for this movie"
  percent_positive = round((positive_count*100)/(total_count))
  return "%" + str(percent_positive) + " of reviews are positive"

x = review_based_recommender('The Lost World: Jurassic Park')
x  

# Hybrid Recommender

In [None]:
def final_recommendation(movie_name,user_id):
  """Build the hybrid recommendation table for a movie and a user.

  Content-based and demographic recommendations are merged, restricted to
  titles present in ``movies``, then annotated with a collaborative-filtering
  rating estimate and a review summary.

  Args:
    movie_name: Title used to seed the content-based recommender.
    user_id: User id forwarded to ``collaborative_filter``.

  Returns:
    DataFrame with columns 'Movie Name', 'Estimated Rating for User 2000' and
    'Reviews by Other Users', sorted by the rating column in descending order
    and limited to 10 rows. The rating column keeps its fixed label whatever
    ``user_id`` is passed.
  """
  content_rec = final_recommendation_content_based(movie_name)
  demo_rec = demographic_recommender()
  # dict.fromkeys removes duplicates while keeping a deterministic order.
  candidates = list(dict.fromkeys(content_rec + demo_rec))

  name_list = []
  id_list = []
  for name in candidates:
    # `movies` stores the metadata id under 'movieId' (its 'id' column is dropped
    # during data preparation); use the first row when a title occurs twice.
    matches = movies.loc[movies['title'] == name, 'movieId']
    if matches.empty:
      continue
    name_list.append(name)
    id_list.append(matches.iloc[0])

  collab_evalution = [collaborative_filter(int(movie_id), user_id) for movie_id in id_list]

  # Only summarise reviews for titles that actually have some.
  reviewed_titles = set(reviews['title'])
  review_list = [
      review_based_recommender(name) if name in reviewed_titles
      else "There is not any review for this movie"
      for name in name_list
  ]

  data = {'Movie Name' : name_list,
         'Estimated Rating for User 2000' : collab_evalution,
         'Reviews by Other Users' : review_list}
  final_frame = pd.DataFrame(data)
  final_frame = final_frame.sort_values(by=['Estimated Rating for User 2000'],ascending=False)
  return final_frame.head(10)

recommandation = final_recommendation("Interstellar",2000)
recommandation          