In [24]:
''' IMPLEMENTATION OF:
- Collaborative filtering
- Content-based filtering
- Similiarity filtering (personalized)
- Multicriteria '''

' IMPLEMENTATION OF:\n- Collaborative filtering\n- Content-based filtering\n- Similiarity filtering (personalized)\n- Multicriteria '

In [25]:
import pandas as pd
import numpy as np

In [26]:
# Directory that holds the IMDb CSV files (path as used in the original run).
path = '/content/drive/MyDrive/imdb_dataset/'

# Read the four tables in order; each file name is appended to `path`.
movies, names, ratings, title_principals = (
    pd.read_csv(f'{path}{csv_name}')
    for csv_name in (
        'IMDb movies.csv',
        'IMDb names.csv',
        'IMDb ratings.csv',
        'IMDb title_principals.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [27]:
# List the column names of the movies table.
movies.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

# Collaborative filtering

Because the dataset above does not contain individual users' viewing or rating histories, we cannot implement collaborative filtering.

In [None]:
# Display the ratings table read from 'IMDb ratings.csv'.
ratings

NameError: ignored

# Content-based filtering

In [29]:
# Column names of the movies table, listed again before columns are dropped.
movies.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

### 1. Remove redundant columns

In [30]:
# Columns treated as redundant for this analysis.
redundant_columns = [
    'writer',
    'language',
    'reviews_from_users',
    'reviews_from_critics',
    'budget',
    'usa_gross_income',
    'worlwide_gross_income',
    'date_published',
    'production_company',
    'votes',
    'metascore',
    'imdb_title_id',
    'original_title',
]
movies = movies.drop(columns=redundant_columns)

### 2. Remove rows containing NaN to avoid problems when processing the data

In [93]:
# Discard every row that has a missing value in at least one column.
movies = movies.dropna(axis=0, how='any')

In [92]:
#movies = movies[movies.director.notna()]
#movies = movies[movies.country.notna()]
#movies = movies[movies.description.notna()]
#movies = movies[movies.actors.notna()]
#movies = movies[movies.description.notna()]

### 3. We'll use only one of the actors (main one) in order to simplify the data and reduce dimensionality

In [35]:
def _lead_actor_token(cast):
    """Return the first comma-separated name in `cast` with all whitespace removed."""
    first_name = cast.split(',')[0]
    return ''.join(first_name.split())


movies['actors'] = movies['actors'].apply(_lead_actor_token)

### 4. Encode text fields like genre, director, country and actor(s) into numerical values


In [36]:
from sklearn.preprocessing import LabelEncoder

to_be_encoded_columns = ['genre', 'director', 'country', 'actors']

# Keep one fitted encoder per column. Re-fitting a single shared encoder on
# every column would overwrite the string <-> code mapping of the previous
# columns, so the integer codes could no longer be translated back with
# `label_encoders[col].inverse_transform(...)`.
label_encoders = {}

for col in to_be_encoded_columns:
  le = LabelEncoder()
  le.fit(movies[col])
  # Replace the text column by its integer codes.
  movies[col] = le.transform(movies[col])
  label_encoders[col] = le

### 5. In order to extract the essence of the description, we'll use rake-nltk to get the top 3 most important words from the description. To vectorize the words, we'll use CountVectorizer() from scikit-learn.

In [37]:
# Install rake-nltk, which provides the `Rake` keyword extractor imported below.
!pip install rake-nltk



In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer that is later fitted on the per-movie keyword summaries.
vectorizer = CountVectorizer()

In [41]:
from rake_nltk import Rake
import string
r = Rake()

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def get_most_important_words(x):
  """Condense a description into at most three keyword words.

  Punctuation is stripped from `x`, RAKE ranks the phrases of the remaining
  text, the three best-ranked phrases are put in alphabetical order and the
  first three words of their concatenation are returned as a single
  space-separated string.

  NOTE(review): sorting alphabetically before truncating means the words that
  are kept are not necessarily those of the highest-ranked phrase - confirm
  this is intended.
  """
  cleaned = x.translate(_PUNCTUATION_TABLE)
  r.extract_keywords_from_text(cleaned)
  top_phrases = sorted(r.get_ranked_phrases()[:3])
  words = " ".join(top_phrases).split()
  return " ".join(words[:3])


# One short keyword summary per movie, then a bag-of-words matrix over them.
summaries = movies.description.apply(get_most_important_words)
X = vectorizer.fit_transform(summaries)

In [None]:
length = len(movies.description)

# For every movie, collect the vocabulary indices of the keywords that occur in
# its summary. The indices are read straight from the CSR structure of X rather
# than converting each row to a dense, vocabulary-sized array.
X_csr = X.tocsr(copy=True)   # work on a copy so X itself is left untouched
X_csr.eliminate_zeros()      # only truly non-zero entries, as np.nonzero reports
X_csr.sort_indices()         # ascending column order, as np.nonzero reports

keyword_numbers = [
  X_csr.indices[X_csr.indptr[i]:X_csr.indptr[i + 1]].tolist()
  for i in range(length)
]

### 7. We'll create a new feature for our data set by taking the average of the vectorized words. We do this because the lower the dimensionality, the better our algorithm will perform.

In [75]:
# `normalize` is imported here because this cell uses it before the later
# cells that import it have run on a fresh kernel.
from sklearn.preprocessing import normalize

# Average vocabulary index of each movie's keywords, divided by 10000;
# movies without any keyword get 0.
keyword_numbers_column = [sum(x)/(len(x)*10000) if len(x) != 0 else 0 for x in keyword_numbers]
# The list is wrapped in another list so that `normalize` treats the whole
# column as one row and rescales it to unit L2 norm.
normalized_keyword_numbers_column = normalize([keyword_numbers_column])
movies['keyword_weight'] = normalized_keyword_numbers_column[0]

### 8. We'll normalize fields like year, duration and average vote in order to improve the KMeans algorithm

In [47]:
# remove defected year fields
from sklearn.preprocessing import normalize

# Collect the index labels of the rows whose `year` cannot be parsed as an
# integer, then drop those rows. `Series.items()` is used because
# `Series.iteritems()` no longer exists in pandas 2.x.
error_entries = []
for label, year in movies.year.items():
  try:
    int(year)
  except (TypeError, ValueError, OverflowError):
    # int() rejects this value, so the row is marked for removal.
    error_entries.append(label)
movies = movies.drop(error_entries, axis=0)

In [48]:
# Cast the year column to floating point.
movies['year'] = movies['year'].astype(float)

In [49]:
from sklearn.preprocessing import normalize

# Each column is wrapped in a list so that `normalize` sees it as a single row
# and rescales the whole column to unit L2 norm. The Series as it was before
# scaling is kept under a `not_normalized_*` name.
# NOTE(review): this makes every value very small (about 1e-3 in the table
# shown further down) next to the label-encoded integer columns - confirm this
# is the scaling intended for KMeans.

# year
not_normalized_years = movies['year']
normalized_years = normalize([not_normalized_years])
movies['year'] = normalized_years[0]

# duration
not_normalized_duration = movies['duration']
normalized_duration = normalize([not_normalized_duration])
movies['duration'] = normalized_duration[0]

# average vote
not_normalized_avg_vote = movies['avg_vote']
normalized_avg_vote = normalize([not_normalized_avg_vote])
movies['avg_vote'] = normalized_avg_vote[0]



### 9. Finally, we'll create our dataframe, which is ready to be fed to the algorithm, and run KMeans on it.

In [76]:
# Columns used as clustering features.
feature_columns = ['year', 'genre', 'duration', 'actors', 'avg_vote', 'keyword_weight']
kmeans_df = movies[feature_columns]
kmeans_df

Unnamed: 0,year,genre,duration,actors,avg_vote,keyword_weight
0,0.003287,1141,0.001512,4285,0.003384,0.000879
1,0.003308,418,0.002352,9482,0.003498,0.003183
2,0.003316,695,0.001781,3217,0.003326,0.002711
3,0.003318,763,0.003360,12805,0.002982,0.004176
4,0.003316,223,0.002285,29716,0.004015,0.001632
...,...,...,...,...,...,...
85848,0.003504,695,0.003730,19219,0.005047,0.005686
85849,0.003504,490,0.004100,34645,0.003900,0.003167
85850,0.003506,462,0.003192,7520,0.003040,0.004369
85851,0.003506,490,0.003461,12967,0.004416,0.004826


In [77]:
from sklearn.cluster import KMeans

# Configure the estimator (3000 clusters, fixed seed), then fit it on the
# feature frame; `fit` returns the estimator itself.
kmeans = KMeans(n_clusters=3000, random_state=0)
kmeans = kmeans.fit(kmeans_df)


### 10. It's time to test our algorithm

In [87]:
# Look up the cluster of the movie with index label 310. Selecting with a list
# (`.loc[[310]]`) yields a one-row DataFrame, so the model receives the same
# column names it was fitted with instead of an unnamed list of values.
prediction = kmeans.predict(kmeans_df.loc[[310]])

In [None]:
# Row positions (0-based, in fitting order) of all movies assigned to the
# predicted cluster.
target_cluster = prediction[0]
results = [position for position, label in enumerate(kmeans.labels_) if label == target_cluster]
results

In [None]:
# `results` holds row positions, not index labels, and the index of `movies`
# has gaps after rows were dropped earlier, so the lookup must be positional
# (`.iloc`); a label lookup would return a different movie or raise KeyError.
movies.iloc[results[1]]

### 11. We'll save our model in case we want to use it again.

In [85]:
import pickle
filename = 'drive/MyDrive/kmeans_movie_recommender.sav'
# Write the fitted model inside a context manager so the file is flushed and
# closed as soon as the dump finishes.
with open(filename, 'wb') as model_file:
  pickle.dump(kmeans, model_file)

# Loading the model back (only load files from a trusted source):
#import joblib
#loaded_model = joblib.load(filename)

In [96]:
# Display the ratings table.
ratings

Unnamed: 0,imdb_title_id,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,votes_6,votes_5,votes_4,votes_3,votes_2,votes_1,allgenders_0age_avg_vote,allgenders_0age_votes,allgenders_18age_avg_vote,allgenders_18age_votes,allgenders_30age_avg_vote,allgenders_30age_votes,allgenders_45age_avg_vote,allgenders_45age_votes,males_allages_avg_vote,males_allages_votes,males_0age_avg_vote,males_0age_votes,males_18age_avg_vote,males_18age_votes,males_30age_avg_vote,males_30age_votes,males_45age_avg_vote,males_45age_votes,females_allages_avg_vote,females_allages_votes,females_0age_avg_vote,females_0age_votes,females_18age_avg_vote,females_18age_votes,females_30age_avg_vote,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes
0,tt0000009,5.9,154,5.9,6.0,12,4,10,43,28,28,9,1,5,14,7.2,4.0,6.0,38.0,5.7,50.0,6.6,35.0,6.2,97.0,7.0,1.0,5.9,24.0,5.6,36.0,6.7,31.0,6.0,35.0,7.3,3.0,5.9,14.0,5.7,13.0,4.5,4.0,5.7,34.0,6.4,51.0,6.0,70.0
1,tt0000574,6.1,589,6.3,6.0,57,18,58,137,139,103,28,20,13,16,6.0,1.0,6.1,114.0,6.0,239.0,6.3,115.0,6.1,425.0,6.0,1.0,6.2,102.0,6.0,210.0,6.2,100.0,6.2,50.0,,,5.9,12.0,6.2,23.0,6.6,14.0,6.4,66.0,6.0,96.0,6.2,331.0
2,tt0001892,5.8,188,6.0,6.0,6,6,17,44,52,32,16,5,6,4,,,5.5,25.0,5.8,72.0,6.2,62.0,5.9,146.0,,,5.5,21.0,5.9,67.0,6.2,55.0,5.7,15.0,,,5.8,4.0,5.8,4.0,6.8,7.0,5.4,32.0,6.2,31.0,5.9,123.0
3,tt0002101,5.2,446,5.3,5.0,15,8,16,62,98,117,63,26,25,16,,,5.3,23.0,5.0,111.0,5.3,193.0,5.1,299.0,,,5.2,20.0,4.9,96.0,5.2,171.0,5.9,39.0,,,5.7,3.0,5.5,14.0,6.1,21.0,4.9,57.0,5.5,207.0,4.7,105.0
4,tt0002130,7.0,2237,6.9,7.0,210,225,436,641,344,169,66,39,20,87,7.5,4.0,7.0,402.0,7.0,895.0,7.1,482.0,7.0,1607.0,8.0,2.0,7.0,346.0,7.0,804.0,7.0,396.0,7.2,215.0,7.0,2.0,7.0,52.0,7.3,82.0,7.4,77.0,6.9,139.0,7.0,488.0,7.0,1166.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85850,tt9908390,5.3,398,5.5,6.0,13,9,26,65,104,86,32,28,15,20,6.0,1.0,5.8,42.0,5.3,137.0,5.2,101.0,5.3,267.0,6.0,1.0,5.6,28.0,5.3,121.0,5.2,97.0,6.0,26.0,,,6.2,11.0,5.7,11.0,5.0,2.0,5.5,12.0,6.3,22.0,5.3,214.0
85851,tt9911196,7.7,724,7.9,8.0,65,139,288,170,42,7,5,2,2,4,,,7.5,105.0,7.7,207.0,7.9,194.0,7.8,412.0,,,7.6,70.0,7.6,150.0,7.9,161.0,7.6,109.0,,,7.4,29.0,8.0,47.0,7.3,30.0,7.0,6.0,6.8,13.0,7.7,388.0
85852,tt9911774,7.9,265,7.8,8.0,63,29,61,61,31,5,5,6,1,3,,,1.0,1.0,10.0,1.0,3.0,1.0,6.0,4.0,,,1.0,1.0,10.0,1.0,3.0,1.0,,,,,,,,,,,1.0,1.0,,,2.0,2.0
85853,tt9914286,6.4,194,9.4,10.0,176,0,2,2,1,0,1,1,0,11,,,6.5,2.0,1.8,5.0,3.0,3.0,3.1,10.0,,,6.5,2.0,1.8,5.0,1.0,2.0,4.0,2.0,,,,,,,7.0,1.0,4.0,3.0,1.7,5.0,5.8,5.0
