In [None]:
import warnings;
warnings.simplefilter('ignore')
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Load the movie metadata together with the credits, keywords and links
# tables, in that order.
data, credits, keywords, links = (
    pd.read_csv(csv_name)
    for csv_name in ('movies_metadata.csv', 'credits.csv', 'keywords.csv', 'links_small.csv')
)

In [None]:
# Preview the first three rows of the raw movie metadata.
data.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [None]:
# `genres` is stored as the string form of a list of dicts: parse it (missing
# values are treated as an empty list) and keep only each genre's name.
parsed_genres = data['genres'].fillna('[]').apply(literal_eval)
data['genres'] = parsed_genres.apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)

In [None]:
# Unparseable release dates are coerced to NaT, so their year ends up missing.
release_dates = pd.to_datetime(data['release_date'], errors='coerce')
data['year'] = release_dates.dt.year

In [None]:
# Keep only rows whose `id` is a digit string; anything else cannot be cast to
# int below. `astype(str)` lets the filter run even when `id` is already
# numeric (e.g. the cell is re-run) or missing, and `.copy()` makes the result
# an independent frame so the column assignment is not a write into a slice.
data = data[data['id'].astype(str).str.isnumeric()].copy()
data['id'] = data['id'].astype(int)

In [None]:
# Attach cast/crew and keywords by movie id (pandas merges are inner joins by
# default, so movies absent from either table are dropped). A repeated id in
# any of the tables multiplies rows in the joins, so keep one row per movie
# afterwards; otherwise the same film appears several times downstream.
data = (
    data
    .merge(credits, on='id')
    .merge(keywords, on='id')
    .drop_duplicates(subset='id')
)

In [None]:
# Reduce `links` to a Series of integer TMDB ids, skipping rows without one.
links = links.loc[links['tmdbId'].notna(), 'tmdbId'].astype(int)

In [None]:
# Preview the first three TMDB ids kept from the links table.
links.head(3)

Unnamed: 0,tmdbId
0,862
1,8844
2,15602


In [None]:
# Restrict the catalogue to movies whose id appears in `links`. `.copy()`
# detaches the subset from `data`, because the following cells add and
# overwrite columns on `df`; without it those writes target a slice of `data`.
# Note that the original index labels are kept, so they are not contiguous.
df = data[data['id'].isin(links)].copy()

In [None]:
# Preview the first three rows of the filtered frame.
df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,1995.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [None]:
# Parse the stringified cast lists into Python objects. Missing values are
# replaced with '[]' first (the same treatment `genres` gets), because
# literal_eval raises on non-string input such as NaN.
df['cast'] = df['cast'].fillna('[]').apply(literal_eval)
# Show a small preview rather than the whole column.
df['cast'].head(3)

Unnamed: 0,cast
0,"[{'cast_id': 14, 'character': 'Woody (voice)',..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '..."
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c..."
3,"[{'cast_id': 1, 'character': 'Savannah 'Vannah..."
4,"[{'cast_id': 1, 'character': 'George Banks', '..."
...,...
45985,"[{'cast_id': 1, 'character': 'Ash Ketchum (voi..."
46424,"[{'cast_id': 2, 'character': 'Ebba', 'credit_i..."
46425,"[{'cast_id': 2, 'character': 'Ebba', 'credit_i..."
46426,"[{'cast_id': 2, 'character': 'Ebba', 'credit_i..."


In [None]:
# Reduce every cast entry to the actor's name; a value that is not a list
# becomes an empty list.
df['cast'] = df['cast'].apply(
    lambda members: [member['name'] for member in members] if isinstance(members, list) else []
)
# Positional access to the first row: `df` keeps the index labels of `data`,
# so a label lookup with [0] only works while a row labelled 0 exists.
df['cast'].iloc[0]

['Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn',
 'John Ratzenberger',
 'Annie Potts',
 'John Morris',
 'Erik von Detten',
 'Laurie Metcalf',
 'R. Lee Ermey',
 'Sarah Freeman',
 'Penn Jillette']

In [None]:
# Keep at most the first five cast members and turn each name into a single
# lowercase token without spaces (slicing already copes with shorter lists).
df['cast'] = df['cast'].apply(
    lambda names: [name.replace(" ", '').lower() for name in names[:5]]
)
df['cast'].head(3)

Unnamed: 0,cast
0,"[tomhanks, timallen, donrickles, jimvarney, wa..."
1,"[robinwilliams, jonathanhyde, kirstendunst, br..."
2,"[waltermatthau, jacklemmon, ann-margret, sophi..."


In [None]:
# Parse the stringified crew lists into Python objects. Missing values are
# replaced with '[]' first (the same treatment `genres` gets), because
# literal_eval raises on non-string input such as NaN.
df['crew'] = df['crew'].fillna('[]').apply(literal_eval)

In [None]:
# Inspect the first crew entry of the first movie to see which keys are available.
df['crew'].iloc[0][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [None]:
def get_director(x):
  """Return the name of the first crew member whose job is 'Director'.

  Args:
    x: list (or tuple) of crew dicts with 'job' and 'name' keys.

  Returns:
    The director's name, or np.nan when `x` is not a list/tuple, contains
    no director, or the matching entry has no 'name'.
  """
  # A missing crew value (e.g. NaN) is not iterable; treat it as "no director".
  if not isinstance(x, (list, tuple)):
    return np.nan
  for member in x:
    # .get() tolerates entries that lack a 'job' key instead of raising KeyError.
    if isinstance(member, dict) and member.get('job') == 'Director':
      return member.get('name', np.nan)
  return np.nan


# One director per movie, looked up in that movie's crew list.
df['director'] = df['crew'].map(get_director)

In [None]:
# Preview the extracted director names.
df['director'].head(3)

Unnamed: 0,director
0,John Lasseter
1,Joe Johnston
2,Howard Deutch


In [None]:
# Normalise director names to one lowercase token without spaces. A missing
# director (NaN from get_director) becomes an empty string rather than the
# literal text 'nan', which would otherwise act as a shared "director" token
# for every movie without one.
df['director'] = (
    df['director']
    .fillna('')
    .astype('str')
    .str.replace(" ", "", regex=False)
    .str.lower()
)
df['director'].head(3)

Unnamed: 0,director
0,johnlasseter
1,joejohnston
2,howarddeutch


In [None]:
# Look at the raw keywords value of the first movie (still a string at this point).
df['keywords'].iloc[0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [None]:
# Parse the stringified keyword lists and keep only each keyword's name.
# Missing values are replaced with '[]' first (the same treatment `genres`
# gets), because literal_eval raises on non-string input such as NaN.
parsed_keywords = df['keywords'].fillna('[]').apply(literal_eval)
df['keywords'] = parsed_keywords.apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)
df['keywords'].head(3)

Unnamed: 0,keywords
0,"[jealousy, toy, boy, friendship, friends, riva..."
1,"[board game, disappearance, based on children'..."
2,"[fishing, best friend, duringcreditsstinger, o..."


In [None]:
# Count how often each keyword occurs across all movies and keep only the
# keywords used more than once. `explode` yields one row per keyword (NaN for
# an empty list, which value_counts ignores by default), so no per-row Series
# or wide NaN-padded frame has to be built.
s = df['keywords'].explode().rename('keyword').value_counts()
s = s[s > 1]

In [None]:
def filter_words(x):
  """Return the items of `x` that are contained in the module-level `s`.

  The relative order of the kept items is unchanged.
  """
  return [word for word in x if word in s]

# Drop the keywords that did not pass the frequency filter, then preview.
df['keywords'] = df['keywords'].map(filter_words)
df['keywords'].head(3)

Unnamed: 0,keywords
0,"[jealousy, toy, boy, friendship, friends, riva..."
1,"[board game, disappearance, based on children'..."
2,"[fishing, best friend, duringcreditsstinger]"


Многие ключевые слова считаются разными только потому, что стоят в разных грамматических формах. Чтобы привести их к общей основе, используем стеммер (stemmer).

In [None]:
# English Snowball stemmer instance used by stem_words below.
stemmer = SnowballStemmer('english')

def stem_words(x):
  """Return the stemmed form of every token in `x`.

  Args:
    x: iterable of tokens, stemmed with the module-level `stemmer`.

  Returns:
    list: one entry per input token, in the same order. A token the
    stemmer fails on is kept unchanged.
  """
  words = []
  for token in x:
    try:
      words.append(stemmer.stem(token))
    except Exception:
      # Best effort: keep the raw token. Catching Exception (not a bare
      # except) lets KeyboardInterrupt and SystemExit propagate.
      words.append(token)

  return words

# Stem each keyword first (multi-word keywords are passed to the stemmer as a
# whole), then squeeze out the spaces and lowercase the result.
stemmed_keywords = df['keywords'].apply(stem_words)
df['keywords'] = stemmed_keywords.apply(
    lambda tokens: [token.replace(" ", "").lower() for token in tokens]
)

In [None]:
# Preview the stemmed, space-free keywords.
df['keywords'].head(3)

Unnamed: 0,keywords
0,"[jealousi, toy, boy, friendship, friend, rival..."
1,"[boardgam, disappear, basedonchildren'sbook, n..."
2,"[fish, bestfriend, duringcreditssting]"


In [None]:
def concat(x):
  """Build the space-separated, lowercased "soup" string for one movie.

  Args:
    x: row-like mapping with list-valued 'keywords', 'cast' and 'genres'
      and a scalar 'director'.

  Returns:
    str: keywords, cast, director and genres, in that order.
  """
  features = []
  for column in ('keywords', 'cast'):
    features += x[column]
  features.append(x['director'])
  features += x['genres']
  return ' '.join(str(feature).lower() for feature in features)

# Row-wise: one soup string per movie.
df['soup'] = df.apply(concat, axis=1)

In [None]:
# Inspect the soup string of the first movie.
df['soup'].iloc[0]

'jealousi toy boy friendship friend rivalri boynextdoor newtoy toycomestolif tomhanks timallen donrickles jimvarney wallaceshawn johnlasseter animation comedy family'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise every movie's soup string with TF-IDF weights; words on
# scikit-learn's built-in English stop-word list are left out.
tfidf = TfidfVectorizer(stop_words='english')
# One row per row of df, in the same order as df['soup'].
tfidf_matrix = tfidf.fit_transform(df['soup'])


In [None]:
# Pairwise cosine similarity between all movies; with a single argument the
# matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Load the user ratings table and peek at its first row.
# NOTE(review): `ratings` is not referenced by any other visible cell; confirm it is still needed.
ratings = pd.read_csv('ratings_small.csv')
ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144


In [None]:
def recomend(title, df=df, cosine_sim=cosine_sim, top = 10):
  """Print the `top` movies most similar to `title`, best match first.

  Args:
    title: exact movie title to look up in df['title']; if several rows
      share the title, the first one is used.
    df: frame with a 'title' column whose row order matches cosine_sim.
    cosine_sim: square similarity matrix with one row/column per row of df.
    top: number of recommendations to print.

  Raises:
    IndexError: if no row of df has this title.
  """
  # cosine_sim is addressed by row *position*. df keeps the index labels of
  # the unfiltered frame, so a label must not be used as a position here.
  positions = np.flatnonzero((df['title'] == title).to_numpy())
  if positions.size == 0:
    raise IndexError(f"title {title!r} not found in df['title']")
  pos = positions[0]
  # Copy the row so that zeroing the movie's similarity to itself cannot
  # write through to the shared similarity matrix.
  scores = pd.Series(cosine_sim[pos], copy=True)
  scores[pos] = 0
  best = scores.sort_values(ascending=False).head(top).index
  for num, i in enumerate(best):
    print(f"{num + 1}. {df['title'].iloc[i]}")


In [None]:
# Print the default ten recommendations for 'Toy Story'.
recomend('Toy Story')

1. Toy Story of Terror!
2. Toy Story 2
3. Toy Story 3
4. Luxo Jr.
5. Big
6. Cars 2
7. Quest for Camelot
8. Cars
9. The Lego Movie
10. Pinocchio


In [None]:
# Print the default ten recommendations for 'My Neighbor Totoro'.
recomend('My Neighbor Totoro')

1. Shanghai Triad
2. The Story of Qiu Ju
3. Red Sorghum
4. To Live
5. Raise the Red Lantern
6. Up the Yangtze
7. Curse of the Golden Flower
8. What Women Want
9. A Personal Journey with Martin Scorsese Through American Movies
10. House of Flying Daggers
