In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [28]:
ratings = pd.read_csv('ml-1m/ratings.dat',
                      sep="::",
                      names=["user_id", "movie_id", "rating"],
                      usecols=range(3),
                      engine = 'python'
                      )
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [29]:
movies = pd.read_csv('ml-1m/movies.dat',
                     sep="::",
                     names=["movie_id", "movie_title",],
                     usecols=range(2),
                     encoding='latin-1',
                     engine = 'python'
                     )
movies.head()

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [30]:
movie_ratings = pd.merge(ratings, movies)
movie_ratings.head(100)

Unnamed: 0,user_id,movie_id,rating,movie_title
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975)
...,...,...,...,...
95,329,1193,4,One Flew Over the Cuckoo's Nest (1975)
96,331,1193,4,One Flew Over the Cuckoo's Nest (1975)
97,332,1193,5,One Flew Over the Cuckoo's Nest (1975)
98,333,1193,3,One Flew Over the Cuckoo's Nest (1975)


In [31]:
round(movie_ratings.describe(), 2)

Unnamed: 0,user_id,movie_id,rating
count,1000209.0,1000209.0,1000209.0
mean,3024.51,1865.54,3.58
std,1728.41,1096.04,1.12
min,1.0,1.0,1.0
25%,1506.0,1030.0,3.0
50%,3070.0,1835.0,4.0
75%,4476.0,2770.0,4.0
max,6040.0,3952.0,5.0


In [32]:
ratings_matrix = movie_ratings.pivot_table(index=['movie_id'], columns=['user_id'], values='rating')
ratings_matrix.fillna(0, inplace=True)
ratings_matrix.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [33]:
movie_similarity = 1 - pairwise_distances(ratings_matrix.values, metric='cosine')
np.fill_diagonal(movie_similarity, 0)
ratings_matrix = pd.DataFrame(movie_similarity)
ratings_matrix.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,0.0,0.390349,0.267943,0.178789,0.256569,0.347373,0.30149,0.125709,0.10662,0.377459,...,0.099502,0.020966,0.084105,0.081826,0.045949,0.309676,0.186633,0.093479,0.042829,0.182691
1,0.390349,0.0,0.240946,0.155457,0.24997,0.244827,0.262772,0.196521,0.158469,0.3862,...,0.061819,0.015209,0.07531,0.095573,0.074271,0.21365,0.140781,0.087013,0.026063,0.122185
2,0.267943,0.240946,0.0,0.192788,0.30829,0.18702,0.29223,0.092122,0.128378,0.245601,...,0.038492,0.065507,0.049512,0.087377,0.050985,0.190575,0.104837,0.062258,0.010073,0.097786
3,0.178789,0.155457,0.192788,0.0,0.27199,0.12517,0.220024,0.049554,0.060334,0.133707,...,0.055486,0.0533,0.002227,0.025278,0.025204,0.118902,0.096318,0.022588,0.024769,0.095154
4,0.256569,0.24997,0.30829,0.27199,0.0,0.148114,0.305107,0.095512,0.138392,0.237681,...,0.026632,0.083898,0.046399,0.047542,0.016156,0.174554,0.092403,0.051633,0.01075,0.112835
5,0.347373,0.244827,0.18702,0.12517,0.148114,0.0,0.184966,0.055532,0.172145,0.418485,...,0.089106,0.025354,0.017274,0.112076,0.087213,0.236447,0.201419,0.115331,0.029136,0.222836
6,0.30149,0.262772,0.29223,0.220024,0.305107,0.184966,0.0,0.049023,0.083145,0.248029,...,0.066875,0.051497,0.037842,0.065268,0.051835,0.191689,0.11766,0.059262,0.036102,0.138879
7,0.125709,0.196521,0.092122,0.049554,0.095512,0.055532,0.049023,0.0,0.045263,0.107235,...,0.028519,0.072446,0.064868,0.059819,0.06635,0.090387,0.080523,0.084976,0.072141,0.045523
8,0.10662,0.158469,0.128378,0.060334,0.138392,0.172145,0.083145,0.045263,0.0,0.216823,...,0.046188,0.014033,0.020523,0.103986,0.049767,0.092347,0.099554,0.004956,0.0,0.057881
9,0.377459,0.3862,0.245601,0.133707,0.237681,0.418485,0.248029,0.107235,0.216823,0.0,...,0.072576,0.049577,0.04195,0.121969,0.090955,0.237227,0.136374,0.09717,0.018359,0.161396


In [36]:
def search_movie(keyword):
    try:
        print(movies[movies['movie_title'].str.contains(keyword)])

    except:
        print("見つかりません")


keyword = str(input("探したい映画の名前の一部を入力してください："))
search_movie(keyword)

探したい映画の名前の一部を入力してください：Back
見つかりません


In [35]:
try:
    movie_name = input("好きな映画を入力してください：")
    name = movies[movies['movie_title'] == movie_name].index.tolist()
    name = name[0]

    movies['similarity'] = ratings_matrix.iloc[name]
    movies.columns = ['movie_id', 'title', 'similarity']
    print("あなたの入力した映画に基づいたオススメの映画です", "\n", movies.sort_values(["similarity"], ascending=False)[0:5])

except:
    print("その映画はデータベースにありません。")

好きな映画を入力してください：Stand by Me (1986)
あなたの入力した映画に基づいたオススメの映画です 
       movie_id                                              title  similarity
3595      3664          Puppet Master 5: The Final Chapter (1994)    0.340636
1808      1877                                  Little Men (1998)    0.333555
214        216                               Billy Madison (1995)    0.328033
3467      3536                           Keeping the Faith (2000)    0.327584
318        321  Strawberry and Chocolate (Fresa y chocolate) (...    0.318108
