In [52]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

# データの読み込み

In [53]:
# Read the tab-separated ratings file, keeping only its first three columns
# and labelling them user_id / movie_id / rating.
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep="\t",
    header=None,
    names=["user_id", "movie_id", "rating"],
    usecols=[0, 1, 2],
    encoding='latin-1',
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [54]:
# Read the pipe-separated movie catalogue, keeping only its first three
# columns and labelling them movie_id / movie_title / release_date.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep="|",
    header=None,
    names=["movie_id", "movie_title", "release_date"],
    usecols=[0, 1, 2],
    encoding='latin-1',
)
movies.head()

Unnamed: 0,movie_id,movie_title,release_date
0,1,Toy Story (1995),01-Jan-1995
1,2,GoldenEye (1995),01-Jan-1995
2,3,Four Rooms (1995),01-Jan-1995
3,4,Get Shorty (1995),01-Jan-1995
4,5,Copycat (1995),01-Jan-1995


# データの結合

In [55]:
# Attach the title and release date to every rating row by joining the two
# tables on movie_id, the only column they share (inner join).
movie_ratings = ratings.merge(movies, on="movie_id", how="inner")
movie_ratings.head(10)

Unnamed: 0,user_id,movie_id,rating,movie_title,release_date
0,196,242,3,Kolya (1996),24-Jan-1997
1,63,242,3,Kolya (1996),24-Jan-1997
2,226,242,5,Kolya (1996),24-Jan-1997
3,154,242,3,Kolya (1996),24-Jan-1997
4,306,242,5,Kolya (1996),24-Jan-1997
5,296,242,4,Kolya (1996),24-Jan-1997
6,34,242,5,Kolya (1996),24-Jan-1997
7,271,242,4,Kolya (1996),24-Jan-1997
8,201,242,4,Kolya (1996),24-Jan-1997
9,209,242,4,Kolya (1996),24-Jan-1997


# 基礎統計量

In [56]:
# Summary statistics of the merged table, rounded to two decimal places.
movie_ratings.describe().round(2)

Unnamed: 0,user_id,movie_id,rating
count,100000.0,100000.0,100000.0
mean,462.48,425.53,3.53
std,266.61,330.8,1.13
min,1.0,1.0,1.0
25%,254.0,175.0,3.0
50%,447.0,322.0,4.0
75%,682.0,631.0,4.0
max,943.0,1682.0,5.0


# 欠損値の確認

In [57]:
# Number of missing values in each column of the merged table.
movie_ratings.isna().sum()

user_id         0
movie_id        0
rating          0
movie_title     0
release_date    9
dtype: int64

# ピボットテーブルの作成

In [58]:
# Movie-by-user rating table: one row per movie_id, one column per user_id.
# A (movie, user) pair without a rating is stored as 0.
ratings_matrix = ratings.pivot_table(
    index="movie_id",
    columns="user_id",
    values="rating",
).fillna(0)
ratings_matrix.head(100)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
2,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,5.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0
97,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0
98,4.0,0.0,0.0,0.0,3.0,5.0,4.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0
99,3.0,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,5.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


# コサイン類似度

In [59]:
# Movie-to-movie cosine similarity, computed between the rows of the rating
# table (similarity = 1 - cosine distance).
cosine_distance = pairwise_distances(ratings_matrix.values, metric='cosine')
movie_similarity = 1 - cosine_distance

# Zero the diagonal so a movie's similarity to itself does not dominate when
# the values are ranked later.
np.fill_diagonal(movie_similarity, 0)

# NOTE(review): this rebinds ``ratings_matrix`` from the rating table to the
# similarity table (with a default 0..n-1 index), so re-running this cell
# alone would compute similarities of similarities. The name is kept because
# later cells read the similarity table through it.
ratings_matrix = pd.DataFrame(movie_similarity)
ratings_matrix.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.0,0.0,0.0,0.035387,0.0,0.0,0.0,0.047183,0.047183
1,0.402382,0.0,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078299,0.078299
2,0.330245,0.273069,0.0,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.0,0.0,0.0,0.0,0.032292,0.0,0.0,0.0,0.0,0.096875
3,0.454938,0.502571,0.324866,0.0,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.0,0.0,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
4,0.286714,0.318836,0.212957,0.334239,0.0,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094211
5,0.116344,0.083563,0.106722,0.090308,0.037299,0.0,0.139617,0.083876,0.151064,0.203097,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.620979,0.383403,0.372921,0.489283,0.334769,0.139617,0.0,0.423515,0.527462,0.318623,...,0.0,0.051498,0.0,0.0,0.051498,0.0,0.0,0.0,0.051498,0.051498
7,0.481114,0.337002,0.200794,0.490236,0.259161,0.083876,0.423515,0.0,0.424429,0.267764,...,0.0,0.082033,0.065627,0.065627,0.082033,0.0,0.0,0.0,0.082033,0.0
8,0.496288,0.255252,0.273669,0.419044,0.272448,0.151064,0.527462,0.424429,0.0,0.288514,...,0.0,0.0,0.05736,0.05736,0.0717,0.0,0.0,0.0,0.05736,0.0717
9,0.273935,0.171082,0.158104,0.252561,0.055453,0.203097,0.318623,0.267764,0.288514,0.0,...,0.0,0.0,0.080264,0.080264,0.0,0.0,0.0,0.0,0.0,0.0


# 名前検索

In [61]:
def searchMovie(keyword, catalog=None):
    """Print every movie whose title contains ``keyword``.

    The keyword is matched as literal text (not as a regular expression) and
    without regard to case, so input such as "(1994)", "(" or "forrest"
    behaves the way a user typing part of a title would expect. Rows whose
    title is missing never match.

    Parameters
    ----------
    keyword : str
        Fragment of a movie title to look for. Surrounding whitespace is
        ignored.
    catalog : pandas.DataFrame, optional
        Table to search. Defaults to the notebook-level ``movies`` frame.

    Returns
    -------
    None
        Matching rows are printed. When the keyword is blank or nothing
        matches, "見つかりません" is printed instead.
    """
    if catalog is None:
        catalog = movies

    # Another cell in this notebook may have renamed 'movie_title' to 'title'
    # on the shared frame; accept either spelling rather than failing.
    title_column = 'movie_title' if 'movie_title' in catalog.columns else 'title'

    needle = "" if keyword is None else str(keyword).strip()
    if needle:
        mask = catalog[title_column].str.contains(
            needle, case=False, regex=False, na=False
        )
        hits = catalog[mask]
    else:
        # An empty pattern would match every row; treat it as "no result".
        hits = catalog.iloc[0:0]

    if hits.empty:
        print("見つかりません")
    else:
        print(hits)


# Ask for a title fragment (input() already returns a str) and list matches.
keyword = input("探したい映画の名前の一部を入力してください：")
searchMovie(keyword)

探したい映画の名前の一部を入力してください：Forrest
    movie_id          movie_title release_date
68        69  Forrest Gump (1994)  01-Jan-1994


# レコメンドシステム

In [62]:
# Recommend the five movies most similar to a title entered by the user.
movie_name = input("好きな映画を入力してください：")

# Row positions of exact title matches; ``iloc`` below needs positions, not
# index labels.
match_positions = np.flatnonzero((movies['movie_title'] == movie_name).to_numpy())

if match_positions.size == 0:
    # Only an unknown title produces this message. Any other failure is left
    # to raise so that it is not mistaken for "not found".
    print("その映画はデータベースにありません。")
else:
    # NOTE(review): assumes row i of the similarity table and row i of
    # ``movies`` describe the same movie -- confirm the two orders match.
    similarity = ratings_matrix.iloc[match_positions[0]]

    # Build a separate table instead of adding/renaming columns on ``movies``,
    # so this cell can be re-run and other cells still find 'movie_title'.
    recommendations = (
        movies
        .assign(similarity=similarity)
        .rename(columns={'movie_title': 'title'})
        .sort_values("similarity", ascending=False)
        .head(5)
    )
    print("あなたの入力した映画に基づいたオススメの映画です", "\n", recommendations)

好きな映画を入力してください：Forrest Gump (1994)
あなたの入力した映画に基づいたオススメの映画です 
      movie_id                              title release_date  similarity
422       423  E.T. the Extra-Terrestrial (1982)  01-Jan-1982    0.742996
27         28                   Apollo 13 (1995)  01-Jan-1995    0.737262
173       174     Raiders of the Lost Ark (1981)  01-Jan-1981    0.736637
96         97          Dances with Wolves (1990)  01-Jan-1990    0.718935
203       204          Back to the Future (1985)  01-Jan-1985    0.718532
