# 추천 시스템 개발
## MovieLens Data
https://grouplens.org/datasets/movielens/100k/

In [1]:
import pandas as pd

## MovieLens data 불러오기

In [2]:
data = pd.read_csv('data/ml-100k/u.data', sep='\t', header=None)
info = pd.read_csv('data/ml-100k/u.info', header=None, sep=' ')
item = pd.read_csv('data/ml-100k/u.item',
                   engine='python',
                   sep='|',
                   header=None)
genre = pd.read_csv('data/ml-100k/u.genre',
                    engine='python',
                    sep='|',
                    header=None)
occupation = pd.read_csv('data/ml-100k/u.occupation',
                         engine='python',
                         header=None)
user = pd.read_csv('data/ml-100k/u.user', sep='|', header=None)

# Data Preprocessing

In [3]:
info = info.rename({1: 'table', 0: 'number'}, axis=1)

In [4]:
user.rename(
    {
        0: 'user_id',
        1: 'age',
        2: 'gender',
        3: 'occupation',
        4: 'zip_code'
    },
    axis=1,
    inplace=True)

In [5]:
item.rename(
    {
        0: "movie_id",
        1: "movie_title",
        2: "release date",
        4: "IMDb_URL",
        5: "unknown",
        6: "Action",
        7: "Action",
        8: "Adventure",
        9: "Animation",
        10: "Children's",
        11: "Comedy",
        12: "Crime",
        13: "Documentary",
        14: "Drama",
        15: "Fantasy",
        16: "Film_Noir",
        17: "Horror",
        18: "Musical",
        19: "Mystery",
        20: "Romance",
        21: "SF",
        22: "Thriller",
        23: "War",
        24: "Western"
    },
    axis=1,
    inplace=True)

In [6]:
data.rename({
    0: 'user_id',
    1: 'item_id',
    2: 'rating',
    3: 'timestamp'
},
            axis=1,
            inplace=True)

In [7]:
occupation.rename({0: 'occupation'}, axis=1, inplace=True)

In [8]:
import re

itemColumns = """
movie id    | movie title | release date | video release date |
IMDb URL    | unknown     | Action       | Adventure          | 
Animation   | Children's  | Comedy       | Crime              |
Documentary | Drama       | Fantasy      | Film-Noir          | 
Horror      | Musical     | Mystery      | Romance            | 
Sci-Fi      | Thriller    | War          | Western            |
"""
itemCols = {
    k: re.sub(r'[\s]', '_', col.strip())
    for k, col in enumerate(itemColumns.split('|'))
}

In [9]:
item.rename(itemCols, axis=1, inplace=True)
item.drop(columns='video_release_date', inplace=True)

In [10]:
data.merge(user)

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip_code
0,196,242,3,881250949,49,M,writer,55105
1,196,393,4,881251863,49,M,writer,55105
2,196,381,4,881251728,49,M,writer,55105
3,196,251,3,881251274,49,M,writer,55105
4,196,655,5,881251793,49,M,writer,55105
5,196,67,5,881252017,49,M,writer,55105
6,196,306,4,881251021,49,M,writer,55105
7,196,238,4,881251820,49,M,writer,55105
8,196,663,5,881251911,49,M,writer,55105
9,196,111,4,881251793,49,M,writer,55105


In [11]:
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')

In [12]:
data.drop(columns='timestamp', inplace=True)

# Movie Recommend System

In [13]:
data.set_index(['user_id', 'item_id']).unstack().fillna(0)

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
pd.crosstab(data.user_id, data.item_id, data.rating,
            aggfunc=lambda x: x).fillna(0)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
data.pivot_table('rating', 'user_id', 'item_id').fillna(0)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
data.pivot('user_id', 'item_id', 'rating').fillna(0)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
rating = data.pivot('user_id', 'item_id', 'rating').fillna(0)

In [18]:
data[['user_id']].corr()

Unnamed: 0,user_id
user_id,1.0


In [19]:
data.corr()

Unnamed: 0,user_id,item_id,rating
user_id,1.0,0.010377,-0.009371
item_id,0.010377,1.0,-0.189124
rating,-0.009371,-0.189124,1.0


In [20]:
rating_t = rating.transpose().corr()

In [21]:
rating_t.loc[42].sort_values(ascending=False).iloc[1:5]

user_id
577    0.513391
290    0.502035
864    0.501236
311    0.496347
Name: 42, dtype: float64

In [22]:
def recommendation_user(user_id, k):
    return rating_t.loc[user_id].sort_values(ascending=False).iloc[1:k]

In [23]:
recommendation_user(42, 5)

user_id
577    0.513391
290    0.502035
864    0.501236
311    0.496347
Name: 42, dtype: float64

In [24]:
rating_t

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.106322,-0.016424,0.021716,0.302592,0.345976,0.320017,0.276208,0.038812,0.288491,...,0.285923,0.068878,0.189272,0.146979,0.119444,0.060175,0.247483,0.115504,0.108984,0.323231
2,0.106322,1.000000,0.082680,0.160323,0.020218,0.197844,0.022886,0.072772,0.143716,0.106861,...,0.106416,0.288550,0.325233,0.408466,0.289112,0.204094,0.190848,0.144177,0.138292,0.054235
3,-0.016424,0.082680,1.000000,0.331674,-0.028322,0.017673,-0.013384,0.055385,0.043505,0.012080,...,-0.020605,0.018746,0.124174,0.046247,0.088974,-0.001145,0.127539,0.084532,0.102025,-0.023601
4,0.021716,0.160323,0.331674,1.000000,-0.002426,0.030206,0.041056,0.171116,0.089372,0.024018,...,0.016641,0.019738,0.105135,0.179598,0.122649,0.010890,0.174904,0.140811,0.149575,0.025367
5,0.302592,0.020218,-0.028322,-0.002426,1.000000,0.153409,0.279677,0.209280,0.025496,0.117166,...,0.272866,0.039333,0.013999,0.039519,0.085548,0.024292,0.182466,0.112571,0.096603,0.248991
6,0.345976,0.197844,0.017673,0.030206,0.153409,1.000000,0.396173,0.152073,0.156114,0.495458,...,0.313191,0.065787,0.104422,0.188816,0.062601,0.059487,0.295949,0.113858,0.266345,0.194925
7,0.320017,0.022886,-0.013384,0.041056,0.279677,0.396173,1.000000,0.231497,0.105950,0.398928,...,0.369051,0.050326,-0.024422,0.056375,0.048249,0.032591,0.251988,0.008168,0.210390,0.301997
8,0.276208,0.072772,0.055385,0.171116,0.209280,0.152073,0.231497,1.000000,0.067515,0.188152,...,0.196475,0.042266,0.046584,0.072511,0.133390,0.068270,0.213408,0.129096,0.142832,0.262813
9,0.038812,0.143716,0.043505,0.089372,0.025496,0.156114,0.105950,0.067515,1.000000,0.172100,...,0.050186,0.032568,0.137980,0.117132,0.095013,0.021669,0.096886,0.132400,0.070695,0.044727
10,0.288491,0.106861,0.012080,0.024018,0.117166,0.495458,0.398928,0.188152,0.172100,1.000000,...,0.278060,0.027112,0.093566,0.108955,0.025289,0.018157,0.287583,0.057182,0.154295,0.138188


In [25]:
rating = data.pivot('item_id', 'user_id', 'rating').fillna(0)

---

# module화

In [26]:
item_corr = rating.T.corr()

In [27]:
def recommendation_item(item_id, k):
    return item_corr.loc[item_id].sort_values(ascending=False).iloc[1:k]

In [28]:
recommendation_item(42, 5)

item_id
721    0.486474
56     0.434368
92     0.417250
433    0.411974
Name: 42, dtype: float64

---

In [29]:
data[data.user_id == 42].item_id.sort_values(ascending=False)

7120     1051
74673    1050
17820    1049
70653    1048
20791    1047
         ... 
6844       25
21019      15
8184       12
84817       2
20548       1
Name: item_id, Length: 183, dtype: int64

In [30]:
set(data[data.user_id == 914].item_id) - set(data[data.user_id == 42])

{88,
 111,
 155,
 197,
 216,
 313,
 371,
 381,
 387,
 402,
 451,
 643,
 692,
 724,
 732,
 736,
 739,
 775,
 778,
 781,
 1259,
 1355,
 1406}

In [31]:
t = set(data[data.user_id == 914].item_id) - set(data[data.user_id == 42])

In [32]:
data[(data.item_id.isin(t)) & (data.rating == 5)].sort_values('rating',
                                                              ascending=False)

Unnamed: 0,user_id,item_id,rating
16,122,387,5
64262,640,313,5
64725,152,724,5
64751,862,111,5
64781,642,739,5
64932,878,736,5
64994,416,451,5
65164,514,111,5
65167,907,724,5
65218,823,216,5


In [33]:
def recommendation_user(user_id, k, n):
    s = rating_t.loc[user_id].sort_values(ascending=False)[1:k + 1].index
    temp = pd.DataFrame(columns=['user_id', 'item_id', 'rating'])
    for j, i in enumerate(s):
        t = set(data[data.user_id == i].item_id) - set(
            data[data.user_id == user_id].item_id)
        u = data[(data.item_id.isin(t)) & (data.rating == 5) &
                 (data.user_id == i)]
        u.rating = u.rating * (1 - 0.05 * j)
        temp = pd.concat([temp, u])
    else:
        # return temp.drop_duplicates(keep=False).sort_values('item_id')
        return temp.groupby(['item_id'
                             ]).mean().sort_values('rating',
                                                   ascending=False)[:n]

In [34]:
# user_id: 사용자
# k:       자신과 비슷한 사용자의 수
# n:       영화 추천 수

def recommendation_user(user_id, k, n):
    s = rating_t.loc[user_id].sort_values(ascending=False)[1:k + 1].index
    temp = pd.DataFrame(columns=['user_id', 'item_id', 'rating'])
    for j, i in enumerate(s):
        t = set(data[data.user_id == i].item_id) - set(
            data[data.user_id == user_id].item_id)
        u = data[(data.item_id.isin(t)) & (data.rating == 5) &
                 (data.user_id == i)]
        u.rating = u.rating * (1 - 0.05 * j)
        temp = pd.concat([temp, u])
    else:
        temp.rename({'item_id': 'movie_id'}, axis=1, inplace=True)
        temp = temp.merge(item[['movie_id', 'movie_title']])
        temp.drop_duplicates(inplace=True)
        temp = temp.groupby('movie_title')
        temp = temp.mean()
        temp = temp.sort_values('rating', ascending=False)
        return list(temp[:n].index)

In [35]:
result = recommendation_user(1, 5, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [36]:
result

['Wings of Desire (1987)',
 "William Shakespeare's Romeo and Juliet (1996)",
 'Bob Roberts (1992)',
 'Stealing Beauty (1996)',
 'City of Lost Children, The (1995)',
 'Lawrence of Arabia (1962)',
 'True Lies (1994)',
 'Titanic (1997)',
 'Rear Window (1954)',
 'My Left Foot (1989)']

# 참고
https://github.com/vinta/awesome-python