In [589]:

import math
import numpy as np
from numpy import linalg as LA
from scipy.sparse import coo_matrix
import pandas as pd
from IPython.display import HTML, display

In [590]:
np.set_printoptions(precision=5)
pd.set_option('display.precision', 5)

In [591]:

movies = pd.read_csv('../src/movies_w_imgurl.csv')

In [592]:

movieIdToIndex = {}
indexToMovieId = {}
colIdx = 0
for movieId in movies.movieId:
    movieIdToIndex[movieId] = colIdx
    indexToMovieId[colIdx] = movieId
    colIdx += 1

In [593]:
ratings = pd.read_csv('../src/ratings-9_1.csv')
train = ratings[ratings['type'] == 'train'][['userId', 'movieId', 'rating']]
test = ratings[ratings['type'] == 'test'][['userId', 'movieId', 'rating']]

In [594]:
userIdToIndex = {}
indexToUserId = {}
rowIdx = 0
for userId in ratings.userId.unique():
    userIdToIndex[userId] = rowIdx
    indexToUserId[rowIdx] = userId
    rowIdx += 1

In [595]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp,type
0,1,31,2.5,1260759144,train
1,1,1029,3.0,1260759179,test
2,1,1061,3.0,1260759182,train
3,1,1129,2.0,1260759185,train
4,1,1172,4.0,1260759205,train
5,1,1263,2.0,1260759151,test
6,1,1287,2.0,1260759187,train
7,1,1293,2.0,1260759148,train
8,1,1339,3.5,1260759125,train
9,1,1343,2.0,1260759131,train


In [596]:
rows = []
cols = []
vals = []
for row in ratings.itertuples():
    rows.append(userIdToIndex[row.userId])
    cols.append(movieIdToIndex[row.movieId])
    vals.append(row.rating)
coomat = coo_matrix((vals, (rows, cols)), shape=(rowIdx, colIdx))

In [597]:
norms = LA.norm(coomat.toarray(), ord=2, axis=1)

In [598]:
np.seterr(divide='ignore', invalid='ignore')
normmat = np.divide(coomat.transpose().toarray(), norms).T


In [599]:
np.nan_to_num(normmat)

array([[ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       ..., 
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.17997,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.11684,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ]])

In [600]:
sims = pd.DataFrame(data=np.matmul(normmat, normmat.T), index=ratings.userId.unique(), columns=ratings.userId.unique())

In [601]:
sims

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
1,1.00000,0.00000,0.00000,0.07448,0.01682,0.00000,0.08388,0.00000,0.01284,0.00000,...,0.00000,0.00000,0.01447,0.04372,0.00000,0.00000,0.00000,0.06292,0.00000,0.01747
2,0.00000,1.00000,0.12429,0.11882,0.10365,0.00000,0.21299,0.11319,0.11333,0.04321,...,0.47731,0.06320,0.07774,0.16416,0.46628,0.42546,0.08465,0.02414,0.17059,0.11318
3,0.00000,0.12429,1.00000,0.08164,0.15153,0.06069,0.15471,0.24978,0.13447,0.11467,...,0.16121,0.06420,0.17613,0.15836,0.17710,0.12456,0.12491,0.08098,0.13661,0.17019
4,0.07448,0.11882,0.08164,1.00000,0.13065,0.07965,0.31975,0.19101,0.03042,0.13719,...,0.11432,0.04723,0.13658,0.25403,0.12191,0.08873,0.06848,0.10431,0.05451,0.21161
5,0.01682,0.10365,0.15153,0.13065,1.00000,0.06380,0.09589,0.16571,0.08662,0.03237,...,0.19103,0.02114,0.14617,0.22424,0.13972,0.05825,0.04293,0.03836,0.06264,0.22509
6,0.00000,0.00000,0.06069,0.07965,0.06380,1.00000,0.00000,0.12850,0.02174,0.04526,...,0.01296,0.00903,0.12448,0.08260,0.00000,0.00000,0.01956,0.02458,0.01947,0.08770
7,0.08388,0.21299,0.15471,0.31975,0.09589,0.00000,1.00000,0.14957,0.05973,0.18649,...,0.20583,0.07754,0.13479,0.14764,0.16849,0.23205,0.05877,0.07315,0.09624,0.26867
8,0.00000,0.11319,0.24978,0.19101,0.16571,0.12850,0.14957,1.00000,0.15736,0.16272,...,0.10837,0.08596,0.27438,0.23152,0.12211,0.06901,0.11237,0.05514,0.24769,0.40641
9,0.01284,0.11333,0.13447,0.03042,0.08662,0.02174,0.05973,0.15736,1.00000,0.12734,...,0.07819,0.10494,0.07755,0.15577,0.06069,0.06641,0.19449,0.02929,0.38443,0.16850
10,0.00000,0.04321,0.11467,0.13719,0.03237,0.04526,0.18649,0.16272,0.12734,1.00000,...,0.03773,0.04045,0.12643,0.10227,0.03532,0.03265,0.09856,0.06055,0.15865,0.18970


In [602]:
topK = 6
userId = 33
simUsers = sims.loc[33,:].sort_values(ascending=False).head(11)

In [603]:
simUsers

33     1.00000
457    0.23000
598    0.21592
461    0.19819
350    0.19495
83     0.19023
15     0.18783
463    0.18423
509    0.18105
580    0.18027
439    0.18017
Name: 33, dtype: float64

# 방법2. userid,장르비교법 (제가 그냥 생각해서 코딩한거)

* genres

In [604]:
movieGenres = pd.DataFrame(
    data=movies['genres']  
    .str
    .split('|')   
    .apply(pd.Series)
    .stack() # 정리하기 편하게 stack으로 쌓아준다.
    ,columns=['genre'])
movieGenres.index = movieGenres.index.droplevel(1)

* idf

In [605]:
genres = pd.DataFrame(data = movieGenres.groupby('genre')['genre'].count())
genres.columns= ['movieCount']
totalItems = movies.shape[0]
genres['idf'] = genres['movieCount'].apply(lambda x: math.log10(totalItems/x))
movieGenreWeights = movieGenres.join(genres['idf'], on='genre')

In [606]:
movieWeights = movies[['movieId']]

for genre in genres.index: # genres들을 순서대로 뽑아준다.
    movieWeights = movieWeights\
    .join(movieGenreWeights[movieGenreWeights['genre'] == genre]
    # movieGenreWeights['genre']와 genre가 일치하는것들만 뽑아내서 join시킨다.
    # 이유: 각 genre별 column을 만들어주기위해.
    [['idf']].rename(columns={'idf':genre})) 
    #movieGenreWeights에 있던 idf를 genre와 합쳐준다.
    movieWeights.fillna(0, inplace=True)
    # 빈값인 NaN값을 0 으로 채워준다. 
movieWeights

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.00000,0.00000,0.90309,1.30103,1.17609,0.30103,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.0
1,2,0.00000,0.00000,0.90309,0.00000,1.17609,0.00000,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.0
2,3,0.00000,0.00000,0.00000,0.00000,0.00000,0.30103,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.69897,0.00000,0.00000,0.0,0.0
3,4,0.00000,0.00000,0.00000,0.00000,0.00000,0.30103,0.00000,0.00000,0.30103,...,0.0,0.0,0.0,0.0,0.00000,0.69897,0.00000,0.00000,0.0,0.0
4,5,0.00000,0.00000,0.00000,0.00000,0.00000,0.30103,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.0
5,6,0.00000,0.69897,0.00000,0.00000,0.00000,0.00000,0.90309,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.69897,0.0,0.0
6,7,0.00000,0.00000,0.00000,0.00000,0.00000,0.30103,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.69897,0.00000,0.00000,0.0,0.0
7,8,0.00000,0.00000,0.90309,0.00000,1.17609,0.00000,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.0
8,9,0.00000,0.69897,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.0
9,10,0.00000,0.69897,0.90309,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.69897,0.0,0.0


* 시간이 오래걸림 원하는 유저 id만 따로 뽑아서 계산하면 금방걸리고 오차도 사라질듯함

In [607]:
count=0
idxcheck=0
taste=pd.DataFrame()
genres_df=pd.DataFrame()
moviedict={}
for idx,value in movieWeights['movieId'].iteritems():
    moviedict[value]=idx # value가  movieId이고 idx는 index

for userId in range(1,int(train.ix[lastIndex]["userId"])):
    
    for i in train.index[idxcheck:]:
       
        movieId=train['movieId'][i]
        #print i,userId,movieId
        if train['userId'][i] ==userId:
    
            _list=train['rating'][i]*movieWeights.ix[moviedict[movieId]]
            genres_df.insert(0,'d',_list,1)
            # 위 insert방식은 dataframe 이 한번 만들어지고 그 dataframe을 밑에 taste에 더하게되서 시간소비가 걸림.
            # 이거대신 바로 평균을내고 그 평균으로 taste를 만들면 편할것.
            
        else:
            #print "error"
            #print genres_df
            genres_df=genres_df.drop('movieId')
            genres_df=genres_df.mean(axis=1)
            taste.insert(userId-1,userId,genres_df.T,1)
            #print taste
            genres_df=pd.DataFrame()
            
            break
        idxcheck+=1
        #if count==20:
        #    break
        count+=1
taste=taste.T

In [609]:
taste

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.00000,0.52423,0.93131,0.24394,0.14701,0.16933,0.28222,0.00000,0.24459,0.38292,0.00000,0.37500,0.00000,0.00000,0.00000,0.15290,0.71596,0.74266,0.00000,0.32482
2,0.00000,0.53394,0.46409,0.32526,0.39203,0.40137,0.41392,0.00000,0.57279,0.46414,0.00000,0.19444,0.19676,0.20804,0.26758,0.73780,0.21696,0.64072,0.28754,0.24061
3,0.00000,0.66014,0.52179,0.33249,0.35283,0.33782,0.74254,0.30684,0.62882,0.37131,0.00000,0.30000,0.33449,0.15130,0.20069,0.52811,0.35870,0.56694,0.76678,0.00000
4,0.00000,0.92264,1.01146,0.68397,1.06184,0.57626,0.75344,0.05738,0.38532,0.82114,0.05236,0.37714,0.05060,0.75479,0.27523,0.40341,0.83906,0.63107,0.14985,0.02970
5,0.00000,0.28991,0.74402,0.57659,0.72837,0.68245,0.30274,0.15691,0.42418,0.61393,0.00000,0.12500,0.40247,0.65765,0.21209,1.10803,0.33135,0.24623,0.19605,0.00000
6,0.00000,0.79882,1.18262,0.43368,0.44803,0.32612,0.52680,0.00000,0.50888,0.75589,0.00000,0.23810,0.31622,0.19453,0.24369,0.24131,0.79344,0.39109,0.41078,0.00000
7,0.00000,0.89868,1.31359,0.82793,0.74842,0.44568,0.39877,0.00000,0.32449,0.56421,0.00000,0.06494,0.13799,0.42443,0.07819,0.33587,1.13606,0.38126,0.16132,0.11249
8,0.00000,0.73323,0.67732,0.13393,0.24214,0.31726,1.01376,0.03692,0.55336,0.38770,0.12576,0.23529,0.13889,0.04673,0.57845,0.55164,0.50028,0.81547,0.33152,0.00000
9,0.00000,0.41303,0.38997,0.29569,0.16038,0.35576,0.71837,0.00000,0.74573,0.45570,0.00000,0.18182,0.00000,0.12379,0.19156,0.98491,0.40236,0.57188,0.34505,0.00000
10,0.00000,1.16495,0.85678,0.00000,0.00000,0.26244,0.57890,0.00000,0.47084,0.14281,0.00000,0.61538,0.00000,0.10475,0.37050,0.35845,0.72096,0.89612,0.14156,0.00000


idlist=[]
idcount=0
for i in range(0,count):
    idlist.append(idcount)
    idcount+=1
tasteWeights.insert(0,'movieId',idlist,1)

In [610]:
tasteNorms = pd.DataFrame(
    data=LA.norm(taste
                 .iloc[:,1:] # movieId를 빼고 읽는다.
                 .values, 
                 # value를 통해 값만 가져온다. numpy.array형식으로 바꾸어주는건데
                 #그냥 빠지고 dataframe자체로 넣어도 문제없다.
                ord=2, axis=1),
    #index=movieWeights.index, << 없어도 상관없음
    columns=['norm2'])
tasteNorms

Unnamed: 0,norm2
0,1.69461
1,1.72332
2,1.95397
3,2.69755
4,2.17512
5,2.21430
6,2.52131
7,2.10141
8,1.87226
9,2.17613


In [611]:
normalizedTasteWeights = pd.DataFrame(index=taste.index) #빈 데이타 프레임만들기
norms = tasteNorms['norm2']
for genre in genres.index:
    normalizedTasteWeights[genre] = taste[genre].divide(norms)
normalizedTasteWeights

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.00000,0.30420,0.54042,0.14155,0.08531,0.09826,0.16376,0.00000,0.14193,0.22220,0.00000,0.21760,0.00000,0.00000,0.00000,0.08872,0.41545,0.43095,0.00000,0.18849
2,0.00000,0.27326,0.23751,0.16646,0.20063,0.20541,0.21183,0.00000,0.29314,0.23754,0.00000,0.09951,0.10070,0.10647,0.13694,0.37759,0.11103,0.32791,0.14716,0.12314
3,0.00000,0.24472,0.19343,0.12325,0.13080,0.12523,0.27527,0.11375,0.23311,0.13765,0.00000,0.11121,0.12400,0.05609,0.07440,0.19577,0.13297,0.21017,0.28425,0.00000
4,0.00000,0.42418,0.46501,0.31445,0.48818,0.26493,0.34639,0.02638,0.17715,0.37751,0.02407,0.17339,0.02326,0.34701,0.12653,0.18546,0.38576,0.29013,0.06889,0.01365
5,0.00000,0.13093,0.33601,0.26040,0.32894,0.30820,0.13672,0.07086,0.19156,0.27726,0.00000,0.05645,0.18176,0.29700,0.09578,0.50040,0.14964,0.11120,0.08854,0.00000
6,0.00000,0.31683,0.46905,0.17200,0.17770,0.12934,0.20894,0.00000,0.20183,0.29980,0.00000,0.09443,0.12542,0.07716,0.09665,0.09571,0.31469,0.15511,0.16292,0.00000
7,0.00000,0.42765,0.62510,0.39399,0.35615,0.21209,0.18976,0.00000,0.15441,0.26849,0.00000,0.03090,0.06566,0.20198,0.03721,0.15983,0.54062,0.18143,0.07677,0.05353
8,0.00000,0.39163,0.36176,0.07153,0.12933,0.16945,0.54146,0.01972,0.29556,0.20707,0.06717,0.12567,0.07418,0.02496,0.30896,0.29464,0.26720,0.43555,0.17707,0.00000
9,0.00000,0.18980,0.17920,0.13588,0.07370,0.16348,0.33011,0.00000,0.34269,0.20941,0.00000,0.08355,0.00000,0.05689,0.08803,0.45260,0.18490,0.26280,0.15856,0.00000
10,0.00000,0.38702,0.28464,0.00000,0.00000,0.08719,0.19232,0.00000,0.15642,0.04745,0.00000,0.20444,0.00000,0.03480,0.12309,0.11908,0.23952,0.29771,0.04703,0.00000


In [612]:
sims = pd.DataFrame(data=np.matmul(normalizedTasteWeights, normalizedTasteWeights.T))#, index=taste['movieId'])
#sims.columns = index=taste['movieId']

# 0,0 1,1 등 같은 행열에는 1.0값이 나와야 하는데 이상한 소수점이 나오는현상.
* 아래의 일반 테스트 에서는 1.0로 잘나오는데 막상 할땐 이상하게나옴

In [613]:
test1=LA.norm(normalizedTasteWeights.ix[3])
df=pd.DataFrame(data=normalizedTasteWeights.ix[3].divide(test1))

norm=0.0
_list=[]
for i in range(0,len(df)):
    norm += df.ix[i]*df.ix[i]

dot=0.0
for i in range(0,len(df)):
    dot += float(df.ix[i])*df.ix[i]

print dot/(norm*norm)

3    1.0
Name: (no genres listed), dtype: float64


In [614]:
sims

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,660,661,662,663,664,665,666,667,668,669
0,0.96696,0.66722,0.51602,1.00033,0.59457,0.75818,1.02133,0.88128,0.59380,0.62722,...,0.75061,0.53225,0.78516,1.39826,0.71747,0.79441,0.57539,0.76533,0.65605,
1,0.66722,0.77784,0.58378,0.93973,0.76292,0.65287,0.82921,0.89087,0.71290,0.50614,...,0.67273,0.58212,0.63504,1.15556,0.82680,0.73813,0.68547,0.92736,0.52576,
2,0.51602,0.58378,0.52469,0.74036,0.55827,0.55180,0.66126,0.74410,0.56008,0.42157,...,0.58047,0.44283,0.50596,0.99511,0.60276,0.61336,0.53351,0.85867,0.41037,
3,1.00033,0.93973,0.74036,1.53806,1.04183,1.00259,1.39900,1.15265,0.82777,0.69906,...,0.93329,0.73885,1.01504,1.62776,1.16735,0.95478,0.83354,1.10450,0.69922,
4,0.59457,0.76292,0.55827,1.04183,0.96493,0.67950,0.94619,0.78480,0.69137,0.40896,...,0.65552,0.59554,0.67102,0.94772,0.95035,0.68154,0.65096,0.74110,0.41455,
5,0.75818,0.65287,0.55180,1.00259,0.67950,0.77129,0.99490,0.84749,0.59161,0.52789,...,0.77187,0.51676,0.73905,1.25326,0.74583,0.70098,0.56608,0.81505,0.50876,
6,1.02133,0.82921,0.66126,1.39900,0.94619,0.99490,1.43956,1.01724,0.72892,0.65939,...,0.95150,0.68599,1.04173,1.54569,1.05294,0.90766,0.73806,0.85585,0.64868,
7,0.88128,0.89087,0.74410,1.15265,0.78480,0.84749,1.01724,1.25977,0.87392,0.73118,...,0.90483,0.67264,0.81103,1.71667,0.88094,0.93222,0.80125,1.33131,0.74514,
8,0.59380,0.71290,0.56008,0.82777,0.69137,0.59161,0.72892,0.87392,0.74022,0.47952,...,0.65860,0.51727,0.55150,1.06994,0.74215,0.66512,0.64679,0.99542,0.48877,
9,0.62722,0.50614,0.42157,0.69906,0.40896,0.52789,0.65939,0.73118,0.47952,0.52266,...,0.54453,0.39544,0.53431,1.12451,0.48977,0.57802,0.43636,0.65174,0.52104,
