In [1]:
# Toy corpus of four short Korean sentences used to demonstrate the vectorizers.
docs = [
    '먹고 싶은 사과',           # "an apple I want to eat"
    '먹고 싶은 바나나',         # "a banana I want to eat"
    '길고 노란 바나나 바나나',  # "long yellow banana banana"
    '저는 과일이 좋아요',       # "I like fruit"
]

### CountVectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and encode every document as a row of
# raw term counts in a single pass; the result is a sparse matrix.
vect = CountVectorizer()
countvect = vect.fit_transform(raw_documents=docs)
countvect

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [4]:
# Expand the sparse count matrix into a dense array so the counts can be read directly.
countvect.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [5]:
# Mapping from each learned term to its column index in the count matrix.
vect.vocabulary_

{'먹고': 3,
 '싶은': 6,
 '사과': 5,
 '바나나': 4,
 '길고': 1,
 '노란': 2,
 '저는': 7,
 '과일이': 0,
 '좋아요': 8}

In [6]:
import pandas as pd

# Build a labelled view of the count matrix: one row per document, one column per term.
# vocabulary_ maps term -> column index, so the terms are ordered by that index
# instead of alphabetically. This keeps every label attached to the right column
# even if the index assignment ever stops matching alphabetical order.
countvect_df = pd.DataFrame(
    countvect.toarray(),
    columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get),
)
countvect_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the count vectors of all documents.
# When only one input is given, its rows are compared against each other.
# Documents 0 and 1 turn out to be the most similar pair.
cosine_similarity(countvect_df)

array([[1.        , 0.66666667, 0.        , 0.        ],
       [0.66666667, 1.        , 0.47140452, 0.        ],
       [0.        , 0.47140452, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

### TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the toy corpus.
# `tfvect` holds whatever fit() returns; the following cell calls transform()
# on it while reading the vocabulary from `vect`.
vect = TfidfVectorizer()
tfvect = vect.fit(raw_documents=docs)

In [9]:
# Labelled TF-IDF weights: one row per document, one column per term.
# vocabulary_ maps term -> column index, so the terms are ordered by that index
# instead of alphabetically. This keeps every label attached to the right column
# even if the index assignment ever stops matching alphabetical order.
tfidv_df = pd.DataFrame(
    tfvect.transform(docs).toarray(),
    columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get),
)
tfidv_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.526405,0.0,0.667679,0.526405,0.0,0.0
1,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0
2,0.0,0.47212,0.47212,0.0,0.74445,0.0,0.0,0.0,0.0
3,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF vectors of all documents.
# Documents 0 and 1 are again the most similar pair.
cosine_similarity(X=tfidv_df, Y=tfidv_df)

array([[1.        , 0.60784064, 0.        , 0.        ],
       [0.60784064, 1.        , 0.42980824, 0.        ],
       [0.        , 0.42980824, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

### 영화 추천 실습

In [19]:
import os

# Directory that contains movies_metadata.csv.
# Set the MOVIES_DATA_DIR environment variable to point at a different location;
# without it the original machine-specific folder is used.
# Joining with '' guarantees a trailing separator, because the loading cell
# builds the file name by plain string concatenation (path + 'movies_metadata.csv').
path = os.path.join(
    os.environ.get('MOVIES_DATA_DIR', 'C:/Users/rangc/Desktop/연세대/DSL/2023-1/추천시스템/'),
    '',
)

In [20]:
# Load the movie metadata table from the data directory and report its dimensions.
data = pd.read_csv(
    filepath_or_buffer=f'{path}movies_metadata.csv',
    low_memory=False,
)
data.shape

(45466, 24)

In [21]:
data.columns
# Of these columns, the movie 'overview' text is what the recommendation will be based on.

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [22]:
# Discard movies that have no overview text, then renumber the rows from 0 so
# that row labels and row positions agree.
data = data.dropna(subset=['overview']).reset_index(drop=True)
data.shape
# 44,512 movies remain after removing rows with a missing overview

(44512, 24)

In [23]:
# Ignore common English stop words so that uninformative words are not vectorized.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=data['overview'])
tfidf_matrix.shape
# One row per movie and one column per term: 75,827 terms

(44512, 75827)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity of every movie overview to every other one.
# When only one input is given, its rows are compared against each other.
# NOTE: the result is a dense (n_movies, n_movies) array, so this cell is
# memory-hungry on the full dataset.
cosine_matrix = cosine_similarity(tfidf_matrix)

In [25]:
# Check the dimensions of the similarity matrix.
cosine_matrix.shape

(44512, 44512)

In [26]:
# Peek at the similarity values.
cosine_matrix

array([[1.        , 0.01502134, 0.        , ..., 0.        , 0.00593341,
        0.        ],
       [0.01502134, 1.        , 0.04679784, ..., 0.        , 0.02195764,
        0.0092421 ],
       [0.        , 0.04679784, 1.        , ..., 0.        , 0.0140129 ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00593341, 0.02195764, 0.0140129 , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.0092421 , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [27]:
# Lookup table from a movie's row position in `data` to its title.
id2movie = dict(enumerate(data['title']))
id2movie

{0: 'Toy Story',
 1: 'Jumanji',
 2: 'Grumpier Old Men',
 3: 'Waiting to Exhale',
 4: 'Father of the Bride Part II',
 5: 'Heat',
 6: 'Sabrina',
 7: 'Tom and Huck',
 8: 'Sudden Death',
 9: 'GoldenEye',
 10: 'The American President',
 11: 'Dracula: Dead and Loving It',
 12: 'Balto',
 13: 'Nixon',
 14: 'Cutthroat Island',
 15: 'Casino',
 16: 'Sense and Sensibility',
 17: 'Four Rooms',
 18: 'Ace Ventura: When Nature Calls',
 19: 'Money Train',
 20: 'Get Shorty',
 21: 'Copycat',
 22: 'Assassins',
 23: 'Powder',
 24: 'Leaving Las Vegas',
 25: 'Othello',
 26: 'Now and Then',
 27: 'Persuasion',
 28: 'The City of Lost Children',
 29: 'Shanghai Triad',
 30: 'Dangerous Minds',
 31: 'Twelve Monkeys',
 32: 'Babe',
 33: 'Carrington',
 34: 'Dead Man Walking',
 35: 'Across the Sea of Time',
 36: 'It Takes Two',
 37: 'Clueless',
 38: 'Cry, the Beloved Country',
 39: 'Richard III',
 40: 'Dead Presidents',
 41: 'Restoration',
 42: 'Mortal Kombat',
 43: 'To Die For',
 44: 'How To Make An American Quilt',
 

In [28]:
## List how similar every other movie is to Toy Story (index 0).
# Pair each score in row 0 with its position, then drop the first pair
# (position 0, i.e. the movie compared with itself).
sim_scores = list(enumerate(cosine_matrix[0]))[1:]
sim_scores

[(1, 0.015021337734112407),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.038431224890767154),
 (18, 0.0),
 (19, 0.0),
 (20, 0.009721561198218091),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.01857414379717094),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.006333608272897851),
 (42, 0.0),
 (43, 0.0),
 (44, 0.008885561668288446),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.012991917214332644),
 (49, 0.009104015774670727),
 (50, 0.010702160599824236),
 (51, 0.0),
 (52, 0.0),
 (53, 0.02009302651658688),
 (54, 0.0),
 (55, 0.025156565975556554),
 (56, 0.02072355927180967),
 (57, 0.0),
 (58, 0.03330011980769779),
 (59, 0.0),
 (60, 0.0),
 (61, 0.007607953998094784),
 (62, 0.0),
 (63, 0.009419904160600692),
 (

In [29]:
# Order the (index, score) pairs from most to least similar.
sim_scores = list(sim_scores)
sim_scores.sort(key=lambda pair: pair[1], reverse=True)
sim_scores[:10]  # ten highest-scoring entries

[(15282, 0.5321733978946077),
 (2979, 0.47214559370670484),
 (10271, 0.274962516260823),
 (24316, 0.27322653023092314),
 (23646, 0.23543946958082806),
 (28893, 0.22397858775140161),
 (42572, 0.21761842522811847),
 (37778, 0.2159367770908928),
 (41893, 0.20190977282766223),
 (8303, 0.19868494439439036)]

In [31]:
# Replace the row index of each of the ten best matches with its movie title.
# This cell overwrites its own input, so a plain id2movie[idx] lookup raises
# KeyError when the cell is run a second time (idx is then already a title).
# .get(idx, idx) passes an already-converted title through unchanged, which
# makes the cell safe to re-run while still rebinding `sim_scores`.
sim_scores = [(id2movie.get(idx, idx), score) for idx, score in sim_scores[:10]]
sim_scores

[('Toy Story 3', 0.5321733978946077),
 ('Toy Story 2', 0.47214559370670484),
 ('The 40 Year Old Virgin', 0.274962516260823),
 ('Small Fry', 0.27322653023092314),
 ("Andy Hardy's Blonde Trouble", 0.23543946958082806),
 ('Hot Splash', 0.22397858775140161),
 ('Andy Kaufman Plays Carnegie Hall', 0.21761842522811847),
 ('Superstar: The Life and Times of Andy Warhol', 0.2159367770908928),
 ('Andy Peters: Exclamation Mark Question Point', 0.20190977282766223),
 ('The Champ', 0.19868494439439036)]