# MovieLens 데이터 기반 이용자별 장르 선호도 테이블 만들기

In [1]:
from pyhive import hive
from sklearn.preprocessing import MinMaxScaler
from sqlalchemy import create_engine
from tqdm import tqdm
import numpy as np
import os
import pandas as pd

In [2]:
# Open a PyHive connection to the `mlens` database (NOSASL auth) and echo it
# so the notebook shows that the connection object was created.
hive_cnx = hive.Connection(
    host='hd02.pdmnu.com',
    username='sweetbarrow',
    database='mlens',
    auth='NOSASL',
)
hive_cnx

<pyhive.hive.Connection at 0x1fb8f743ec8>

In [3]:
# SQLAlchemy engine used for writing result tables to MySQL.
#
# The connection URL is read from the environment so that credentials are not
# stored in the notebook, e.g.
#   MLENS_MYSQL_URL='mysql+pymysql://<user>:<password>@pc.pdmnu.com:3306/mlens?charset=utf8mb4'
# A missing variable raises KeyError here instead of failing later on first use.
# NOTE(review): the password that used to be hard-coded in this cell should be rotated.
#
# The former `encoding='utf-8'` argument is dropped: it is deprecated in
# SQLAlchemy 1.4 and rejected by 2.0, and the charset is already part of the URL.
mysql_cnx = create_engine(os.environ['MLENS_MYSQL_URL'], echo=True)
mysql_cnx

Engine(mysql+pymysql://root:***@pc.pdmnu.com:3306/mlens?charset=utf8mb4)

In [4]:
# Load the complete `mlens.ratings` table from Hive into a DataFrame.
ratings = pd.read_sql(sql='SELECT * FROM mlens.ratings', con=hive_cnx)
ratings

Unnamed: 0,ratings.userid,ratings.movieid,ratings.rating,ratings.timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264
...,...,...,...,...
27753439,283228,8542,4.5,1379882795
27753440,283228,8712,4.5,1379882751
27753441,283228,34405,4.5,1379882889
27753442,283228,44761,4.5,1354159524


In [5]:
# The timestamp column is not used by any later cell, so remove it in place.
ratings.drop(columns='ratings.timestamp', inplace=True)
ratings

Unnamed: 0,ratings.userid,ratings.movieid,ratings.rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5
...,...,...,...
27753439,283228,8542,4.5
27753440,283228,8712,4.5
27753441,283228,34405,4.5
27753442,283228,44761,4.5


In [6]:
# Mapping from the table-qualified column names returned by Hive
# ("ratings.<column>") to the snake_case names used in the rest of the notebook.
new_columns = {
    'ratings.' + hive_name: clean_name
    for hive_name, clean_name in (
        ('userid', 'user_id'),
        ('movieid', 'movie_id'),
        ('rating', 'rating'),
    )
}

In [7]:
# Apply the mapping in place; columns without an entry keep their current name.
ratings.columns = [new_columns.get(name, name) for name in ratings.columns]
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5
...,...,...,...
27753439,283228,8542,4.5
27753440,283228,8712,4.5
27753441,283228,34405,4.5
27753442,283228,44761,4.5


In [9]:
# Min-max scale the ratings. The scaler expects a 2-D array, so the column is
# selected as a one-column frame, which yields shape (n_rows, 1).
scaler = MinMaxScaler()
scaled_rating = scaler.fit_transform(ratings[['rating']].to_numpy())
scaled_rating

array([[0.66666667],
       [0.66666667],
       [0.22222222],
       ...,
       [0.88888889],
       [0.88888889],
       [0.88888889]])

In [10]:
# Wrap the (n_rows, 1) scaled array in a single-column DataFrame so it can be
# joined back onto `ratings` by row position.
scaled_rating_df = pd.DataFrame({'rating_scaled': scaled_rating.ravel()})
scaled_rating_df

Unnamed: 0,rating_scaled
0,0.666667
1,0.666667
2,0.222222
3,0.888889
4,0.888889
...,...
27753439,0.888889
27753440,0.888889
27753441,0.888889
27753442,0.888889


In [11]:
# Attach the scaled rating to every original row; both frames are aligned on
# their index, and every row of `ratings` is kept (left join).
ratings_scaled = ratings.join(scaled_rating_df, how='left')
ratings_scaled

Unnamed: 0,user_id,movie_id,rating,rating_scaled
0,1,307,3.5,0.666667
1,1,481,3.5,0.666667
2,1,1091,1.5,0.222222
3,1,1257,4.5,0.888889
4,1,1449,4.5,0.888889
...,...,...,...,...
27753439,283228,8542,4.5,0.888889
27753440,283228,8712,4.5,0.888889
27753441,283228,34405,4.5,0.888889
27753442,283228,44761,4.5,0.888889


In [12]:
# Only the scaled value is kept from here on; remove the raw rating in place.
ratings_scaled.drop(columns='rating', inplace=True)
ratings_scaled

Unnamed: 0,user_id,movie_id,rating_scaled
0,1,307,0.666667
1,1,481,0.666667
2,1,1091,0.222222
3,1,1257,0.888889
4,1,1449,0.888889
...,...,...,...
27753439,283228,8542,0.888889
27753440,283228,8712,0.888889
27753441,283228,34405,0.888889
27753442,283228,44761,0.888889


In [None]:
# Persist the scaled ratings to MySQL.
# `chunksize` makes pandas write in batches; without it all rows (about
# 27.7 million here) are sent in one go.
# NOTE: `if_exists='append'` adds to an existing table, so re-running this cell
# inserts the same rows again.
ratings_scaled.to_sql('ratings_scaled', mysql_cnx, if_exists='append', index=False, chunksize=10000)

In [13]:
# Reload the scaled ratings from the Hive table `mlens.ratings_scaled`.
ratings_scaled = pd.read_sql(sql='SELECT * FROM mlens.ratings_scaled', con=hive_cnx)
ratings_scaled

Unnamed: 0,ratings_scaled.user_id,ratings_scaled.movie_id,ratings_scaled.rating_scaled
0,1,307,0.666667
1,1,481,0.666667
2,1,1091,0.222222
3,1,1257,0.888889
4,1,1449,0.888889
...,...,...,...
27753439,283228,8542,0.888889
27753440,283228,8712,0.888889
27753441,283228,34405,0.888889
27753442,283228,44761,0.888889


In [14]:
# Strip the "ratings_scaled." table prefix that Hive puts on every column name.
new_columns = {
    'ratings_scaled.' + name: name
    for name in ('user_id', 'movie_id', 'rating_scaled')
}

In [15]:
# Apply the mapping in place; columns without an entry keep their current name.
ratings_scaled.columns = [new_columns.get(name, name) for name in ratings_scaled.columns]
ratings_scaled

Unnamed: 0,user_id,movie_id,rating_scaled
0,1,307,0.666667
1,1,481,0.666667
2,1,1091,0.222222
3,1,1257,0.888889
4,1,1449,0.888889
...,...,...,...
27753439,283228,8542,0.888889
27753440,283228,8712,0.888889
27753441,283228,34405,0.888889
27753442,283228,44761,0.888889


In [16]:
# Inspect the distinct scaled values to pick the bucket thresholds used below.
pd.unique(ratings_scaled['rating_scaled'])

array([0.66666667, 0.22222222, 0.88888889, 0.44444444, 0.77777778,
       0.55555556, 0.33333333, 1.        , 0.11111111, 0.        ])

In [17]:
# Bucket every scaled rating into a three-level signal:
#   rating_scaled < 0.45 -> -1 (dislike)
#   rating_scaled > 0.6  -> +1 (like)
#   anything else        ->  0 (neutral)
# np.select evaluates the conditions for the whole column at once, replacing a
# Python-level loop that touched each of the ~27.7 million rows through
# .loc/.at. The result is stored as float, as the loop-based version did.
ratings_rescaled = ratings_scaled[['user_id', 'movie_id']].copy()
ratings_rescaled['rating_rescaled'] = np.select(
    [
        (ratings_scaled['rating_scaled'] < 0.45).to_numpy(),
        (ratings_scaled['rating_scaled'] > 0.6).to_numpy(),
    ],
    [-1.0, 1.0],
    default=0.0,
)
ratings_rescaled

Unnamed: 0,user_id,movie_id,rating_rescaled
0,1,307,1.0
1,1,481,1.0
2,1,1091,-1.0
3,1,1257,1.0
4,1,1449,1.0
...,...,...,...
27753439,283228,8542,1.0
27753440,283228,8712,1.0
27753441,283228,34405,1.0
27753442,283228,44761,1.0


In [None]:
# Persist the bucketed ratings to MySQL.
# `chunksize` makes pandas write in batches; without it all rows (about
# 27.7 million here) are sent in one go.
# NOTE: `if_exists='append'` adds to an existing table, so re-running this cell
# inserts the same rows again.
ratings_rescaled.to_sql('ratings_rescaled', mysql_cnx, if_exists='append', index=False, chunksize=10000)

In [19]:
# Reload the bucketed ratings from the Hive table `mlens.ratings_rescaled`.
ratings_rescaled = pd.read_sql(sql='SELECT * FROM mlens.ratings_rescaled', con=hive_cnx)
ratings_rescaled

Unnamed: 0,ratings_rescaled.user_id,ratings_rescaled.movie_id,ratings_rescaled.rating_rescaled
0,1,307,1
1,1,481,1
2,1,1091,-1
3,1,1257,1
4,1,1449,1
...,...,...,...
27753439,283228,8542,1
27753440,283228,8712,1
27753441,283228,34405,1
27753442,283228,44761,1


In [20]:
# Strip the "ratings_rescaled." table prefix that Hive puts on every column name.
new_columns = {
    'ratings_rescaled.' + name: name
    for name in ('user_id', 'movie_id', 'rating_rescaled')
}

In [21]:
# Apply the mapping in place; columns without an entry keep their current name.
ratings_rescaled.columns = [new_columns.get(name, name) for name in ratings_rescaled.columns]
ratings_rescaled

Unnamed: 0,user_id,movie_id,rating_rescaled
0,1,307,1
1,1,481,1
2,1,1091,-1
3,1,1257,1
4,1,1449,1
...,...,...,...
27753439,283228,8542,1
27753440,283228,8712,1
27753441,283228,34405,1
27753442,283228,44761,1


In [22]:
# Load the one-hot encoded movie/genre table from Hive.
movies_genres_onehot = pd.read_sql(sql='SELECT * FROM mlens.movies_genres_onehot', con=hive_cnx)
movies_genres_onehot

Unnamed: 0,movies_genres_onehot.movie_id,movies_genres_onehot.title,movies_genres_onehot.action,movies_genres_onehot.adventure,movies_genres_onehot.animation,movies_genres_onehot.children,movies_genres_onehot.comedy,movies_genres_onehot.crime,movies_genres_onehot.documentary,movies_genres_onehot.drama,...,movies_genres_onehot.film_noir,movies_genres_onehot.horror,movies_genres_onehot.imax,movies_genres_onehot.musical,movies_genres_onehot.mystery,movies_genres_onehot.romance,movies_genres_onehot.sci_fi,movies_genres_onehot.thriller,movies_genres_onehot.war,movies_genres_onehot.western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,129417,Four Ways Out (1951),0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
58094,129419,The Facts of Murder (1959),0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
58095,129421,Jealousy (1953),0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
58096,129423,Pigs (1972),0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [23]:
# Strip the "movies_genres_onehot." table prefix from every column name:
# the two descriptive columns first, then the 19 genre flags.
new_columns = {
    'movies_genres_onehot.' + name: name
    for name in (
        'movie_id',
        'title',
        'action',
        'adventure',
        'animation',
        'children',
        'comedy',
        'crime',
        'documentary',
        'drama',
        'fantasy',
        'film_noir',
        'horror',
        'imax',
        'musical',
        'mystery',
        'romance',
        'sci_fi',
        'thriller',
        'war',
        'western',
    )
}

In [24]:
# Apply the mapping in place; columns without an entry keep their current name.
movies_genres_onehot.columns = [new_columns.get(name, name) for name in movies_genres_onehot.columns]
movies_genres_onehot

Unnamed: 0,movie_id,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,film_noir,horror,imax,musical,mystery,romance,sci_fi,thriller,war,western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,129417,Four Ways Out (1951),0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
58094,129419,The Facts of Murder (1959),0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
58095,129421,Jealousy (1953),0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
58096,129423,Pigs (1972),0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [25]:
# Order the movies by id and renumber the rows 0..n-1.
movies_genres_onehot = movies_genres_onehot.sort_values(by='movie_id', ignore_index=True)
movies_genres_onehot

Unnamed: 0,movie_id,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,film_noir,horror,imax,musical,mystery,romance,sci_fi,thriller,war,western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,193876,The Great Glinka (1946),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58094,193878,Les tribulations dune caissière (2011),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58095,193880,Her Name Was Mumu (2016),0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
58096,193882,Flora (2017),0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0


In [26]:
# Keep only movie_id and the genre flags; the title is left out of this copy.
movies_genres_onehot_new = movies_genres_onehot.drop(columns=['title'])
movies_genres_onehot_new

Unnamed: 0,movie_id,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,film_noir,horror,imax,musical,mystery,romance,sci_fi,thriller,war,western
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,193876,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58094,193878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58095,193880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
58096,193882,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0


In [27]:
# Names of the genre flag columns: every column except the movie_id key.
# Written as a comprehension so that no loop variable is left behind (the
# previous loop used `_`, which IPython also uses for the last cell output).
genres_columns = [
    column for column in movies_genres_onehot_new.columns if column != 'movie_id'
]
genres_columns

['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'film_noir',
 'horror',
 'imax',
 'musical',
 'mystery',
 'romance',
 'sci_fi',
 'thriller',
 'war',
 'western']

In [28]:
# Empty frame (no rows) that carries one column per genre; it is merged onto
# the user list in a later cell to add the genre columns.
genres_columns_df = pd.DataFrame(columns=genres_columns)
genres_columns_df

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,film_noir,horror,imax,musical,mystery,romance,sci_fi,thriller,war,western


In [29]:
# One row per distinct user, in order of first appearance in the ratings.
ratings_genres = pd.DataFrame({'user_id': ratings_rescaled['user_id'].unique()})
ratings_genres

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5
...,...
283223,283224
283224,283225
283225,283226
283226,283227


In [30]:
# Add one zero-initialised integer column per genre next to user_id.
# The columns are created directly instead of merging with the empty
# `genres_columns_df` and calling fillna(0): that route produces all-NaN
# object-dtype columns and depends on pandas downcasting them during fillna.
ratings_genres = ratings_genres.assign(
    **{genre: 0 for genre in genres_columns_df.columns}
)
ratings_genres

Unnamed: 0,user_id,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,film_noir,horror,imax,musical,mystery,romance,sci_fi,thriller,war,western
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283223,283224,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
283224,283225,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
283225,283226,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
283226,283227,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
# Order the ratings by user and renumber the rows 0..n-1.
ratings_rescaled = ratings_rescaled.sort_values(by='user_id', ignore_index=True)
ratings_rescaled

Unnamed: 0,user_id,movie_id,rating_rescaled
0,1,307,1
1,1,3893,1
2,1,3826,-1
3,1,3698,1
4,1,3424,1
...,...,...,...
27753439,283228,950,1
27753440,283228,947,1
27753441,283228,946,1
27753442,283228,940,1


In [None]:
# Accumulate each user's per-genre preference score: every liked movie
# (rating_rescaled == 1) adds its genre flags to the user's row, every disliked
# movie (rating_rescaled == -1) subtracts them, and other ratings add nothing.
#
# The earlier per-rating loop filtered `ratings_genres` and
# `movies_genres_onehot_new` with a boolean mask for every single rating and
# added the resulting one-element Series (not a scalar) through `.at`.
# Here each genre is handled in one vectorised pass instead.

# One row per movie_id so that Series.map can look flags up by movie id.
genre_flags_by_movie = (
    movies_genres_onehot_new
    .drop_duplicates('movie_id')
    .set_index('movie_id')
)

# Only +1 / -1 ratings count; anything else is treated as 0.
rating_sign = ratings_rescaled['rating_rescaled'].where(
    ratings_rescaled['rating_rescaled'].isin([1, -1]), 0
)

# tqdm wraps the iterable, so the progress bar is closed when the loop ends.
for genre in tqdm(genres_columns):
    # Genre flag of every rated movie; movies absent from the genre table count as 0.
    flag_per_rating = ratings_rescaled['movie_id'].map(genre_flags_by_movie[genre]).fillna(0)
    # Signed sum of the flags per user.
    score_per_user = (flag_per_rating * rating_sign).groupby(ratings_rescaled['user_id']).sum()
    # Users without any rating for this genre get a delta of 0.
    delta = ratings_genres['user_id'].map(score_per_user).fillna(0)
    # Added on top of the current value, as the original `+=` / `-=` did.
    ratings_genres[genre] = (pd.to_numeric(ratings_genres[genre]) + delta).astype('int64')

In [None]:
# Order the preference table by user and renumber the rows 0..n-1.
ratings_genres = ratings_genres.sort_values(by='user_id', ignore_index=True)
ratings_genres

In [None]:
# Persist the per-user genre preference table to MySQL.
# `chunksize` makes pandas write in batches rather than all rows at once,
# matching the other to_sql calls in this notebook.
# NOTE: `if_exists='append'` adds to an existing table, so re-running this cell
# inserts the same rows again.
ratings_genres.to_sql('ratings_genres', mysql_cnx, if_exists='append', index=False, chunksize=10000)