<b>Tasks:</b>
- 
- 

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Step 1. Load data
movies = pd.read_csv('dataset/movies.csv')
ratings = pd.read_csv('dataset/ratings.csv')
tags = pd.read_csv('dataset/tags.csv')

In [3]:
# transforming genres
movies['genres'] = movies['genres'].apply(lambda x: ''.join(x.lower().replace('-', '').replace(' ', '').replace('|', ' ')))

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),adventure animation children comedy fantasy
1,2,Jumanji (1995),adventure children fantasy
2,3,Grumpier Old Men (1995),comedy romance
3,4,Waiting to Exhale (1995),comedy drama romance
4,5,Father of the Bride Part II (1995),comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
# Step 2. Split data. Init the Rating table as main

# change order

ratings = ratings.sample(frac=1) 

# split data
ratings_train = ratings.iloc[:70000, :]
ratings_test = ratings.iloc[70000:, :]

# add a bool field in the Tags table
tags['exist_in_train'] = tags.movieId.isin(ratings_train.movieId.unique().tolist())

len(ratings_train), len(ratings_test)

(70000, 30836)

In [8]:
# Step 3.1. Preparing data. Constrating a table of features for Movies [movieId, title, genres, tags, ratings features]
"""
Здесь я предположил следующее. Т.к. нам надо предсказывать оценку юзера к объекту, то обогатить таблицу с жанрами фильмов необходимо только трейновыми данными.
Т.о. теги и средние оценки для фильма будем рассчитывать только на тех данных, которые есть в трейне, избегая тем самым "взгляда в будущее".
"""

def prepare_string(line):
    return ' '.join(line.str.replace(' ', '').str.lower())


def round_mean_rating(rating):
    if rating >= 4.8:
        return  5.0
    elif rating >= 4.3:
        return 4.5
    elif rating >= 3.8:
        return 4.0
    elif rating >= 3.3:
        return 3.5
    elif rating >= 2.8:
        return 3.0
    elif rating >= 2.3:
        return 2.5
    elif rating >= 1.8:
        return 2.0
    elif rating >= 1.3:
        return 1.5
    elif rating >= 0.8:
        return 1.0
    elif rating >= 0.3:
        return 0.5
    return 0.0
    
    
# field "all movie tags" from train selection
movie_group = tags.loc[tags.exist_in_train].groupby(['movieId'])['tag'].apply(prepare_string)

# field user_mean_rating, etc are like fetures. Only from train selection
movie_mean_rating = ratings_train.groupby('movieId').agg({'rating': ['mean', 'median', 'var', 'count']})


# Ganres are known in advance. Join genres and tags
movie_group = movies.set_index('movieId').join(movie_group).join(movie_mean_rating['rating'])
movie_group['mean'] = movie_group['mean'].apply(round_mean_rating)

# changing Nulls
movie_group['tag'] = movie_group['tag'].apply(lambda x: x if isinstance(x, str) else 'none')
movie_group['median'] = movie_group['median'].fillna(-1)
movie_group['var'] = movie_group['var'].fillna(-1)
movie_group['count'] = movie_group['count'].fillna(-1)

movie_group.head()

Unnamed: 0_level_0,title,genres,tag,mean,median,var,count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Toy Story (1995),adventure animation children comedy fantasy,pixar pixar fun,4.0,4.0,0.733962,144.0
2,Jumanji (1995),adventure children fantasy,fantasy magicboardgame robinwilliams game,3.5,3.0,0.801906,78.0
3,Grumpier Old Men (1995),comedy romance,moldy old,3.0,3.0,1.17915,39.0
4,Waiting to Exhale (1995),comedy drama romance,none,2.5,3.0,0.8,5.0
5,Father of the Bride Part II (1995),comedy,pregnancy remake,3.0,3.0,0.840278,36.0


In [9]:
# уникальных фильмов в тесте , фильмы упоминание которых отсуствует в трейне
len(ratings_test.movieId.unique()), len(tags.loc[tags.exist_in_train == False].movieId.unique())

(6144, 87)

In [10]:
# Step 3.2. Preparing data. Constrating a feature-table for Users-Movies [userId, tf-idf for genres, tf-idf for tags, rating mean(median/variance)] + [features from movie_group]

def construct_feature_table(t_tags, t_ratings, t_movies):

    # concat tags and genres into one string
    users_tags = t_tags.groupby(['userId', 'movieId']).agg({'tag' : prepare_string})

    # calculating mean, median and variance rating
    user_mean_rating = t_ratings.groupby('userId').agg({'rating': ['mean']})
    #print(len(users_tags))
    
    # joining all features
    user_features = pd.merge(t_ratings[['userId', 'movieId', 'rating']], users_tags, how='left', on=['userId', 'movieId'])
    print(len(user_features))
    
    user_features = user_features.rename(columns={'rating': 'target_rating', 'tag': 'user_tags_for_film'})
    user_features['user_tags_for_film'] = user_features['user_tags_for_film'].apply(lambda x: x if isinstance(x, str) else 'none')
    
    # adding features from movie_group
    user_features = pd.merge(user_features, movie_group[['genres', 'tag', 'mean', 'median', 'var', 'count']], how='left', on=['movieId']).rename(columns={'tag': 'film_tags', 'mean': 'film_mean', 'median': 'film_median', 'var': 'film_var', 'count': 'film_count'})
    print(len(user_features))
    
    return user_features


features = construct_feature_table(tags.loc[tags.exist_in_train], ratings_train, movie_group)
features.head()

70000
70000


Unnamed: 0,userId,movieId,target_rating,user_tags_for_film,genres,film_tags,film_mean,film_median,film_var,film_count
0,440,194,2.5,none,comedy drama,none,3.5,3.5,1.469697,12.0
1,177,65261,4.0,none,adventure animation children fantasy,none,4.0,4.0,1.277778,9.0
2,121,380,3.0,none,action adventure comedy romance thriller,spies,3.5,3.5,0.858457,123.0
3,19,2100,3.0,none,comedy fantasy romance,mermaid,3.0,3.0,0.813063,37.0
4,325,293,3.0,none,action crime drama thriller,assassin jeanreno hitmen action assassin assas...,4.0,4.0,0.678094,87.0


In [11]:
# Test data
test = construct_feature_table(tags.loc[tags.exist_in_train == False], ratings_test, movie_group)
test.head()

30836
30836


Unnamed: 0,userId,movieId,target_rating,user_tags_for_film,genres,film_tags,film_mean,film_median,film_var,film_count
0,298,33495,0.5,none,comedy,none,3.0,3.0,0.5,6.0
1,541,480,5.0,none,action adventure scifi thriller,dinosaur,3.5,4.0,0.766642,149.0
2,562,5294,5.0,none,crime drama thriller,religion,3.5,4.0,0.3125,9.0
3,91,5346,2.5,none,drama romance,none,1.0,1.0,-1.0,1.0
4,387,31590,3.0,none,crime drama thriller,none,0.0,-1.0,-1.0,-1.0


In [12]:
# Step 4. TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.compose import ColumnTransformer # for commiting CountVectorizer on 2 column

text_fetures = ['genres', 'film_tags', 'user_tags_for_film']
num_features = ['userId', 'movieId', 'target_rating', 'film_mean', 'film_median', 'film_var', 'film_count']


ct = ColumnTransformer([("cv1", CountVectorizer(), 'genres'), ("cv2", CountVectorizer(), 'film_tags'), ("cv3", CountVectorizer(), 'user_tags_for_film')])
tfidf_vector = TfidfTransformer()


X_train = ct.fit_transform(features[text_fetures])
X_train = tfidf_vector.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns=ct.get_feature_names_out()).join(features[num_features])

X_test = ct.transform(test[text_fetures])
X_test = tfidf_vector.transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=ct.get_feature_names_out()).join(test[num_features])


In [13]:
X_train

Unnamed: 0,cv1__action,cv1__adventure,cv1__animation,cv1__children,cv1__comedy,cv1__crime,cv1__documentary,cv1__drama,cv1__fantasy,cv1__filmnoir,...,cv3__wrongfulimprisonment,cv3__youngermen,cv3__zombies,userId,movieId,target_rating,film_mean,film_median,film_var,film_count
0,0.000000,0.000000,0.000000,0.000000,0.585075,0.000000,0.0,0.564863,0.000000,0.0,...,0.0,0.0,0.0,440,194,2.5,3.5,3.5,1.469697,12.0
1,0.000000,0.364642,0.549536,0.509274,0.000000,0.000000,0.0,0.000000,0.470131,0.0,...,0.0,0.0,0.0,177,65261,4.0,4.0,4.0,1.277778,9.0
2,0.241283,0.267957,0.000000,0.000000,0.214982,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,121,380,3.0,3.5,3.5,0.858457,123.0
3,0.000000,0.000000,0.000000,0.000000,0.200354,0.000000,0.0,0.000000,0.321969,0.0,...,0.0,0.0,0.0,19,2100,3.0,3.0,3.0,0.813063,37.0
4,0.050477,0.000000,0.000000,0.000000,0.000000,0.064717,0.0,0.043421,0.000000,0.0,...,0.0,0.0,0.0,325,293,3.0,4.0,4.0,0.678094,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.285769,0.000000,0.0,...,0.0,0.0,0.0,111,56757,3.5,3.5,4.0,0.746047,23.0
69996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.156583,0.000000,0.0,...,0.0,0.0,0.0,423,1653,5.0,4.0,4.0,0.722531,59.0
69997,0.335162,0.000000,0.000000,0.000000,0.000000,0.429712,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,387,52281,4.0,3.5,4.0,1.222222,18.0
69998,0.257336,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,179,377,5.0,3.5,3.5,0.896387,125.0


In [14]:
# cute inition 
X_train, X_test, y_train, y_test = X_train.drop(['target_rating'], axis=1), X_test.drop(['target_rating'], axis=1), X_train['target_rating'], X_test['target_rating']

In [15]:
### Step 5. Bayer's optimize using hyperopt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from hyperopt import hp, rand, fmin, tpe, Trials, STATUS_OK


list_n_estimators = list(range(1,40, 10))
list_criterion = ['squared_error', 'absolute_error']
list_max_depth = list(range(2,8,2))
list_min_samples_split = list(range(2,4,1))
list_min_samples_leaf = list(range(2,4,1))


#пространство поиска параметров для hyperopt
search_space = [
    hp.choice(label='n_estimators',options=list_n_estimators),
    hp.choice(label='criterion', options=list_criterion),
    hp.choice(label='max_depth', options=list_max_depth),
    hp.choice(label='min_samples_split', options=list_min_samples_split),
    hp.choice(label='min_samples_leaf', options=list_min_samples_leaf),
]

# sample позвоялет оценить пространство параметров.
from hyperopt.pyll.stochastic import sample

    
#необходимо создать функцию
def objective(params):
  svr = RandomForestRegressor(n_estimators=params[0], criterion=params[1], max_depth=params[2], min_samples_split=params[3], min_samples_leaf=params[4],  random_state=42) 
  scores = cross_val_score(svr, X_train, y_train, cv=3, scoring="neg_mean_squared_error")
  return scores.mean()


tpe_algo = tpe.suggest
tpe_trials = Trials()  

tpe_best = fmin(fn=objective, space=search_space, algo=tpe_algo, trials=tpe_trials, 
                max_evals=3)

print(tpe_best)

100%|██████████| 3/3 [30:42<00:00, 614.01s/trial, best loss: -0.8039785690016585]
{'criterion': 1, 'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 0}


In [17]:
### Bagging
from sklearn.ensemble import  BaggingRegressor
from sklearn.metrics import mean_squared_error

bagging = BaggingRegressor(RandomForestRegressor( n_estimators=list_n_estimators[tpe_best.get('n_estimators')],
                                                  criterion=list_criterion[tpe_best.get('criterion')],
                                                  max_depth=list_max_depth[tpe_best.get('max_depth')],
                                                  min_samples_split=list_min_samples_split[tpe_best.get('min_samples_split')],
                                                  min_samples_leaf=list_min_samples_leaf[tpe_best.get('min_samples_leaf')],
                                                  random_state=42
                                                ),
                           n_estimators=10,
                           max_samples=0.5,
                           max_features=0.5,
                           random_state=10)


bagging.fit(X_train, y_train)
predict = bagging.predict(X_test)
mean_squared_error(predict, y_test)

1.2074788193994033