In [80]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [81]:
links = pd.read_csv('../data/links.csv')
movies = pd.read_csv('../data/movies.csv')
ratings = pd.read_csv('../data/ratings.csv')
tags = pd.read_csv('../data/tags.csv')

In [82]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [83]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [84]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [85]:
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [86]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [87]:
X_train_counts

<58098x20 sparse matrix of type '<class 'numpy.int64'>'
	with 106107 stored elements in Compressed Sparse Row format>

In [88]:
count_vect.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 15,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 17,
 'horror': 10,
 'mystery': 13,
 'scifi': 16,
 'imax': 11,
 'documentary': 6,
 'war': 18,
 'musical': 12,
 'western': 19,
 'filmnoir': 9,
 'nogenreslisted': 14}

In [89]:
X_train_counts.toarray()

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [90]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [91]:
## Альтернативный способ
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(movie_genres)

In [92]:
tfidf_vectorizer.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 15,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 17,
 'horror': 10,
 'mystery': 13,
 'scifi': 16,
 'imax': 11,
 'documentary': 6,
 'war': 18,
 'musical': 12,
 'western': 19,
 'filmnoir': 9,
 'nogenreslisted': 14}

In [93]:
movies_to_ganre = X_train_tfidf.toarray()

In [94]:
X_train_tfidf.toarray()

array([[0.        , 0.44222387, 0.49338636, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.53634667, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.55960809, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.6207864 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [95]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [96]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

X_tfidf2 = tfidf_vectorizer.transform([test])

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [97]:
res

(array([[0.        , 0.        , 0.        , 0.42056775, 0.51729616,
         0.51729616, 0.51729616]]),
 array([[29183, 29184, 29185, 12784, 19038, 38217, 16496]]))

In [98]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
29183,127040,Fantomas (Fantômas) (1964),Adventure|Comedy|Crime|Fantasy
29184,127042,Fantomas Unleashed (Fantômas se déchaîne) (1965),Adventure|Comedy|Crime|Fantasy
29185,127044,Fantomas vs. Scotland Yard (Fantômas contre Sc...,Adventure|Comedy|Crime|Fantasy
12784,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
19038,94015,Mirror Mirror (2012),Adventure|Comedy|Fantasy
38217,148886,Dinosaur Island (1994),Adventure|Comedy|Fantasy
16496,82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy


## SMF DIF

In [99]:
links = pd.read_csv('../data/links.csv')
movies = pd.read_csv('../data/movies.csv')
ratings = pd.read_csv('../data/ratings.csv')
tags = pd.read_csv('../data/tags.csv')

In [100]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [101]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [102]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [103]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [104]:
movies_with_tags[movies_with_tags.title == 'Jumanji (1995)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,Robin Williams,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,474.0,game,1137376000.0


In [105]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [106]:
movies_with_tags.dropna(inplace=True)

In [107]:
movies_with_tags.title.unique().shape

(1572,)

In [108]:
tag_strings = []
movies = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

HBox(children=(IntProgress(value=0, max=1572), HTML(value='')))




In [109]:
tag_strings[:5]

['artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'lawyers',
 'creepy suspense',
 'Shakespearesortof',
 'dogs remake']

In [110]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [111]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [112]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='manhattan',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [113]:
for i in range(len(movies)):
    if 'Magnolia (1999)' == movies[i]:
        print(i)

822


In [114]:
tag_strings[822]

'L.A.'

In [115]:
test = change_string('fantasy magic board game Robin Williams game')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [116]:
res

(array([[0., 0., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[661, 822, 947, 955, 954, 953, 951, 950, 949, 959]]))

In [117]:
for i in res[1][0]:
    print(movies[i])

In a Lonely Place (1950)
Magnolia (1999)
Neon Genesis Evangelion: Death & Rebirth (Shin seiki Evangelion Gekijô-ban: Shito shinsei) (1997)
Night and Day (1946)
Nicholas Nickleby (2002)
Niagara (1953)
Never Been Kissed (1999)
Network (1976)
Net, The (1995)
Night of the Hunter, The (1955)


## SMF DIF about start date

In [118]:
movies = pd.read_csv('../data/movies.csv')

In [119]:
movies[:10]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [120]:
tfidfs_on_genre = X_train_tfidf.toarray()

In [121]:
tfidfs_on_genre

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.42034242],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [122]:
for x in range(tfidfs_on_genre.shape[1]):
    col_name = 'g{}'.format(x)
    movies[col_name] = pd.Series(tfidfs_on_genre[:, x])

In [123]:
movies[:10]

Unnamed: 0,movieId,title,genres,g0,g1,g2,g3,g4,g5,g6,...,g1462,g1463,g1464,g1465,g1466,g1467,g1468,g1469,g1470,g1471
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck (1995),Adventure|Children,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
def extract_year(s):
    try:
        return int(s[-5:-1])
    except:
        return 0

In [125]:
movies['year'] = movies['title'].apply(extract_year)

In [126]:
movies[:10]

Unnamed: 0,movieId,title,genres,g0,g1,g2,g3,g4,g5,g6,...,g1463,g1464,g1465,g1466,g1467,g1468,g1469,g1470,g1471,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995
5,6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995
6,7,Sabrina (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995
7,8,Tom and Huck (1995),Adventure|Children,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995
8,9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995
9,10,GoldenEye (1995),Action|Adventure|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995


In [127]:
movies = pd.get_dummies(movies, columns=['year'])

In [128]:
movies.head()

Unnamed: 0,movieId,title,genres,g0,g1,g2,g3,g4,g5,g6,...,year_2009,year_2010,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [129]:
to_train = movies.columns

In [130]:
to_train = [s for i, s in enumerate(movies.columns) if i > 1]

In [131]:
to_train_df = movies[to_train]

In [132]:
to_train_df.fillna(0)

Unnamed: 0,genres,g0,g1,g2,g3,g4,g5,g6,g7,g8,...,year_2009,year_2010,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018
0,Adventure|Animation|Children|Comedy|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,Adventure|Children|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,Comedy|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Comedy|Drama|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,(no genres listed),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
58094,Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
58095,Drama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
58096,Adventure|Drama|Horror|Sci-Fi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [133]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(to_train_df)

In [134]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [135]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='manhattan',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [136]:
test = change_string('Adventure|Animation|Children|Comedy|Fantasy')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [138]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres,g0,g1,g2,g3,g4,g5,g6,...,year_2009,year_2010,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018
1075,1097,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1076,1098,"Search for One-eye Jimmy, The (1996)",Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1072,1094,"Crying Game, The (1992)",Drama|Romance|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1073,1095,Glengarry Glen Ross (1992),Drama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1074,1096,Sophie's Choice (1982),Drama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1070,1092,Basic Instinct (1992),Crime|Mystery|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1079,1101,Top Gun (1986),Action|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1078,1100,Days of Thunder (1990),Action|Drama|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1068,1090,Platoon (1986),Drama|War,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1077,1099,"Christmas Carol, A (1938)",Children|Drama|Fantasy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
