In [49]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
import pickle

import pandas as pd
import re
from pandas.io.json import json_normalize
import json

In [52]:
def convert_ids(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('int64')

def convert_to_float(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('float64')

def to_json(csv_entry):
    return json.loads(re.sub('\'', '"', csv_entry))

movies_metadata_df = pd.read_csv('../data/the-movies-dataset/movies_metadata.csv'
                                 , converters={ 'id': lambda x: convert_ids(x)
                                               , 'imdb_id': lambda x: convert_ids(x)
                                               ,'popularity': lambda x: convert_to_float(x)
                                               ,'genres': lambda x: to_json(x)}
                                 , usecols=['id', 'original_title'
                                                , 'genres', 'homepage'
                                                , 'overview', 'popularity', 'poster_path'
                                                , 'release_date', 'revenue', 'runtime'
                                                , 'spoken_languages', 'tagline', 'title'
                                                , 'vote_average', 'vote_count']
                                , dtype={'populariy': np.float64}
                                , parse_dates=True)


In [106]:
mini_df = movies_metadata_df[['id', 'popularity', 'release_date', 'revenue', 'runtime', 'vote_average', 'genres']]

In [107]:
mini_df['release_year'] = pd.to_datetime(movies_metadata_df['release_date'], errors='coerce').apply(lambda x: x.year)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [64]:
mini_df.head()

Unnamed: 0,popularity,release_date,revenue,runtime,vote_average,genres,release_year
0,21.946943,1995-10-30,373554033.0,81.0,7.7,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995.0
1,17.015539,1995-12-15,262797249.0,104.0,6.9,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995.0
2,11.7129,1995-12-22,0.0,101.0,6.5,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995.0
3,3.859495,1995-12-22,81452156.0,127.0,6.1,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995.0
4,8.387519,1995-02-10,76578911.0,106.0,5.7,"[{'id': 35, 'name': 'Comedy'}]",1995.0


In [80]:
[list(item.values())[0] for item in mini_df[['genres']].loc[0][0]]

[16, 35, 10751]

In [66]:
json_normalize(mini_df[['genres']].loc[0][0])

Unnamed: 0,id,name
0,16,Animation
1,35,Comedy
2,10751,Family


In [67]:
json_normalize(mini_df[['genres']].loc[1][0])

Unnamed: 0,id,name
0,12,Adventure
1,14,Fantasy
2,10751,Family


In [68]:
json_normalize(mini_df[['genres']].loc[2][0])

Unnamed: 0,id,name
0,10749,Romance
1,35,Comedy


In [69]:
pd.get_dummies(mini_df['genres'])

TypeError: unhashable type: 'list'

In [108]:
mini_df['genre_ids'] = mini_df['genres'].apply(lambda x: [list(item.values())[0] for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


TypeError: '<' not supported between instances of 'str' and 'int'

In [88]:
for i, lst in enumerate(mini_df.genre_ids):
    for j, item in enumerate(lst):
        if type(item) == str:
            print(f'{type(item)} found at series index{i}, array index{j}')

<class 'str'> found at series index19730, array index0
<class 'str'> found at series index19730, array index1
<class 'str'> found at series index19730, array index2
<class 'str'> found at series index29503, array index0
<class 'str'> found at series index29503, array index1
<class 'str'> found at series index29503, array index2
<class 'str'> found at series index29503, array index3
<class 'str'> found at series index29503, array index4
<class 'str'> found at series index35587, array index0
<class 'str'> found at series index35587, array index1
<class 'str'> found at series index35587, array index2
<class 'str'> found at series index35587, array index3


In [109]:
mini_df_dropped_na = mini_df.dropna()

In [97]:
mini_df_dropped_na.shape

(45130, 8)

In [110]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
encoded_categories = mlb.fit_transform(mini_df_dropped_na['genre_ids'])

In [111]:
encoded_categories_df = pd.DataFrame(encoded_categories)

In [112]:
encoded_categories_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
7,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
8,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [114]:
content_filter_df = pd.concat([mini_df_dropped_na, encoded_categories_df], axis=1)

In [115]:
content_filter_df.to_pickle('../data/content_filter_df.pkl')

In [None]:
content_filter_df