In [1]:
from pathlib import Path

import pandas as pd

# Transformer todos
## General
- Check each DataFrame for NaN values and remove the affected movie IDs from the other DataFrames as well
- Pay particular attention to the links DataFrame

- 

## Matrix factorization
- Create the matrix used for non-negative matrix factorization (NMF) from the crew and cast data


In [32]:
class DataTransformer():
    """Load the MovieLens and TMDB CSV files and serve them as DataFrames.

    On construction the ratings, movies, links, cast/crew and poster-link
    CSVs are read, the release year is split out of each movie title, and the
    pipe-separated ``genres`` column is replaced by a separate one-hot table.
    The DataFrame and list getters return copies, so callers cannot mutate
    the cached data.
    """

    def __init__(self,
                 movie_lens_data_path="../data/ml-latest-small/",
                 tmdb_data_path="../data/tmdb/"):
        """Read every CSV and build the derived genre structures.

        Args:
            movie_lens_data_path: Directory holding ``ratings.csv``,
                ``movies.csv`` and ``links.csv``. A trailing slash is optional.
            tmdb_data_path: Directory holding ``movie_cast_and_crew.csv`` and
                ``poster_links.csv``. A trailing slash is optional.
        """
        self.movie_lens_data_path = movie_lens_data_path
        self.tmdb_data_path = tmdb_data_path

        self._ratings_df = pd.DataFrame()
        self._movies_df = pd.DataFrame()
        self._links_df = pd.DataFrame()
        self._poster_links = pd.DataFrame()
        self._cast_and_crew_df = pd.DataFrame()
        # Older name for the cast/crew frame; kept in sync with
        # ``_cast_and_crew_df`` so either attribute holds the loaded data.
        self._movie_cast_and_crew = self._cast_and_crew_df
        self._genres_encoded_df = pd.DataFrame()
        self._genres_list = []

        self._load_movie_lens_data()
        self._load_tmdb_data()
        self._create_binary_encoded_genres()
        self._create_genres_list()

    def _load_movie_lens_data(self):
        """Read the ratings, movies and links CSVs from the MovieLens directory."""
        # Path joining works whether or not the directory ends in a separator.
        data_dir = Path(self.movie_lens_data_path)

        ratings = pd.read_csv(
            data_dir / 'ratings.csv',
            usecols=['userId', 'movieId', 'rating', 'timestamp'],
            dtype={'userId': int, 'movieId': int, 'rating': float, 'timestamp': 'int64'}
        )
        # Epoch seconds -> datetime in one vectorised call instead of a
        # Python-level converter invoked once per row.
        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
        self._ratings_df = ratings

        movies = pd.read_csv(
            data_dir / 'movies.csv',
            usecols=['movieId', 'title', 'genres'],
            dtype={'movieId': int, 'title': str, 'genres': str}
        )
        # Move the "(YYYY)" value out of the title into its own nullable-int
        # column; titles without a year get <NA>.
        year = movies["title"].str.extract(r"\((\d{4})\)", expand=False)
        movies["year"] = year.astype(float).astype("Int64")
        movies["title"] = movies["title"].str.replace(r"\(\d{4}\)", "", regex=True).str.strip()
        self._movies_df = movies

        self._links_df = pd.read_csv(
            data_dir / 'links.csv',
            usecols=['movieId', 'imdbId', 'tmdbId'],
            # IDs are kept as strings so leading zeros are not lost.
            dtype={'movieId': int, 'imdbId': str, 'tmdbId': str}
        )

    def _load_tmdb_data(self):
        """Read the cast/crew and poster-link CSVs from the TMDB directory."""
        data_dir = Path(self.tmdb_data_path)

        self._cast_and_crew_df = pd.read_csv(
            data_dir / 'movie_cast_and_crew.csv',
            usecols=['movieId', 'name', 'role'],
            dtype={'movieId': int, 'name': str, 'role': str}
        )
        # Keep the attribute declared in __init__ pointing at the loaded data
        # rather than at the empty placeholder frame.
        self._movie_cast_and_crew = self._cast_and_crew_df

        self._poster_links = pd.read_csv(
            data_dir / 'poster_links.csv',
            usecols=['movieId', 'poster_link'],
            dtype={'movieId': int, 'poster_link': str}
        )

    def _create_binary_encoded_genres(self):
        """Build the one-hot genre table and drop ``genres`` from the movies frame."""
        one_hot = self._movies_df['genres'].str.get_dummies(sep='|')
        self._genres_encoded_df = pd.concat([self._movies_df[['movieId']], one_hot], axis=1)
        # Reassign instead of mutating in place so no other reference to the
        # frame is changed behind the caller's back.
        self._movies_df = self._movies_df.drop(columns='genres')

    def _create_genres_list(self):
        """Collect the genre names, i.e. every one-hot column except ``movieId``."""
        self._genres_list = [genre for genre in self._genres_encoded_df.columns.values if genre != 'movieId']

    def _movie_ids_for_genre(self, genre):
        """Return the movieId Series of all movies tagged with ``genre``.

        Raises:
            KeyError: If ``genre`` is not one of the known genres.
        """
        if genre not in self._genres_list:
            raise KeyError(
                f"Unknown genre {genre!r}; valid genres: {', '.join(self._genres_list)}"
            )
        encoded = self._genres_encoded_df
        return encoded.loc[encoded[genre] == 1, 'movieId']

    def _filter_by_genre(self, df, genre):
        """Return a copy of ``df``, limited to movies of ``genre`` when one is given."""
        if not genre:
            return df.copy()
        return df[df['movieId'].isin(self._movie_ids_for_genre(genre))].copy()

    def get_ratings_df(self, genre=None):
        """Return a copy of the ratings, optionally limited to one genre.

        Raises:
            KeyError: If ``genre`` is given but unknown.
        """
        return self._filter_by_genre(self._ratings_df, genre)

    def get_movies_df(self, genre=None):
        """Return a copy of the movies (movieId, title, year), optionally limited to one genre.

        Raises:
            KeyError: If ``genre`` is given but unknown.
        """
        return self._filter_by_genre(self._movies_df, genre)

    def get_links_df(self, genre=None):
        """Return a copy of the links (movieId, imdbId, tmdbId), optionally limited to one genre.

        Raises:
            KeyError: If ``genre`` is given but unknown.
        """
        return self._filter_by_genre(self._links_df, genre)

    def get_cast_and_crew_df(self, genre=None):
        """Return a copy of the cast/crew rows (movieId, name, role), optionally limited to one genre.

        Raises:
            KeyError: If ``genre`` is given but unknown.
        """
        return self._filter_by_genre(self._cast_and_crew_df, genre)

    def get_poster_link(self, movieId):
        """Return the poster link stored for ``movieId``.

        Raises:
            IndexError: If no poster link row exists for ``movieId``.
        """
        matches = self._poster_links.loc[self._poster_links['movieId'] == movieId, 'poster_link']
        if matches.empty:
            raise IndexError(f"No poster link found for movieId {movieId}")
        return matches.iloc[0]

    def get_binary_encoded_genres_df(self):
        """Return a copy of the one-hot genre table (movieId plus one 0/1 column per genre)."""
        return self._genres_encoded_df.copy()

    def get_genres_list(self):
        """Return a copy of the list of genre names."""
        return self._genres_list.copy()

    def get_genre_movieIds(self, genre):
        """Return the movieId Series for ``genre``, or None if the genre is unknown."""
        if genre not in self._genres_list:
            # This is just for my own testing purposes. In the production version
            # the genres will be a hardcoded, in a drop-down for instance.
            print(f"Please specify a valid genre from the following list: {', '.join(self._genres_list)}")
            return None
        return self._filter_by_genre(self._movies_df, genre)['movieId']

In [33]:
# Build the transformer with its default data paths; the constructor reads
# all CSVs and builds the one-hot genre table.
dt = DataTransformer()

In [19]:
# movieIds of every movie tagged with the 'Action' genre.
dt.get_genre_movieIds('Action')

5            6
8            9
9           10
14          15
19          20
         ...  
9714    189547
9723    191005
9724    193565
9729    193581
9732    193587
Name: movieId, Length: 1828, dtype: int64

In [27]:
# Number of distinct values per column in the links rows of 'Action' movies.
dt.get_links_df('Action').nunique()

movieId    1828
imdbId     1828
tmdbId     1828
dtype: int64

In [22]:
# Movies tagged 'Action'; the returned frame is a copy of the cached data.
movies = dt.get_movies_df('Action')
movies

Unnamed: 0,movieId,title,year
5,6,Heat,1995
8,9,Sudden Death,1995
9,10,GoldenEye,1995
14,15,Cutthroat Island,1995
19,20,Money Train,1995
...,...,...,...
9714,189547,Iron Soldier,2010
9723,191005,Gintama,2017
9724,193565,Gintama: The Movie,2010
9729,193581,Black Butler: Book of the Atlantic,2017


In [25]:
# Ratings restricted to 'Action' movies, then the number of distinct movies
# that appear in those ratings.
ratings = dt.get_ratings_df('Action')
ratings.movieId.nunique()

1828

In [23]:
# One-hot genre table: movieId plus one 0/1 column per genre.
genres_encoded = dt.get_binary_encoded_genres_df()
genres_encoded

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9729,193581,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9730,193583,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9731,193585,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9732,193587,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Genre names, i.e. the one-hot column names without 'movieId'.
dt.get_genres_list()

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [34]:
# Poster link stored for movieId 1.
toy_story_poster_link = dt.get_poster_link(1)
toy_story_poster_link

'https://media.themoviedb.org/t/p/w300_and_h450_bestv2/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg'