<a href="https://colab.research.google.com/github/ErikSeguinte/movie_data/blob/master/processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import cufflinks as cf
import numpy as np
from plotly import graph_objs as go
import altair as alt

In [None]:
def enable_plotly_in_cell():
    """Prepare the current output cell for offline Plotly rendering.

    Emits a <script> tag that loads require.js from the notebook server's
    static path, then initialises Plotly's offline notebook mode with
    ``connected=False``. Takes no arguments and returns nothing; its only
    effect is the output it writes into the calling cell.
    """
    # Import display/HTML explicitly: the bare name `display` only exists as
    # an injected global inside IPython kernels, and IPython.display is the
    # public location of HTML (IPython.core.display is an internal path).
    from IPython.display import HTML, display
    from plotly.offline import init_notebook_mode

    display(HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
    init_notebook_mode(connected=False)

In [None]:
# Configure cufflinks with offline=True; presumably this lets the .iplot()
# calls further down render without a Plotly cloud account — TODO confirm.
cf.set_config_file(offline=True)

* I previously pulled CSV files from Kaggle, but the files were too big to host on github.
* I imported the files I wanted into pandas, and then exported them back out as compressed pickles.
* This let me shrink a 700 MB CSV down to a 3 MB pickle.

In [None]:
try: 
    movies = pd.read_pickle('data/movies.pkl.xz')
    ratings = pd.read_pickle('data/ratings2.pkl.xz')
except:
    # Download pickles from github
    !wget https://github.com/ErikSeguinte/movie_data/raw/master/data/ratings2.pkl.xz
    !wget https://github.com/ErikSeguinte/movie_data/raw/master/data/movies.pkl.xz
    # Unpickle dataframes
    movies = pd.read_pickle('movies.pkl.xz')
    ratings = pd.read_pickle('ratings2.pkl.xz')

In [None]:
# Show a single row to see which columns the movies frame carries.
movies.head(1)

In [None]:
# (rows, columns) of the individual-ratings frame.
ratings.shape

In [None]:
# (rows, columns) of the movies frame before any cleaning.
movies.shape

## Clean Movie DF

In [None]:
# List each column's dtype to spot columns that were not parsed as numbers.
movies.dtypes

* The movies DataFrame has malformed data: `id` should be numeric.
* After inspection, it looks like there are rows that are missing a comma somewhere, making columns not line up, and adding the wrong data to columns. Let's clean those up.
* All malformed rows have strings for IDs instead of numeric, so we will coerce them into numeric columns, and strings will be returned as `NaN`, which we'll then drop.

* `budget` and `revenue` should also be numeric, but rows where they end up as `NaN` won't be dropped.






In [None]:
# Parse `id` as a number (anything unparseable becomes NaN), discard the rows
# where that failed, and promote `id` to the frame's index.
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
    .set_index('id')
)

In [None]:
def to_numeric(df, labels):
    """Coerce the given columns of ``df`` to a numeric dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    labels : iterable of str
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be reassigned.

    Notes
    -----
    Values that cannot be parsed become NaN (``errors='coerce'``); no rows
    are dropped.
    """
    for label in labels:
        # Read from the `df` argument, not the notebook-global `movies`:
        # the old code only worked when the caller happened to pass `movies`.
        df[label] = pd.to_numeric(df[label], errors='coerce')
    return df

In [None]:
# Convert the money and score columns to numbers; entries that cannot be
# parsed become NaN and their rows are kept.
movies = to_numeric(movies, ['budget', 'revenue', 'vote_average'])

In [None]:
# Parse release dates into datetimes so the `.dt` accessor can be used later.
# `infer_datetime_format` is no longer passed: it is deprecated as of
# pandas 2.0, where consistent-format inference is already the default.
movies['release_date'] = pd.to_datetime(movies['release_date'])

In [None]:
# Keep only the columns the rest of the analysis works with.
analysis_columns = [
    'title',
    'release_date',
    'budget',
    'revenue',
    'runtime',
    'vote_average',
    'vote_count',
]
clean_movies = movies[analysis_columns]

In [None]:
# Preview the first rows of the trimmed movies frame.
clean_movies.head()

## Process User Reviews
* User reviews come in a collection of individual reviews where a review gives a movie a score of 1 to 5.
* We will take the mean ratings for each movie

In [None]:
# Preview the individual ratings; the full frame is too large to be worth
# rendering in the notebook output.
ratings.head()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the per-movie mean user rating.
mean_by_movie = ratings.groupby('movieId')[['rating']].mean()
scaler = StandardScaler()
mean_rating = pd.DataFrame(
    scaler.fit_transform(mean_by_movie),
    # fit_transform returns a bare ndarray; pass the index back in so each
    # row stays labelled with its movieId instead of a fresh 0..n-1 range.
    index=mean_by_movie.index,
    columns=["rating"],
)

In [None]:
# Preview the standardised per-movie means rather than rendering every row.
mean_rating.head()

In [None]:
# Per-movie summary of the user ratings: the mean score (`rating`) and the
# number of ratings that went into it (`num_votes`).
rating_stats = ratings.groupby('movieId')['rating'].agg(['mean', 'count'])
movie_ratings = rating_stats.rename(columns={'mean': 'rating', 'count': 'num_votes'})

* Let's drop any movies with fewer than 1000 votes. Averages over small samples are more easily swayed by outliers and aren't reliable.

In [None]:
# Discard movies whose mean is based on fewer than 1000 user ratings.
has_too_few_votes = movie_ratings['num_votes'] < 1000
movie_ratings = movie_ratings[~has_too_few_votes]

* And now we merge the averaged ratings back with the movie database.
* Note that not all movies are present in the user votings.

In [None]:
# Attach the aggregated user ratings to the movie metadata, matching rows on
# the two frames' indexes. Only movies present in both frames are kept.
# NOTE(review): this matches the movies' `id` index against the ratings'
# `movieId` index — confirm both use the same identifier scheme.
movie_ratings = pd.merge(
    clean_movies,
    movie_ratings,
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Ten movies with the highest mean user rating. `nlargest` already returns
# its rows in descending order, so sorting the whole frame first is redundant.
movie_ratings[['title', 'rating']].nlargest(10, 'rating')

* The Movie Database (TMDB) also provides a rating, and it suffers from a similar problem: some movies have a tiny sample size.

In [None]:
# Ten movies with the highest `vote_average`, shown with their vote counts.
# `nlargest` already orders its result, so no separate sort is needed.
movie_ratings[['title', 'vote_average', 'vote_count']].nlargest(10, 'vote_average')

In [None]:
# Remove movies whose `vote_count` is below 1000. Rows with a missing
# vote_count compare as False here and are therefore kept.
has_few_tmdb_votes = movie_ratings['vote_count'] < 1000
movie_ratings = movie_ratings[~has_few_tmdb_votes]

In [None]:
# Check the dtypes of the merged frame before computing correlations.
movie_ratings.dtypes

In [None]:
# Pairwise correlations between the numeric columns. The frame also holds
# `title` (text) and `release_date` (datetime); pandas 2.x raises on such
# columns instead of silently skipping them, so select numeric columns first.
movie_ratings.select_dtypes(include='number').corr()

In [None]:
# Adding a year and decade to examine trends over time.
# `assign` builds a new frame rather than writing a column into the result of
# the earlier boolean filter, which avoids pandas' SettingWithCopyWarning.
movie_ratings = movie_ratings.assign(year=movie_ratings['release_date'].dt.year)

In [None]:
# Round each year down to the start of its decade (e.g. 1994 -> 1990).
# Vectorised arithmetic replaces the Python-level loop; missing years stay NaN.
movie_ratings = movie_ratings.assign(
    decade=movie_ratings['year'] - movie_ratings['year'] % 10
)

In [None]:
#enable_plotly_in_cell()
# Bar chart of the mean `vote_average` for each release year.
yearly_vote_average = movie_ratings.groupby('year')['vote_average'].mean()
yearly_vote_average.iplot(kind='bar')

In [None]:
# The same yearly mean of `vote_average`, drawn with Altair; the mean is
# computed by the chart's own `mean(...)` aggregate.
yearly_bars = alt.Chart(movie_ratings).mark_bar()
yearly_bars.encode(
    x=alt.X('year'),
    y=alt.Y('mean(vote_average)'),
)

In [None]:
# Preview the merged frame with its new year/decade columns instead of
# rendering every row.
movie_ratings.head()

In [None]:
# enable_plotly_in_cell()
# Bar chart of the mean user rating for each decade.
decade_mean_rating = movie_ratings.groupby('decade')['rating'].mean()
decade_mean_rating.iplot(kind='bar')

* I'd like to compare the votes from TMDB to the user ratings, but they are on different scales. We'll use `StandardScaler` to normalize them so we can compare them more easily.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise `vote_average` and `rating` so the two scores share a scale,
# then attach them to movie_ratings as two new columns.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(movie_ratings[['vote_average', 'rating']])

# Reuse movie_ratings' index so the index-on-index merge lines rows up.
scaled_scores = pd.DataFrame(
    scaled_values,
    columns=['scaled_tmdb_vote', 'scaled_user_rating'],
    index=movie_ratings.index,
)

movie_ratings = movie_ratings.merge(scaled_scores, left_index=True, right_index=True)

In [None]:
# Yearly means of both standardised scores, drawn in one bar chart.
scaled_columns = ['scaled_user_rating', 'scaled_tmdb_vote']
yearly_scaled_means = movie_ratings.groupby('year')[scaled_columns].mean()
yearly_scaled_means.iplot(kind='bar')

In [None]:
# Mean `vote_average` per release year, shown as a Series.
movie_ratings.groupby('year')['vote_average'].mean()

In [None]:
# The years present after grouping; these become the x values in the chart below.
movie_ratings.groupby('year')['scaled_tmdb_vote'].mean().index

In [None]:
# Bars of the yearly mean of each standardised score, one trace per source.
# The groupby is computed once instead of four times.
yearly_scaled = movie_ratings.groupby('year')[['scaled_tmdb_vote', 'scaled_user_rating']].mean()

traces = [
    go.Bar(
        name='TMDB rating',
        x=yearly_scaled.index,
        y=yearly_scaled['scaled_tmdb_vote'],
    ),
    # This trace plots the user ratings; it used to carry the copy-pasted
    # name 'TMDB rating', which made the legend show two identical entries.
    go.Bar(
        name='User rating',
        x=yearly_scaled.index,
        y=yearly_scaled['scaled_user_rating'],
    ),
]

go.Figure(
    data=traces,
    layout_xaxis_tick0=1890,
)

In [None]:
# enable_plotly_in_cell()
# Box plot of user ratings per decade, using only rows that have a decade.
rows_with_decade = movie_ratings[movie_ratings['decade'].notnull()]
trace = go.Box(x=rows_with_decade['decade'], y=rows_with_decade['rating'])

go.Figure(
    trace,
    layout_title="Movie Ratings by decade",
    layout_xaxis_title="Decade",
    layout_yaxis_title="Movie Rating",
)


In [None]:
# Bucket budgets into five equal-sized quantile bins, labelled low to high.
# NOTE(review): qcut raises when bin edges are not unique (for instance when
# many rows share the same budget) — confirm this cannot happen with this data.
budget_labels = ['vlow', 'low', 'med', 'high', 'vhigh']
movie_ratings['q_budget'] = pd.qcut(
    movie_ratings['budget'],
    q=len(budget_labels),
    labels=budget_labels,
)

In [None]:
# Subset used for the money-vs-rating comparison; any row with a missing
# value in one of these columns is dropped.
budget_columns = ['title', 'budget', 'revenue', 'rating']
budget_ratings = movie_ratings[budget_columns].dropna(how='any')

In [None]:
# Correlations between budget, revenue and rating. `title` is text and makes
# pandas 2.x raise inside corr(), so only numeric columns are passed on.
budget_ratings.select_dtypes(include='number').corr()

In [None]:
# Scatter of user rating against budget. The x values previously came from
# the `revenue` column while the axis label and title both said "budget";
# the data now matches the labels.
trace = go.Scatter(
    x=budget_ratings['budget'],
    y=budget_ratings['rating'],
    mode='markers',
)

go.Figure(
    trace,
    layout_xaxis_title="Budget",
    layout_yaxis_title="Movie Rating",
    layout_title="Movie Ratings by budget",
)

In [None]:
# (rows, columns) of movie_ratings after the filters and added columns above.
movie_ratings.shape

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA

In [None]:
# The two rating columns, restricted to rows where both are present.
all_ratings = movie_ratings[['vote_average', 'rating']].dropna()
# Preview only; rendering the whole frame adds nothing to the narrative.
all_ratings.head()

In [None]:
# Count the missing values per column; after dropna() both should be 0.
all_ratings.isnull().sum()

In [None]:
# Fit a fresh StandardScaler on the two rating columns and standardise them.
scaler = StandardScaler().fit(all_ratings)
x = scaler.transform(all_ratings)

In [None]:
# PCA that keeps a single component.
pca = PCA(n_components=1)

In [None]:
# Reduce the two standardised rating columns to one principal component.
# The input is rebuilt from all_ratings on every run: `x = pca.fit_transform(x)`
# fed the cell's own output back in whenever it was executed a second time.
x = pca.fit_transform(scaler.transform(all_ratings))

In [None]:
# Wrap the PCA output in a frame that shares all_ratings' index, so it can be
# merged back onto movie_ratings.
scaled_ratings = pd.DataFrame(
    data=x,
    columns=['scaled_rating'],
    index=all_ratings.index,
)

In [None]:
# Preview the combined score instead of rendering every row.
scaled_ratings.head()

In [None]:
# Attach the combined `scaled_rating` column, matching rows on index. Rows
# without a combined score (dropped by the earlier dropna) are left out.
movie_ratings = pd.merge(
    movie_ratings,
    scaled_ratings,
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Correlations between the numeric columns, now including the scaled scores.
# Text, datetime and categorical columns are excluded up front because
# pandas 2.x raises on them inside corr().
movie_ratings.select_dtypes(include='number').corr()

In [None]:
enable_plotly_in_cell()
# Bar chart of the mean combined score for each release year.
yearly_scaled_rating = movie_ratings.groupby('year')['scaled_rating'].mean()
yearly_scaled_rating.iplot(kind='bar')

In [None]:
# The same yearly chart, this time with a title and axis labels.
labelled_bar_options = dict(
    kind='bar',
    title="Scaled Rating by Year",
    xTitle="year",
    yTitle="Scaled Rating",
)
movie_ratings.groupby('year')['scaled_rating'].mean().iplot(**labelled_bar_options)

In [None]:
# Bar chart of the mean combined score for each decade.
decade_scaled_rating = movie_ratings.groupby('decade')['scaled_rating'].mean()
decade_scaled_rating.iplot(kind='bar')

In [None]:
# Rebuild budget_ratings with just budget and rating, keeping complete rows only.
budget_and_rating = movie_ratings[['budget', 'rating']]
budget_ratings = budget_and_rating.dropna(how='any')

In [None]:
# Split the budgets into five quantile bins, labelled from cheapest to priciest.
tier_labels = ['vlow', 'low', 'med', 'high', 'blockbuster']
budget_ratings['q_budget'] = pd.qcut(
    budget_ratings['budget'],
    q=len(tier_labels),
    labels=tier_labels,
)


In [None]:
enable_plotly_in_cell()
# Bar chart of the mean user rating within each budget tier.
tier_mean_rating = budget_ratings.groupby('q_budget')['rating'].mean()
tier_mean_rating.iplot(kind='bar')

In [None]:
# The 25 rows with the highest mean user rating, all columns shown.
movie_ratings.nlargest(25, 'rating')

In [None]:
enable_plotly_in_cell()
# Scatter of revenue against budget, one marker per movie.
trace = go.Scatter(
    x=movie_ratings["budget"],
    y=movie_ratings['revenue'],
    mode="markers",
)

# A title and axis labels let the figure stand on its own when skimmed,
# matching the labelled figures earlier in the notebook.
go.Figure(
    trace,
    layout_xaxis_title="Budget",
    layout_yaxis_title="Revenue",
    layout_title="Revenue by budget",
)