<a href="https://colab.research.google.com/github/ErikSeguinte/movie_data/blob/master/processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import cufflinks as cf
import numpy as np
from plotly import graph_objs as go

In [None]:
def enable_plotly_in_cell():
  """Prepare the current output cell for offline plotly rendering.

  Emits a ``<script>`` tag that loads ``require.js`` from
  ``/static/components/requirejs/`` and then initialises plotly's offline
  notebook mode with ``connected=False``.
  """
  # Import `display` and `HTML` explicitly from the public module rather than
  # relying on the `display` name IPython injects into interactive namespaces
  # and on the internal `IPython.core.display` path.
  from IPython.display import HTML, display
  from plotly.offline import init_notebook_mode
  display(HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
  init_notebook_mode(connected=False)

In [None]:
# Switch cufflinks to offline plotting so `.iplot()` calls below do not need a
# plotly account. (Presumably written to cufflinks' config file — confirm.)
cf.set_config_file(offline=True)

* I previously pulled CSV files from Kaggle, but the files were too big to host on GitHub.
* I imported the files I wanted into pandas, and then exported them back out as compressed pickles.
* This compressed a 700 MB CSV down to a 3 MB pickle.

In [None]:
try: 
    movies = pd.read_pickle('data/movies.pkl.xz')
    ratings = pd.read_pickle('data/ratings.pkl.xz')
except:
    # Download pickles from github
    !wget https://github.com/ErikSeguinte/movie_data/raw/master/data/ratings.pkl.xz
    !wget https://github.com/ErikSeguinte/movie_data/raw/master/data/movies.pkl.xz
    # Unpickle dataframes
    movies = pd.read_pickle('movies.pkl.xz')
    ratings = pd.read_pickle('ratings.pkl.xz')

In [None]:
# Peek at a single row to see which columns the movies frame carries.
movies.head(1)

In [None]:
# Column dtypes of the ratings frame as loaded.
ratings.dtypes

In [None]:
# `timestamp` is interpreted as seconds since the Unix epoch (`unit='s'`).
# `infer_datetime_format` only affects string parsing, so it had no effect here,
# and it is deprecated in pandas 2.x; it is dropped to avoid the warning.
ratings['datetime'] = pd.to_datetime(ratings['timestamp'], unit='s')

In [None]:
# (rows, columns) of the ratings frame after adding `datetime`.
ratings.shape

In [None]:
# (rows, columns) of the movies frame before any cleaning.
movies.shape

In [None]:
# Column dtypes of the movies frame before any cleaning.
movies.dtypes

* The movies DataFrame has malformed data: `id` should be numeric.
* After inspection, it looks like some rows are missing a comma somewhere, so their columns do not line up and values end up in the wrong columns. Let's clean those up.
* All malformed rows have strings for IDs instead of numbers, so we coerce `id` to numeric; the strings become `NaN`, and we then drop those rows.

* `budget` and `revenue` should also be numeric; non-numeric values are coerced to `NaN`, but those rows are kept rather than dropped.






In [None]:
# Re-check the ratings dtypes (now including the derived `datetime` column).
ratings.dtypes

In [None]:
# Coerce `id` to numeric (non-numeric strings become NaN), drop those rows,
# store the IDs as nullable integers and use them as the index.
# Built as one chain so every step returns a fresh frame: the previous
# filter-then-assign sequence wrote into a slice and raised
# SettingWithCopyWarning.
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': 'Int64'})
    .set_index('id')
)

In [None]:
# Parse `budget` as a number; anything unparseable becomes NaN.
budget_numeric = pd.to_numeric(movies['budget'], errors='coerce')
movies = movies.assign(budget=budget_numeric)

In [None]:
# Parse `revenue` as a number; anything unparseable becomes NaN.
revenue_numeric = pd.to_numeric(movies['revenue'], errors='coerce')
movies = movies.assign(revenue=revenue_numeric)

In [None]:
# Frequency of each distinct revenue value.
movies['revenue'].value_counts()

In [None]:
# Treat zeros as missing values across the frame.
# `0` and `0.0` are the same dict key, so the previous mapping had one entry.
# Using NaN (not None) as the replacement keeps numeric columns numeric;
# replacing with None can turn them into object dtype.
movies = movies.replace(0, np.nan)

In [None]:
# Parse release dates into datetimes. `infer_datetime_format` is deprecated in
# pandas 2.x (format inference is now the default), so it is no longer passed.
movies['release_date'] = pd.to_datetime(movies['release_date'])

In [None]:
# Columns carried forward into the analysis; everything else is left behind.
analysis_columns = [
    'title',
    'release_date',
    'budget',
    'revenue',
    'runtime',
    'vote_average',
]
clean_movies = movies[analysis_columns]

In [None]:
# Preview the trimmed movies frame.
clean_movies.head()

In [None]:
# Mean user rating per movie; `movieId` stays a regular column.
ratings_by_movie = ratings.groupby('movieId', as_index=False)
mean_rating = ratings_by_movie[['rating']].mean()

In [None]:
# Median user rating per movie; `movieId` stays a regular column.
ratings_by_movie = ratings.groupby('movieId', as_index=False)
median_rating = ratings_by_movie[['rating']].median()


In [None]:
# One row per movie with both aggregates: `rating_mean` and `rating_median`.
avg_ratings = pd.merge(
    mean_rating,
    median_rating,
    on="movieId",
    suffixes=('_mean', '_median'),
)

In [None]:
# Attach the aggregated ratings to the movie metadata. `id` is the index level
# of `clean_movies` (set via `set_index('id')`); `movieId` is a column of
# `avg_ratings`.
# NOTE(review): this assumes `id` and `movieId` share one ID namespace — confirm
# against the data source, since no mapping table is applied here.
# NOTE(review): the result appears to get a fresh positional index and keeps
# `movieId` as a column; use that column, not the index, for movie IDs.
movie_ratings = pd.merge(
    clean_movies,
    avg_ratings,
    left_on='id',
    right_on='movieId',
)

In [None]:
# The ten movies with the highest mean rating.
movie_ratings.nlargest(10, 'rating_mean')

In [None]:
# Release year as a nullable integer, so missing dates stay missing instead of
# forcing the column to float.
release_year = movie_ratings['release_date'].dt.year
movie_ratings['year'] = release_year.astype('Int64')

In [None]:
# IDs of the five movies with the highest mean rating.
# The merge that built `movie_ratings` joined an index level to a column, which
# leaves a positional index on the result. Its index labels are therefore row
# numbers, not movie IDs, so the IDs are read from the `movieId` column.
top = [int(movie_id) for movie_id in movie_ratings.nlargest(5, 'rating_mean')['movieId']]

In [None]:
# enable_plotly_in_cell()
# Distribution of individual ratings for each movie listed in `top`.
top_movie_ratings = ratings.loc[ratings['movieId'].isin(top)]
top_movie_ratings.boxplot(column='rating', by='movieId')

In [None]:
#enable_plotly_in_cell()
# Average of the per-movie mean ratings, grouped by release year.
mean_rating_by_year = movie_ratings.groupby('year')['rating_mean'].mean()
mean_rating_by_year.iplot(kind='bar')

In [None]:
# Round each release year down to the start of its decade (1994 -> 1990).
# Same formula as before, applied to the whole column at once instead of
# looping over it in Python; missing years stay missing.
release_years = movie_ratings['year']
movie_ratings['decade'] = (release_years - release_years % 10).astype('Int64')

In [None]:
# Inspect the derived decade column.
movie_ratings['decade']

In [None]:
# enable_plotly_in_cell()
# Average of the per-movie mean ratings, grouped by decade.
mean_rating_by_decade = movie_ratings.groupby('decade')['rating_mean'].mean()
mean_rating_by_decade.iplot(kind='bar')

In [None]:
# enable_plotly_in_cell()
# Box plot of mean rating per decade, skipping rows without a decade.
has_decade = movie_ratings['decade'].notnull()
rows_with_decade = movie_ratings[has_decade]
trace = go.Box(x=rows_with_decade['decade'], y=rows_with_decade['rating_mean'])
go.Figure(trace)


In [None]:
# enable_plotly_in_cell()
# Box plot of mean rating per release year, skipping rows without a year.
has_year = movie_ratings['year'].notnull()
rows_with_year = movie_ratings[has_year]
trace = go.Box(x=rows_with_year['year'], y=rows_with_year['rating_mean'])
go.Figure(trace)

In [None]:
enable_plotly_in_cell()
# Budget on the x axis against mean rating on the y axis.
# NOTE(review): cufflinks' scatter appears to draw connected lines unless a
# `mode` is given — confirm whether `mode='markers'` was intended here.
budget_vs_rating = movie_ratings[['budget', 'rating_mean']]
budget_vs_rating.iplot(kind='scatter', x='budget', y='rating_mean')

In [None]:
# (rows, columns) of the merged frame before the PCA-based score is added.
movie_ratings.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# The three rating signals to combine; rows missing any of them are dropped.
rating_columns = ['vote_average', 'rating_mean', 'rating_median']
all_ratings = movie_ratings[rating_columns].dropna()
all_ratings

In [None]:
# Sanity check: every count should be 0 after the dropna() above.
all_ratings.isnull().sum()

In [None]:
# Standardise the three rating columns before PCA; `x` holds the scaled values.
scaler = StandardScaler()
x =scaler.fit_transform(all_ratings)

In [None]:
# Keep a single principal component (the positional argument is n_components).
pca = PCA(1)

In [None]:
# Project the standardised ratings onto the single component.
# NOTE(review): `x` is rebound to its own transform, so re-running only this
# cell feeds the previous PCA output back in; re-run from the scaling cell.
x = pca.fit_transform(x)

In [None]:
# Wrap the PCA output in a frame that reuses `all_ratings`' index so it can be
# aligned back onto `movie_ratings` rows.
scaled_ratings = pd.DataFrame(x, index = all_ratings.index)

In [None]:
# Inspect the combined rating score.
scaled_ratings

In [None]:
# Attach the combined score by index alignment; rows that were dropped before
# scaling (missing ratings) end up with NaN. `scaled_ratings` has exactly one
# column, which is selected explicitly here.
combined_score = scaled_ratings.iloc[:, 0]
movie_ratings['scaled_rating'] = combined_score

In [None]:
# (rows, columns) after adding `scaled_rating`: same rows, one more column.
movie_ratings.shape

In [None]:
# Raw mean rating per year, shown again for comparison with the scaled score.
mean_rating_by_year = movie_ratings.groupby('year')['rating_mean'].mean()
mean_rating_by_year.iplot(kind='bar')

In [None]:
# Average combined score per release year.
scaled_by_year = movie_ratings.groupby('year')['scaled_rating'].mean()
scaled_by_year.iplot(
    kind='bar',
    title="Scaled Rating by Year",
    xTitle="year",
    yTitle="Scaled Rating",
)

In [None]:
# Average combined score per decade.
scaled_by_decade = movie_ratings.groupby('decade')['scaled_rating'].mean()
scaled_by_decade.iplot(kind='bar')

In [None]:
# Only the two columns needed to compare budget against the combined score.
budget_columns = ['budget', 'scaled_rating']
budget_ratings = movie_ratings[budget_columns]

In [None]:
# Split the known budgets into five equal-sized quantile bins and label them.
# Rows without a budget are left out of the binning and get NaN after index
# alignment. `.assign` returns a new frame, which avoids the
# SettingWithCopyWarning raised when writing a column into a slice of
# `movie_ratings`.
budget_labels = ['vlow', 'low', 'med', 'high', 'blockbuster']
budget_ratings = budget_ratings.assign(
    qbudget=pd.qcut(budget_ratings['budget'].dropna(), q=5, labels=budget_labels)
)


In [None]:
# Average combined score within each budget tier.
rating_by_budget_tier = budget_ratings.groupby('qbudget')['scaled_rating'].mean()
rating_by_budget_tier.iplot(
    kind='bar',
    xTitle="quantized budget",
    yTitle='Standardized Rating',
    title="Rating vs Budget",
)