<a href="https://colab.research.google.com/github/ErikSeguinte/movie_data/blob/master/processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import cufflinks as cf
import numpy as np
from plotly import graph_objs as go
import altair as alt

In [None]:
# Shared keyword arguments for index-on-index DataFrame merges.
merge_keys = dict(left_index=True, right_index=True)

* I previously pulled CSV files from Kaggle, but the files were too big to host on github.
* I imported the files I wanted into pandas, and then exported them back out as compressed pickles.
* I was able to compress a 700MB csv to a 3 MB Pickle

In [None]:
# Prefer the local copies under data/ and fall back to the hosted copies only
# when a local file is missing. Any other failure (corrupt pickle, malformed
# CSV, interrupted kernel, ...) propagates instead of silently triggering a
# download.
try:
    movies = pd.read_pickle('data/movies.pkl.xz')
    ratings = pd.read_pickle('data/ratings2.pkl.xz')
    cpi = pd.read_csv('data/cpi.csv')
except FileNotFoundError:
    # Download the same files from datahub / GitHub.
    # NOTE(review): unpickling can execute arbitrary code; only load pickles
    # from a source you trust.
    cpi = pd.read_csv('https://datahub.io/core/cpi/r/cpi.csv')
    movies = pd.read_pickle('https://github.com/ErikSeguinte/movie_data/raw/master/data/movies.pkl.xz')
    ratings = pd.read_pickle('https://github.com/ErikSeguinte/movie_data/raw/master/data/ratings2.pkl.xz')

## Clean Movie DF

In [None]:
movies.dtypes

* Movies Dataframe has malformed data. `id` Should be numeric.
* After inspection, it looks like there are rows that are missing a comma somewhere, making columns not line up, and adding the wrong data to columns. Let's clean those up.
* All malformed rows have strings for IDs instead of numeric, so we will coerce them into numeric columns, and strings will be returned as `NaN`, which we'll then drop.

* `budget` and `revenue` should also be numeric, but rows where they are `NaN` won't be dropped.






In [None]:
# Non-numeric ids become NaN under errors='coerce'; those rows are dropped
# and the cleaned id becomes the index.
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
    .set_index('id')
)

In [None]:
def to_numeric(df, labels):
    """Coerce the given columns of ``df`` to a numeric dtype, in place.

    Values that cannot be parsed as numbers become ``NaN``
    (``errors='coerce'``); no rows are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    labels : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for label in labels:
        # Read from the frame that was passed in, not from a notebook global.
        df[label] = pd.to_numeric(df[label], errors='coerce')
    return df

In [None]:
movies = to_numeric(movies, ['budget', 'revenue', 'vote_average'])

In [None]:
# `infer_datetime_format` is deprecated in pandas 2.x (format inference is now
# the default behaviour), so it is omitted here.
movies['release_date'] = pd.to_datetime(movies['release_date'])

In [None]:
movies['year'] = movies['release_date'].dt.year

In [None]:
clean_movies = movies[['title','genres', 'release_date','budget', 'revenue','year' ,'runtime', 'vote_average', 'vote_count']]

## Process User Reviews
* User reviews come in a collection of individual reviews where a review gives a movie a score of 1 to 5.
* We will take the mean ratings for each movie

In [None]:
# Aggregate mean ratings and number of votes per movie
movie_ratings =pd.DataFrame(ratings.groupby('movieId')[['rating']].agg(['mean', 'count']))['rating']
movie_ratings = movie_ratings.rename({'mean': 'rating', 'count': 'num_votes'}, axis = 1)


* Let's drop any movies with fewer than 100 votes. Those are more easily swayed by outliers and aren't reliable.

In [None]:
movie_ratings = movie_ratings[(movie_ratings['num_votes'] >= 100)]

* And now we merge the averaged ratings back with the movie database.
* Note that not all movies are present in the user votings.

In [None]:
# Index-on-index merge with merge's default inner join: only movies present
# in both frames are kept.
# NOTE(review): assumes the `movieId` values in `ratings` are the same
# identifiers as the `id` index of the movies frame — confirm.
clean_movies = clean_movies.merge(movie_ratings, **merge_keys)

In [None]:
clean_movies[['title', 'rating', 'num_votes']].nlargest(10, 'rating')

* The Movie Database (TMDB) also provides a rating, and it suffers from a similar problem: some movies have a tiny sample size.

In [None]:
clean_movies[clean_movies['vote_count'] >= 100][['title', 'vote_average', 'vote_count']].nlargest(10, 'vote_average')

In [None]:
clean_movies[['title', 'revenue']].nlargest(10, 'revenue')

## Inflation
* Inflation means that a 1940 dollar is worth more than a 2020 dollar. Let's adjust Revenue for that.
* The Consumer Price Index (CPI) can be used to convert to standardized dollars.
* Here, we'll be using 2014 dollars.
* Years later than 2014 will not be adjusted.
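$$ \textrm{adjusted dollars} = \textrm{original dollars} \times \frac{\textrm{CPI}_{2014}}{\textrm{CPI}_{\textrm{release year}}} $$
* where $\textrm{CPI}_{2014}$ is the CPI of the reference year (2014) and $\textrm{CPI}_{\textrm{release year}}$ is the CPI of the year the movie was released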

In [None]:
cpi = cpi[cpi['Country Name'] == 'United States'][['Year', 'CPI']]

In [None]:
cpi = cpi.set_index(cpi['Year'])

In [None]:
def adjust_dollars(value, year, cpi_table=None, target_year=2014):
    """Convert ``value`` from ``year`` dollars into ``target_year`` dollars.

    The conversion is ``value * CPI[target_year] / CPI[year]``.

    Parameters
    ----------
    value : number
        Amount expressed in ``year`` dollars.
    year : number
        Year the amount belongs to; converted with ``int()`` before lookup.
    cpi_table : pandas.DataFrame, optional
        Table indexed by year with a ``'CPI'`` column. Defaults to the
        notebook-level ``cpi`` frame.
    target_year : int, optional
        Year whose dollars the result is expressed in (2014 by default).

    Returns
    -------
    The adjusted amount, or ``value`` unchanged when ``year`` cannot be
    converted to an integer (e.g. NaN) or when either year is missing from
    the table.
    """
    table = cpi if cpi_table is None else cpi_table
    try:
        year = int(year)
        current = table.loc[target_year, 'CPI']
        base = table.loc[year, 'CPI']
    except (KeyError, ValueError, TypeError):
        # Best effort: no usable CPI entry for this year, so leave the
        # amount unadjusted rather than failing.
        return value
    return value * (current / base)

In [None]:
clean_movies['year']= clean_movies['release_date'].dt.year

In [None]:
df = clean_movies[clean_movies['revenue'].notnull() & clean_movies['year'].notnull()]
df

In [None]:
adjusted = pd.DataFrame([adjust_dollars(x,y) for x,y in zip(df['revenue'], df['year'])], index = df.index, columns = ['adjusted_revenue'])

In [None]:
clean_movies = clean_movies.merge(adjusted, left_index=True, right_index=True)

In [None]:
clean_movies[['title', 'adjusted_revenue']].nlargest(10, 'adjusted_revenue')

In [None]:
# Floor each release year to the first year of its decade (e.g. 1987 -> 1980).
clean_movies['decade'] = clean_movies['year'] - clean_movies['year'] % 10

In [None]:
clean_movies

In [None]:
#enable_plotly_in_cell()
clean_movies.groupby('year')['vote_average'].mean().iplot(kind='bar')

In [None]:
alt.data_transformers.disable_max_rows()

In [None]:
alt.Chart(clean_movies, width=1080).mark_bar().encode(
    alt.Y('mean(vote_average)'),
    alt.X('year(release_date):O')
)

In [None]:
alt.Chart(clean_movies, width=1080).mark_bar().encode(
    alt.Y('mean(rating)'),
    alt.X('year(release_date):N')
)

In [None]:
alt.Chart(clean_movies,width=720).mark_bar().encode(
    alt.Y('mean(vote_average)'),
    alt.X('decade:O')
)

* I'd like to compare the votes from TMDB to the user ratings, but they are on different scales. We'll use scikit-learn's `StandardScaler` to standardize them so we can more easily compare.

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
to_scale = clean_movies[['vote_average', 'rating']].dropna()
scaled = scaler.fit_transform(to_scale)
clean_movies = clean_movies.merge(
    pd.DataFrame(
        scaled,
        index = to_scale.index,
        columns = ['scaled_tmdb_vote', 'scaled_user_rating']
    ),
    left_index = True,
    right_index = True,
)

In [None]:
# Mean of each standardized rating per decade, computed once and shared by
# both bar traces.
decade_means = clean_movies.groupby('decade')[['scaled_tmdb_vote', 'scaled_user_rating']].mean()

traces = [
    go.Bar(
        name='TMDB rating',
        x=decade_means.index,
        y=decade_means['scaled_tmdb_vote'],
    ),
    go.Bar(
        name='user rating',
        x=decade_means.index,
        y=decade_means['scaled_user_rating'],
    ),
]

go.Figure(data=traces, layout_xaxis_tick0=1890)

In [None]:
from sklearn.decomposition import PCA

pca = PCA(1)

to_scale = clean_movies[['vote_average', 'rating', 'adjusted_revenue']].dropna()
scaled = scaler.fit_transform(to_scale)

pca_df = pd.DataFrame(pca.fit_transform(scaled), index=to_scale.index, columns = ['PCA'])

clean_movies = clean_movies.merge(pca_df, left_index = True, right_index = True)
clean_movies.head(2)

In [None]:
clean_movies[['title', 'PCA']].nlargest(25, 'PCA')

|      | title                                         |     PCA |
|-----:|:----------------------------------------------|--------:|
|   11 | Star Wars                                     | 8.86514 |
|  597 | Titanic                                       | 7.50724 |
|  601 | E.T. the Extra-Terrestrial                    | 5.27108 |
| 1891 | The Empire Strikes Back                       | 4.81926 |
|  238 | The Godfather                                 | 4.75741 |
|  122 | The Lord of the Rings: The Return of the King | 4.51399 |
|  329 | Jurassic Park                                 | 4.3515  |
| 1892 | Return of the Jedi                            | 4.21927 |
|  121 | The Lord of the Rings: The Two Towers         | 3.98178 |
|  155 | The Dark Knight                               | 3.87451 |

In [None]:
clean_movies['q_budget'] = pd.qcut(clean_movies['budget'], labels = ['vlow', 'low', 'med', 'high', 'vhigh'], q = 5)

In [None]:
budget_ratings = clean_movies[['title','q_budget', 'budget', 'revenue', 'rating', 'vote_average']].dropna()

In [None]:
alt.Chart(budget_ratings, width=720).mark_bar().encode(
    x='q_budget:N',
    y='mean(vote_average)'
)

In [None]:
# Scatter of TMDB rating against production budget. The x values come from
# the `budget` column so the data matches the axis label and the title.
trace = go.Scatter(
    y = budget_ratings['vote_average'],
    x = budget_ratings['budget'],
    mode = 'markers'
)

go.Figure(
    trace,
    layout_xaxis_title = "Budget",
    layout_yaxis_title = "Movie Rating",
    layout_title = "Movie Ratings by budget",
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Genre

In [None]:
# Take an explicit copy: a column is added to `genres` later, and assigning
# into a slice of `clean_movies` would raise SettingWithCopyWarning.
genres = clean_movies[['title', 'genres']].copy()

In [None]:
import ast

In [None]:
# Names that appear in the `genres` column but are filtered out when the
# genre lists are built.
bad_genres = [
    'Aniplex',
    'BROSTA TV',
    'Carousel Productions',
    'GoHands',
    'Mardock Scramble Production Committee',
    'Odyssey Media',
    'Pulser Productions',
    'Rogue State',
    'Sentai Filmworks',
    'Telescene Film Group Productions',
    'The Cartel',
    'Vision View Entertainment',
]

#genre_set = genre_set.difference(bad_genres)

In [None]:
# Each `genres` cell is a string holding a Python-literal list of dicts;
# parse it and keep the 'name' of every entry not listed in `bad_genres`.
genres['genres_ls'] = genres['genres'].map(
    lambda raw: [
        entry['name']
        for entry in ast.literal_eval(raw)
        if entry['name'] not in bad_genres
    ]
)
genres['genres_ls']

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
mlb = MultiLabelBinarizer()

In [None]:
mlb_ = mlb.fit_transform(genres['genres_ls'])

In [None]:
encoded_genres = pd.DataFrame(mlb_, columns = mlb.classes_, index = genres.index)
encoded_genres

In [None]:
encoded_genres = clean_movies.merge(encoded_genres, **merge_keys)
encoded_genres

In [None]:
for genre in mlb.classes_:
    print(genre, encoded_genres[encoded_genres[genre] == 1]['vote_average'].mean())

In [None]:
d = {g: [encoded_genres[encoded_genres[g] == 1]['PCA'].mean(),
         encoded_genres[encoded_genres[g] == 1]['adjusted_revenue'].sum()]
     for g in mlb.classes_}

In [None]:
df = pd.DataFrame.from_dict(d, orient='index',  columns = ['pca rating', 'total revenue'])
df

In [None]:
trace = go.Bar(
    x=df.index,
    y=df['pca rating']
)

go.Figure(
    data = trace,
    #layout_x)
)


In [None]:
melted = df.reset_index().melt(id_vars='index')
melted