# 1Strategy ML Immersion Day
### Building a model from movie data

In [None]:
import json
import math
import sys

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker as sm
from sagemaker.amazon.amazon_estimator import get_image_uri

import workshop_utils as wu

# prevent warnings from displaying
import warnings
warnings.filterwarnings('ignore')

## Initialize variables

In [None]:
# Workshop settings: the S3 bucket holding the data and a per-attendee prefix.
bucket = '1s-ml'
your_name = 'agraves'

# S3 keys (inside `bucket`) of the two source files downloaded later on.
ratings = 'movies/data/title.ratings.tsv'
basics = 'movies/data/title.basics.tsv'

# Per-attendee S3 prefix for the artifacts this notebook produces.
model_artifacts_location = f's3://{bucket}/movies/artifacts/{your_name}'

# IAM role and SageMaker session used by the training and deployment cells.
role = sm.get_execution_role()
sm_session = sm.session.Session()

print(f'IAM Role: {role}')

### A note about this data
source: https://datasets.imdbws.com

We will be downloading the data from S3 in order to inspect it and perform any cleanup necessary before we train our model.

In [None]:
# Copy both source files from S3 into the notebook's working directory.
s3 = boto3.resource('s3')
source_bucket = s3.Bucket(bucket)
for s3_key, local_name in ((ratings, 'ratings.tsv'), (basics, 'basics.tsv')):
    source_bucket.download_file(s3_key, local_name)

In [None]:
# Load the two tab-separated files and inner-join them on the shared
# `tconst` column, so only titles present in both files are kept.
ratings_csv = pd.read_csv('ratings.tsv', sep='\t')
basics_csv = pd.read_csv('basics.tsv', sep='\t')
movie_data = ratings_csv.merge(basics_csv, how='inner', on='tconst')
print(f'Movie Data Shape: {movie_data.shape}')

movie_data.head(15)

## Cleanup

There are several unnecessary columns in this data, as well as observations we aren't concerned about. This is an investigation of movie ratings, so we can eliminate the rows which contain data about television shows. This data also contains records from silent films. We can make a reasonable assumption that silent film appreciation is a bit different from modern film appreciation, so we will drop these observations as well.

In [None]:
# Eliminate TV shows: keep only the film-like title types (movie, short,
# tvMovie); rows with any other titleType are dropped.
film_types = ['movie', 'short', 'tvMovie']
movie_data = movie_data[movie_data.titleType.isin(film_types)]
# Shape: (395863, 11)

# Limit to only years with talkies.
# Rows whose startYear is the '\N' placeholder cannot be cast to int, so they
# are removed first. `.copy()` makes the filtered frame independent, so the
# column assignment below does not write into a slice of the previous frame.
movie_data = movie_data[movie_data.startYear != '\\N'].copy()
movie_data['startYear'] = movie_data['startYear'].astype(int)
movie_data = movie_data[movie_data.startYear > 1927]
# Shape: (383612, 11)

# Remove unnecessary columns in a single call (returns a new frame instead of
# mutating a filtered slice in place).
# TODO: `genres` is dropped for now; one-hot encoding it requires a custom
# function.
movie_data = movie_data.drop(
    columns=['originalTitle', 'endYear', 'tconst', 'primaryTitle', 'genres']
)
movie_data.head(15)

In [None]:
# Replace every cell equal to the literal string \N with NaN; all other cells
# keep their value.
movie_data = movie_data.where(movie_data != r'\N')

In [None]:
# Count the NaN cells per column now that the \N entries have been masked out
movie_data.isnull().sum()

In [None]:
# Discard every row that still contains a missing value, then re-count the
# NaN cells per column to confirm none remain.
movie_data = movie_data.dropna()
movie_data.isna().sum()

## Visualization

In [None]:
# Disabled alternative: box plot of runtimeMinutes for the first 100 rows,
# written to fig1.png.
# fig = plt.figure(1, figsize=(9, 6))
# ax = fig.add_subplot(111)
# bp = ax.boxplot([movie_data[:100].runtimeMinutes])
# fig.savefig('fig1.png', bbox_inches='tight')

# Plot each title's vote count against its title type, one marker per row.
plt.plot(movie_data['titleType'], movie_data['numVotes'], 'o')

In [None]:
# plt.hist(movie_data.runtimeMinutes)

## Model Prep

In [None]:
# Compute the label for each row with the workshop helper, then rebuild the
# frame with `likable` as the first column and the raw `averageRating`
# removed.
# NOTE(review): the label is placed first presumably because the training CSV
# is expected to carry the target in column 0 -- confirm against the SageMaker
# XGBoost input format.
likable = movie_data.apply(wu.label_rating, axis=1)
movie_data = pd.concat(
    [likable.rename('likable'), movie_data.drop(columns=['averageRating'])],
    axis=1,
)
movie_data.head(15)

In [None]:
# One-hot encode titleType: build one indicator column per title type, append
# them to the frame, and leave out the original text column.
dummy_types = pd.get_dummies(movie_data['titleType'])
movie_data = pd.concat(
    [movie_data.drop(columns=['titleType']), dummy_types.reindex(movie_data.index)],
    axis=1,
)

movie_data.head(15)

In [None]:
# Cast the three one-hot indicator columns to int.
indicator_columns = ['movie', 'short', 'tvMovie']
movie_data = movie_data.astype({column: int for column in indicator_columns})

# The equivalent casts for runtimeMinutes and likable are disabled in this
# cell, so those two columns keep whatever dtype they already have:
# movie_data['runtimeMinutes'] = movie_data['runtimeMinutes'].astype(int)
# movie_data['likable'] = movie_data['likable'].astype(int)
movie_data.dtypes

### A note about splitting data

In [None]:
# Shuffle all rows with a fixed seed so the split is reproducible, then cut
# the shuffled frame at the 70% and 90% marks: 70% train / 20% eval / 10% test.
# Positional slicing is used instead of np.split because np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
shuffled = movie_data.sample(frac=1, random_state=1278)
train_end = int(0.7 * len(shuffled))
eval_end = int(0.9 * len(shuffled))

# .copy() makes each partition an independent frame, so later in-place edits
# (e.g. dropping the label from the test set) do not act on a slice.
movie_train = shuffled.iloc[:train_end].copy()
movie_eval = shuffled.iloc[train_end:eval_end].copy()
movie_test = shuffled.iloc[eval_end:].copy()

print(f'Movie Train Shape: {movie_train.shape}')
print(f'Movie Eval Shape: {movie_eval.shape}')
print(f'Movie Test Shape: {movie_test.shape}')

In [None]:
# Display the dtype of each column in the training partition.
movie_train.dtypes

In [None]:
# Write the training partition as a bare CSV (no header row, no index column)
# and upload it under this attendee's artifact prefix.
train_upload = f'movies/artifacts/{your_name}/movie_train.csv'
movie_train.to_csv('movie_train.csv', index=False, header=False)
print(train_upload)
s3.Bucket(bucket).upload_file('movie_train.csv', train_upload)

In [None]:
# Write the evaluation partition as a bare CSV (no header row, no index
# column) and upload it under this attendee's artifact prefix.
movie_eval.to_csv('movie_eval.csv', header=False, index=False)
eval_upload = f'movies/artifacts/{your_name}/movie_eval.csv'
# Print the key actually being uploaded (the cell previously echoed the
# training key here).
print(eval_upload)
s3.Bucket(bucket).Object(eval_upload).upload_file('movie_eval.csv')

## Create ML resources

In [None]:
# Look up the XGBoost 0.90-1 container image for us-west-2.
container = get_image_uri('us-west-2', 'xgboost', '0.90-1')

# Training job settings: one ml.m5.large instance, with output written under
# this attendee's artifact prefix.
estimator_settings = dict(
    base_job_name=f'{your_name}-ml-im',
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    output_path=f's3://{bucket}/movies/artifacts/{your_name}/output',
    sagemaker_session=sm_session,
)
xgboost = sm.estimator.Estimator(container, role, **estimator_settings)

# Disabled alternatives kept from the original cell:
# eval_metric='auc',
# objective='reg:linear',

### A note about hyperparameters

In [None]:
# Hyperparameters handed to the estimator for the training job.
hyperparameters = {
    'max_depth': 3,
    'eta': 0.1,
    'subsample': 0.5,
    'eval_metric': 'error',
    'objective': 'binary:logistic',
    'scale_pos_weight': 2.0,
    'num_round': 100,
}
xgboost.set_hyperparameters(**hyperparameters)

In [None]:
# Describe the two uploaded CSV files as training inputs, then start the
# training job with them bound to the 'train' and 'validation' channels.
train_data, eval_data = (
    sm.s3_input(s3_data=f's3://{bucket}/{s3_key}', content_type='csv')
    for s3_key in (train_upload, eval_upload)
)

channels = {'train': train_data, 'validation': eval_data}
xgboost.fit(channels)

## Deploy model

In [None]:
# Deploy the trained model to an endpoint on a single ml.m5.large instance.
xgboost_predict = xgboost.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
)

# Requests are sent as CSV text; no deserializer is set on the predictor.
xgboost_predict.deserializer = None
xgboost_predict.content_type = 'text/csv'

In [None]:
# Strip the label so only feature columns are written, then export the test
# partition as a bare CSV (no header row, no index column).
# NOTE(review): the labels are discarded here, so the predictions made later
# cannot be compared with the true values unless they are kept elsewhere.
movie_test = movie_test.drop(columns=['likable'])
movie_test.to_csv('movie_test.csv', index=False, header=False)
movie_test.head(15)

In [None]:
# Read the exported test CSV back as text, trimmed of surrounding whitespace.
with open('movie_test.csv', 'r') as test_file:
    payload = test_file.read().strip()

# One CSV row per list element; the workshop helper sends them to the endpoint.
test_data = payload.split('\n')
preds = wu.do_predict(test_data, xgboost_predict)
print(preds)

## Delete endpoint when done

In [None]:
# Delete the endpoint that was created by the deploy cell.
endpoint_name = xgboost_predict.endpoint
sm_session.delete_endpoint(endpoint_name)