# Create Project

## Load Libraries and Functions

In [1]:
# import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load Data

In [2]:
# IMDb serves each dataset as a gzipped TSV named title.<table>.tsv.gz on the
# same host, so build the three URLs from the table names.
basics_url, akas_url, ratings_url = (
    f"https://datasets.imdbws.com/title.{table}.tsv.gz"
    for table in ("basics", "akas", "ratings")
)

In [None]:
# read the three tab-separated IMDb files, in order, straight from their URLs
basics, akas, ratings = (
    pd.read_csv(url, sep='\t', low_memory=False)
    for url in (basics_url, akas_url, ratings_url)
)

# preview the first rows of every frame
display(*(frame.head() for frame in (basics, akas, ratings)))


## Cleaning the Data

In [None]:
# the files use the literal string '\N' as a missing-value marker;
# swap it for np.nan in every frame so pandas treats it as missing
basics = basics.replace(to_replace='\\N', value=np.nan).copy()
akas = akas.replace(to_replace='\\N', value=np.nan).copy()
ratings = ratings.replace(to_replace='\\N', value=np.nan).copy()

In [None]:
# spot-check the first three rows of basics and akas after the replacement
display(basics.head(3))
display(akas.head(3))

# count missing values per column in ratings (none were expected)
ratings.isna().sum(axis=0)

### Basics
* [x] Eliminate movies that are null for `runtimeMinutes`
* [x] Eliminate movies that are null for `genres`
* [x] Keep only `titleType == 'movie'`
* [x] Keep only `startYear` 2000-2022
* [x] Eliminate movies that include "Documentary" in `genres` (see tip below)
* [x] Keep only US movies

In [None]:
# print a concise summary of basics (its columns and their dtypes) before any
# cleaning, to see which columns still need a dtype conversion
basics.info()


In [None]:
# 'startYear' is read in as strings; cast it to float rather than int so that
# the NaN entries can be represented
basics = basics.astype({'startYear': float})

# confirm the new dtype
basics.dtypes

In [None]:
# drop rows that are missing runtimeMinutes or genres.
# NOTE: dropna(..., inplace=True) returns None, so assigning its result would
# replace basics with None and break every later cell; use the returned
# (non-inplace) frame instead.
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

In [None]:
# keep only the rows whose titleType is exactly 'movie' (lowercase, as stored
# in the dataset)
basics = basics.loc[basics['titleType'].eq('movie')]

In [None]:
# re-print the summary of basics after the preceding filtering steps
basics.info()