# ETL Project
## Members:
* Francisco Estevez
* Alvin Kim
### Data Sources
We extracted data from the Machine Learning Course offered by Google
https://developers.google.com/machine-learning/crash-course/
The dataset we selected is part of the Data preparation of the course
#### Collecting Data > The Size and Quality of a Data Set
The data source contains 20 Millions of records:
    * Movies from 1891 to 2015
    * Rates
    * Genome Scores
#### Final database
Our information is stored in Mongo DB:
    * Movie Rates
    * Movie Genome Scores

### Notebook 3: Merge Movies and Ratings and output to MongoDB

In [1]:
import pandas as pd
from pymongo import MongoClient

In [2]:
# The default port used by MongoDB is 27017
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
conn = 'mongodb://localhost:27017'
client = MongoClient(conn)

In [3]:
# Declare the database
movies_db = client.movies_db

In [4]:
# Declare the collection
ratings = movies_db.ratings

#### Read cleansed Movies data file

In [5]:
csv_file = "Resources/cleansed_movies_data.csv"
movies_df = pd.read_csv(csv_file)
movies_df.head()

Unnamed: 0,movieId,Movie,Years,Genres
0,1,Toy Story,1995,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji,1995,"['Adventure', 'Children', 'Fantasy']"
2,3,Grumpier Old Men,1995,"['Comedy', 'Romance']"
3,4,Waiting to Exhale,1995,"['Comedy', 'Drama', 'Romance']"
4,5,Father of the Bride Part II,1995,['Comedy']


#### Read transformed Ratings data file

In [6]:
csv_file = "Resources/transformed_ratings_data.csv"
ratings_df = pd.read_csv(csv_file)
ratings_df.head()

Unnamed: 0,movieId,count,mean,std,min,25%,50%,75%,max
0,1,49695.0,3.92124,0.889012,0.5,3.5,4.0,4.5,5.0
1,2,22243.0,3.211977,0.95115,0.5,3.0,3.0,4.0,5.0
2,3,12735.0,3.15104,1.006642,0.5,3.0,3.0,4.0,5.0
3,4,2756.0,2.861393,1.095702,0.5,2.0,3.0,4.0,5.0
4,5,12161.0,3.064592,0.98214,0.5,3.0,3.0,4.0,5.0


#### Not all movies have rating information

In [7]:
print(f"Total movies:{len(movies_df)}, Total ratings:{len(ratings_df)}")

Total movies:27019, Total ratings:26744


In [8]:
# Merge the two datasets
movies_ratings_df = pd.merge(movies_df, ratings_df, how='left', on='movieId')
movies_ratings_df.head()

Unnamed: 0,movieId,Movie,Years,Genres,count,mean,std,min,25%,50%,75%,max
0,1,Toy Story,1995,"['Adventure', 'Animation', 'Children', 'Comedy...",49695.0,3.92124,0.889012,0.5,3.5,4.0,4.5,5.0
1,2,Jumanji,1995,"['Adventure', 'Children', 'Fantasy']",22243.0,3.211977,0.95115,0.5,3.0,3.0,4.0,5.0
2,3,Grumpier Old Men,1995,"['Comedy', 'Romance']",12735.0,3.15104,1.006642,0.5,3.0,3.0,4.0,5.0
3,4,Waiting to Exhale,1995,"['Comedy', 'Drama', 'Romance']",2756.0,2.861393,1.095702,0.5,2.0,3.0,4.0,5.0
4,5,Father of the Bride Part II,1995,['Comedy'],12161.0,3.064592,0.98214,0.5,3.0,3.0,4.0,5.0


### Prepare Data Collections (Records)

In [9]:
# Append data as STRINGS
movie_collections_string = []
for movie in range(len(movies_ratings_df)):
    movie_collections_string.append(
        {
        'movieId': str(movies_ratings_df['movieId'][movie]),
        'Name'   : movies_ratings_df['Movie'][movie].strip(),
        'Year'   : str(movies_ratings_df['Years'][movie]),
        'Generes': movies_ratings_df['Genres'][movie].translate(str.maketrans({"[": "", "]": "", "'": ""})).split(","),
        'Rate stats' : { 
            'Tot votes'    : str(movies_ratings_df['count'][movie]),
            'Lowest rate'  : str(movies_ratings_df['min'][movie]), 
            'Highest rate' : str(movies_ratings_df['max'][movie]),
            'Average rate' : str(round(movies_ratings_df['mean'][movie],2)),
            'Std dev'      : str(round(movies_ratings_df['std'][movie],2)),
            'Percentiles'  : {
                '25%': str(movies_ratings_df['25%'][movie]),
                '50%': str(movies_ratings_df['50%'][movie]),
                '75%': str(movies_ratings_df['75%'][movie]),
            }
        }
    })

In [10]:
# append raw data
movie_collections_raw = []
for movie in range(len(movies_ratings_df)):
    movie_collections_raw.append(
        {
        'movieId': movies_ratings_df['movieId'][movie],
        'Name'   : movies_ratings_df['Movie'][movie].strip(),
        'Year'   : movies_ratings_df['Years'][movie],
        'Generes': movies_ratings_df['Genres'][movie].translate(str.maketrans({"[": "", "]": "", "'": ""})).split(","),
        'Rate stats' : { 
            'Tot votes'    : movies_ratings_df['count'][movie],
            'Lowest rate'  : movies_ratings_df['min'][movie], 
            'Highest rate' : movies_ratings_df['max'][movie],
            'Average rate' : round(movies_ratings_df['mean'][movie],2),
            'Std dev'      : round(movies_ratings_df['std'][movie],2),
            'Percentiles'  : {
                '25%': movies_ratings_df['25%'][movie],
                '50%': movies_ratings_df['50%'][movie],
                '75%': movies_ratings_df['75%'][movie],
            }
        }
    })

### Inserting Data Collections in Mongo DB

In [11]:
ratings.insert_many(movie_collections_string)

<pymongo.results.InsertManyResult at 0x1ac20085c88>

In [12]:
# ratings.insert_many(movie_collections_raw)