# MongoDB Student Practice

The aim of this practice is to offer more opportunities to work with the aggregate pipeline and finding and searching the collections we have access to. We will use the MovieLens data we used for the labs. You are expected to create cells to do data exploration as needed. 

In [4]:
from bson.son import SON # for ordering by more than one column
from pprint import pprint
from pymongo import MongoClient
import pymongo

# Initialize a Mongo Client
#################################################
# Update UPDATE-ME in the connection code
#################################################
# Client 1 - mongodb-1.dsa.missouri.edu
# Client 2 - mongodb-2.dsa.missouri.edu
# Client 3 - mongodb-3.dsa.missouri.edu
# Client 4 - mongodb-4.dsa.missouri.edu
#################################################
#
client = MongoClient('mongodb-2.dsa.missouri.edu',
                     username='ml_small_reader',
                     password='mlsmall.read',
                     authSource='ml_small')
# retrieve the ml_small database from the connection
db = client.ml_small

In [4]:
db.collection_names()

['item', 'user', 'genre', 'data']

In [5]:
db.item.find_one()

{'Action': 0,
 'Adventure': 0,
 'Animation': 0,
 "Children's": 1,
 'Comedy': 1,
 'Crime': 0,
 'Documentary': 0,
 'Drama': 1,
 'Fantasy': 0,
 'Film-Noir': 0,
 'Horror': 0,
 'IMDb_URL': 'http://us.imdb.com/M/title-exact?Babe%20(1995)',
 'Musical': 0,
 'Mystery': 0,
 'Romance': 0,
 'Sci-Fi': 0,
 'Thriller': 0,
 'War': 0,
 'Western': 0,
 '_id': ObjectId('5b231f5ad698289b415e67f5'),
 'movie_id': 8,
 'movie_title': 'Babe (1995)',
 'release_date': datetime.datetime(1995, 1, 1, 6, 0),
 'unknown': 0,
 'video_release_date': ''}

In [6]:
db.user.find_one()

{'_id': ObjectId('5b18e2b3d698289b41f3cf1a'),
 'age': 33,
 'gender': 'F',
 'occupation': 'other',
 'user_id': 5,
 'zip_code': 15213}

In [7]:
db.genre.find_one()

{'_id': ObjectId('5b18e2abd698289b41f3ceff'),
 'genre': "Children's",
 'genre_id': 4}

In [8]:
db.data.find_one()

{'_id': ObjectId('5b18e05dd698289b41f236d2'),
 'item_id': 346,
 'rating': 1,
 'timestamp': 886397596,
 'user_id': 166}

### Task 1

#### Find the 10 most common letters to begin a movie title, sorted by number of occurences and letter.

You'll want to use the `$substrCP` operation to extract the first letter

In [9]:
# Add your code below
# -------------------------
from bson.son import SON

pipeline = [
    
    {"$group" : { "_id" :  {'movie_title': { "$substrCP": [ "$movie_title", 0, 1 ] }} , "count" : { "$sum" : 1 }}},
    {"$sort": SON([("movie_title",1),("count", -1)])},
    {"$limit": 10}
]

for doc in db.item.aggregate(pipeline):
    print(doc)






{'_id': {'movie_title': 'S'}, 'count': 181}
{'_id': {'movie_title': 'B'}, 'count': 148}
{'_id': {'movie_title': 'M'}, 'count': 131}
{'_id': {'movie_title': 'C'}, 'count': 118}
{'_id': {'movie_title': 'T'}, 'count': 108}
{'_id': {'movie_title': 'A'}, 'count': 94}
{'_id': {'movie_title': 'F'}, 'count': 94}
{'_id': {'movie_title': 'P'}, 'count': 80}
{'_id': {'movie_title': 'D'}, 'count': 80}
{'_id': {'movie_title': 'L'}, 'count': 79}


### Task 2

#### Find out the distribution of occupations of those who provided reviews. Sort by count



In [29]:
# Add your code below
# -------------------------
documents = db.user.aggregate([{
    "$group": { "_id": "$occupation", "count": { "$sum": 1}}
},{
    "$sort": {"count": -1}
}]);

for document in documents:
    pprint(document)




{'_id': 'student', 'count': 196}
{'_id': 'other', 'count': 105}
{'_id': 'educator', 'count': 95}
{'_id': 'administrator', 'count': 79}
{'_id': 'engineer', 'count': 67}
{'_id': 'programmer', 'count': 66}
{'_id': 'librarian', 'count': 51}
{'_id': 'writer', 'count': 45}
{'_id': 'executive', 'count': 32}
{'_id': 'scientist', 'count': 31}
{'_id': 'artist', 'count': 28}
{'_id': 'technician', 'count': 27}
{'_id': 'marketing', 'count': 26}
{'_id': 'entertainment', 'count': 18}
{'_id': 'healthcare', 'count': 16}
{'_id': 'retired', 'count': 14}
{'_id': 'salesman', 'count': 12}
{'_id': 'lawyer', 'count': 12}
{'_id': 'none', 'count': 9}
{'_id': 'doctor', 'count': 7}
{'_id': 'homemaker', 'count': 7}


### Task 3

#### Find the Average and Standard Deviation of the age of the reviewers, grouped by gender.


In [40]:
# Add your code below
# -------------------------
documents = db.user.aggregate([{
    "$group": { "_id": "$gender", "Average": { "$avg": "$age"}, "StdDev": { "$stdDevPop": "$age"}}
}]);

for document in documents:
    pprint(document)




{'Average': 34.149253731343286, 'StdDev': 12.347643114694364, '_id': 'M'}
{'Average': 33.81318681318681, 'StdDev': 11.777465371497483, '_id': 'F'}


### Task 4

#### Find the Average, Standard Deviation, and count of the age of the reviewers, grouped by occupation and gender.


In [48]:
# Add your code below
# -------------------------
documents = db.user.aggregate([{
    "$group": { "_id": {"gender":"$gender","occupation":"$occupation"}, 
               "Average": { "$avg": "$age"}, 
               "StdDev": { "$stdDevPop": "$age"},
               "count": {"$sum":1}}
},{
    "$sort":{"_id.occupation":1}
}]);

for document in documents:
    pprint(document)



{'Average': 37.16279069767442,
 'StdDev': 11.288871310623597,
 '_id': {'gender': 'M', 'occupation': 'administrator'},
 'count': 43}
{'Average': 40.638888888888886,
 'StdDev': 10.454007147123,
 '_id': {'gender': 'F', 'occupation': 'administrator'},
 'count': 36}
{'Average': 30.307692307692307,
 'StdDev': 9.168378896927916,
 '_id': {'gender': 'F', 'occupation': 'artist'},
 'count': 13}
{'Average': 32.333333333333336,
 'StdDev': 7.77746031098127,
 '_id': {'gender': 'M', 'occupation': 'artist'},
 'count': 15}
{'Average': 43.57142857142857,
 'StdDev': 11.574073771730912,
 '_id': {'gender': 'M', 'occupation': 'doctor'},
 'count': 7}
{'Average': 43.10144927536232,
 'StdDev': 10.67727497061641,
 '_id': {'gender': 'M', 'occupation': 'educator'},
 'count': 69}
{'Average': 39.11538461538461,
 'StdDev': 8.82882918920006,
 '_id': {'gender': 'F', 'occupation': 'educator'},
 'count': 26}
{'Average': 29.5,
 'StdDev': 6.5,
 '_id': {'gender': 'F', 'occupation': 'engineer'},
 'count': 2}
{'Average': 36.6

In [3]:
# Be sure to run this cell when you are finished. Thank you.
client.close()

# Save your notebook, then `File > Close and Halt`

---