In [8]:
import os
import pprint

import numpy as np
from pymongo import MongoClient
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

try:
    # Preferred location: json_normalize is exposed at the pandas top level
    # (the pandas.io.json location is deprecated since pandas 1.0).
    from pandas import json_normalize
except ImportError:
    # Fallback for older pandas releases that only ship the original location.
    from pandas.io.json import json_normalize

In [9]:
# Connection string for the course cluster. Set the COURSE_CLUSTER_URI
# environment variable to point the notebook at a different deployment (or to
# supply different credentials) without editing this cell; when the variable is
# unset or empty, the original course URI below is used.
course_cluster_uri = os.environ.get("COURSE_CLUSTER_URI") or (
    "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
)
course_client = MongoClient(course_cluster_uri)

In [10]:
# Handle to the 'titanic' collection inside the 'coursera-agg' database
titanic = course_client.get_database('coursera-agg').get_collection('titanic')

In [11]:
# Stage that groups on the gender field, yielding one document per distinct value.
unique_gender_stage = {"$group": {"_id": "$gender"}}

In [12]:
# Keep documents whose age is numeric and whose point_of_embarkation is not the
# empty string, then apply unique_gender_stage.
possible_gender_values = titanic.aggregate([
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    unique_gender_stage,
])

In [13]:
# Drain the cursor and pretty-print the distinct gender values
pprint.pprint([group for group in possible_gender_values])

[{'_id': 'male'}, {'_id': 'female'}]


In [15]:
# Stage that groups on point_of_embarkation, yielding one document per distinct value.
unique_point_of_embarkation_stage = {"$group": {"_id": "$point_of_embarkation"}}

In [16]:
# Keep documents whose age is numeric and whose point_of_embarkation is not the
# empty string, then apply unique_point_of_embarkation_stage.
possible_point_of_embarkation_values = titanic.aggregate([
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    unique_point_of_embarkation_stage,
])

In [17]:
# Drain the cursor and pretty-print the distinct point_of_embarkation values
pprint.pprint([group for group in possible_point_of_embarkation_values])

[{'_id': 'S'}, {'_id': 'Q'}, {'_id': 'C'}]


In [54]:
# $project stage that turns the two categorical fields into integers and keeps
# the remaining numeric fields as-is:
#   gender:               'female' -> 0, anything else -> 1
#   point_of_embarkation: 'S' -> 0, 'Q' -> 1, 'C' -> 2
gender_and_point_of_embarkation_conversion_stage = {
    "$project": {
        "gender": {
            "$cond": {"if": {"$eq": ["$gender", "female"]}, "then": 0, "else": 1}
        },
        "point_of_embarkation": {
            "$switch": {
                # One branch per known port code, in the order S, Q, C.
                "branches": [
                    {"case": {"$eq": ["$point_of_embarkation", port]}, "then": code}
                    for port, code in (("S", 0), ("Q", 1), ("C", 2))
                ]
            }
        },
        # Fields carried through unchanged (inclusion flags set to 1).
        **dict.fromkeys(
            (
                "survived",
                "age",
                "siblings_spouse",
                "parents_children",
                "fare_paid",
                "class",
            ),
            1,
        ),
    }
}

In [55]:
# Three-stage pipeline:
#   1. keep documents with a numeric age and a non-empty point_of_embarkation,
#   2. apply the gender / point_of_embarkation integer conversion stage,
#   3. exclude _id and the listed identifying fields from the output.
cursor = titanic.aggregate([
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    gender_and_point_of_embarkation_conversion_stage,
    {"$project": {"_id": 0, "ticket_number": 0, "name": 0, "passenger_id": 0, "cabin": 0}},
])

In [56]:
# Drain the aggregation cursor into an in-memory list of documents
titanic_data = [document for document in cursor]

In [57]:
# Build a DataFrame from the list of aggregated documents
df = json_normalize(data=titanic_data)

  


In [58]:
# Show the last five rows as a quick sanity check of the loaded columns
df.tail(5)

Unnamed: 0,survived,class,age,siblings_spouse,parents_children,fare_paid,gender,point_of_embarkation
707,1,1,39.0,1,1,83.1583,0,2
708,1,1,30.0,0,0,31.0,0,2
709,1,1,51.0,0,0,26.55,1,0
710,0,2,21.0,1,0,11.5,1,0
711,1,2,27.0,1,0,13.8583,0,2


In [59]:
# Feature matrix: every column except 'survived' (the inputs we predict from)
df_x = df.drop(columns=['survived'])

In [60]:
# Target vector: just the 'survived' column (the value we want to predict)
df_y = df.loc[:, 'survived']

In [61]:
# Ordinary least squares linear regression estimator (intercept fitted, the default)
reg = linear_model.LinearRegression(fit_intercept=True)

In [62]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state is fixed so the split is the same on every run.
(x_train, x_test, y_train, y_test) = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=0,
)

In [72]:
# Train the linear model on the training features and targets
reg.fit(X=x_train, y=y_train)

LinearRegression()

In [73]:
# Predictions of the trained model for the held-out test features
reg.predict(X=x_test)

array([ 1.08422129e+00,  3.26433592e-01,  6.86829020e-01,  1.83203825e-01,
        3.54733676e-01,  1.54214573e-01,  3.75214252e-01,  1.46604504e-01,
        9.58592528e-01,  2.06632471e-01,  7.89493435e-01,  1.02229163e+00,
       -1.63226218e-03,  6.93683545e-02,  5.03288891e-01,  5.45642594e-01,
        4.05174193e-01,  1.31618613e-01,  3.41388092e-01,  5.53305641e-02,
        4.82269801e-02,  1.05191344e-01,  1.41559110e-01,  1.11779821e+00,
        6.23776889e-02,  2.22158542e-01,  1.18698228e-01,  7.66312342e-02,
        4.59646838e-01,  1.04603978e-01,  4.66494115e-01,  3.79552856e-01,
        8.20390496e-02,  3.39612485e-01,  4.16731967e-01,  5.07312894e-01,
        3.76773715e-01,  1.46749424e-01,  2.00384810e-02, -3.63537487e-02,
        8.93120917e-01,  7.71266367e-01,  8.39762074e-03,  6.98463121e-01,
        3.71003185e-01,  6.05815290e-01,  3.46988954e-01,  8.70426256e-02,
        2.55174953e-01,  6.99557203e-02,  1.69747609e-01,  3.40527842e-01,
        9.61889745e-01,  

In [74]:
# Mean squared error of the test-set predictions
# (expect roughly 0.13-0.15; this is a raw error value, not a percentage)
mean_squared_error(y_true=y_test, y_pred=reg.predict(x_test))

0.1473083350242894

In [75]:
# Hypothetical passenger to score with the trained model. The values are given
# positionally in the feature-column order (the df.tail() column order with
# 'survived' removed), so each entry is labelled with the column it feeds.
fake_passenger = [[
    1,   # class
    25,  # age
    1,   # siblings_spouse
    0,   # parents_children
    45,  # fare_paid
    1,   # gender: the integer assigned to 'male'
    2,   # point_of_embarkation: the integer assigned to 'C'
]]

In [76]:
# Predicted value for the hypothetical passenger; use it to verify the exercise
reg.predict(X=fake_passenger)

array([0.55896469])