Lab
===
In Pairs
------------
### 1. Work through the getting started guide at http://avro.apache.org/docs/current/gettingstartedpython.html

In [1]:
import numpy as np
import pandas as pd
import json
import avro.io
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

### 2. Generate sample data
Consider the following data:

In [2]:
data = [{'pedigree': {'true_as_of_secs': 1234567890},
           'dataunit': {'page_property': {'id': {'url': 'http://mysite.com/'},
                                    'property': {'page_views': 1}}},
           },
        {"pedigree": {"true_as_of_secs": 1234567891},
           "dataunit": {"equiv": {"id1": {"cookie": "ABCDE"},
                                  "id2": {"user_id": 123}}},
           },
        {"pedigree": {"true_as_of_secs": 1234567892},
           "dataunit": {"page_view": {"person": {"cookie": "ABCDE"},
                                      "page": {"url": "http://mysite.com/"},
                                      "nonce": 1234567890987654321}}
           },
        {"pedigree": {'true_as_of_secs': 1234567893},
           "dataunit": {"person_property": {"id": {"cookie": "ABCDE"},
                                            "property": {"full_name": "Alessandro"}}}
           },
        {"pedigree": {'true_as_of_secs': 1234567894},
           "dataunit": {"person_property": {"id": {"user_id": 123},
                                            "property": {"gender": "MALE"}}}
           },
        {"pedigree": {'true_as_of_secs': 1234567895},
           "dataunit": {"person_property": {"id": {"user_id": 123},
                                            "property": {"location": {"city" : "San Francisco", 
                                                                      "state": "CA"}}}}
           }]

In [3]:
from time import time

time()

1442704601.405036

In [4]:
from time import time

def gen_cookie():  
    cookie = 987654321
    while True:
        yield str(cookie)
        cookie +=1        
        

def firstn(n):
    num = 0
    cookie = gen_cookie()
    while num < n:
        yield [{'pedigree': {'true_as_of_secs': int(time())},
           'dataunit': {'page_property': {'id': {'url': 'http://mysite.com/'},
                                    'property': {'page_views': 1}}},
               },
            {"pedigree": {"true_as_of_secs": int(time())},
               "dataunit": {"equiv": {"id1": {"cookie": cookie.next()},
                                      "id2": {"user_id": 123}}},
               },
            {"pedigree": {"true_as_of_secs": int(time())},
               "dataunit": {"page_view": {"person": {"cookie": cookie.next()},
                                          "page": {"url": "http://mysite.com/"},
                                          "nonce": 1234567890987654321}}
               },
            {"pedigree": {'true_as_of_secs': int(time())},
               "dataunit": {"person_property": {"id": {"cookie": cookie.next()},
                                                "property": {"full_name": "Alessandro"}}}
               },
            {"pedigree": {'true_as_of_secs': int(time())},
               "dataunit": {"person_property": {"id": {"user_id": 123},
                                                "property": {"gender": "MALE"}}}
               },
            {"pedigree": {'true_as_of_secs': int(time())},
               "dataunit": {"person_property": {"id": {"user_id": 123},
                                                "property": {"location": {"city" : "San Francisco", 
                                                                          "state": "CA"}}}}
               }]
        num += 1

In [5]:
n = 2
gen = firstn(n)
gen_data =  np.concatenate([gen.next() for _ in range(n)])

In [6]:
gen_data[0]

{'dataunit': {'page_property': {'id': {'url': 'http://mysite.com/'},
   'property': {'page_views': 1}}},
 'pedigree': {'true_as_of_secs': 1442704601}}

Write a generator that will yield `n` data and validate it against the schema below:

In [7]:
%%writefile schema.avsc
[
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "Pedigree",
        "fields": [{"name": "true_as_of_secs", "type": "int"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonID1",
        "fields": [{"name": "cookie", "type": "string"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonID2",
        "fields": [{"name": "user_id", "type": "long"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageID",
        "fields": [{"name": "url", "type": "string"}]
    },
    
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageProperty",
        "fields": [
            {
                "name": "id",
                "type": "PageID"
            },
            {
                "name": "property",
                "type": {
                    "type": "record",
                    "name": "PagePropertyValue",
                    "fields": [{"name": "page_views", "type": "int"}]
                }
            }
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonProperty",
        "fields": [
            {
                "name": "id",
                "type": [
                    "PersonID1",
                    "PersonID2"
                ]
            },

            {
                "name": "property",
                "type": [
                    {
                        "type": "record",
                        "name": "PersonPropertyValue1",
                        "fields": [{"name": "full_name", "type": "string"}]
                    },
                    {
                        "type": "record",
                        "name": "PersonPropertyValue2",
                        "fields": [
                            {
                                "name": "gender", 
                                "type": {
                                    "type": "enum",
                                    "name": "GenderType",
                                    "symbols": ["MALE", "FEMALE"]
                                }
                            }
                        ]
                    },
                    {
                        "type": "record",
                        "name": "PersonPropertyValue3",
                        "fields": [
                            {
                                "name": "location", 
                                "type": {
                                    "type": "record",
                                    "name": "Location",
                                    "fields": [
                                        {"name": "city", "type": ["string", "null"]},
                                        {"name": "state", "type": ["string", "null"]},
                                        {"name": "country", "type": [ "string","null"]}
                                    ]
                                }
                            }
                        ]
                    },
                    {
                        "type": "record",
                        "name": "PersonPropertyValue4",
                        "fields": [{"name": "age", "type": "int"}] 
                    }
                ]
            }
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "EquivEdge",
        "fields": [
            {"name": "id1", "type": ["PersonID1", "PersonID2"]},
            {"name": "id2", "type": ["PersonID1", "PersonID2"]}
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageViewEdge",
        "fields": [
            {"name": "person", "type": ["PersonID1", "PersonID2"]},
            {"name": "page", "type": "PageID"},
            {"name": "nonce", "type": "long"}
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageLinkEdge",
        "fields": [
            {"name": "source", "type": "string"},
            {"name": "target", "type": "string"}  
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "Data",
        "fields": [
            {
                "name": "pedigree",
                "type": "Pedigree"
            },
            {
                "name": "dataunit",
                "type": [
                    {
                        "type": "record",
                        "name": "DataUnit1",
                        "fields": [{"name": "person_property", "type": "PersonProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit2",
                        "fields": [{"name": "page_property", "type": "PageProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit3",
                        "fields": [{"name": "equiv", "type": "EquivEdge"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit4",
                        "fields": [{"name": "page_view", "type": "PageViewEdge"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit5",
                        "fields": [{"name": "page_link", "type": "PageLinkEdge"}]
                    }
                ]
            }
        ]
    }
]

Overwriting schema.avsc


In [8]:
# have to clear comments, otherwise will throw parsing error
schema = avro.schema.parse(open("schema.avsc").read())

In [9]:
def test_good_data(datum, schema=schema):
    return avro.io.validate(schema, datum)
map(test_good_data, gen_data)

[True, True, True, True, True, True, True, True, True, True, True, True]

### 3. Extend the Schema

#### a) Allow a new person property called age which accepts integers  
1. Add this to your generator (these data should fail validation)
2. Adjust your schema to allow these new data.
3. Invent negative examples to make sure your schema is not too permissive.

In [10]:
age_example = {"pedigree": {'true_as_of_secs': 1234567896},
               "dataunit": {"person_property": {"id": {"user_id": 9876543210},
                                          "property": {"age": 23}}}
               }

In [11]:
avro.io.validate(schema, age_example)

True

In [12]:
age_example_neg = {"pedigree": {'true_as_of_secs': 1234567896},
               "dataunit": {"person_property": {"id": {"user_id": 9876543210},
                                          "property": {"age": 0.23}}}
                   }

In [13]:
avro.io.validate(schema, age_example_neg)

False

#### b) Allow links between pages
1. Add this to your generator (these data should fail validation)
2. Adjust your schema to allow these new data.
3. Invent negative examples to make sure your schema is not too permissive.

In [14]:
linked_edge_example = {"pedigree": {'true_as_of_secs': 1234567896},
                       "dataunit": {"page_link": {"source": "http://mysite.com/blog",
                                                  "target": "http://mysite.com/"}}
               }

In [15]:
avro.io.validate(schema, linked_edge_example)

True

On Your Own
------------
Define a fact-based graph schema based on the system you described yesterday.
1. Use [Gliffy](https://www.gliffy.com/) to map it out.
2. Write sample data and tests to see if your sample data fits. *Also generate examples that should fail.*
3. Use [Avro](http://avro.apache.org/docs/current/index.html) to define your schema and test it

##Note:
    All fields specifed under NodePropertyValue_i must be filled in otherwise 
    data check will fail. This is why we think of several vaild ways in which 
    data can be partially inserted and define those ways:
    
                        NodePropertyValue_i+1,
                        NodePropertyValue_i+2, 
                        NodePropertyValue_i+3, 
                                ... , 
                        NodePropertyValue_i+n 

##Nodes

In [16]:
ItemID = [{
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "ItemID",
        "fields": [
            {"name": "item_id", "type": "int"}
        ]
    }]

In [17]:
UserID = [{
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "UserID",
        "fields": [
            {"name": "userID", "type": "int"}
        ]
    }]

"Purcahser" and "Seller" are implicit titles based on explict edge calls.

This is why we only need a UserID node and NOT Purchaser or Seller node

####Think of Fact Based Models as logging event instances

In [18]:
UserProperties = [
    {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "Location",
        "fields": [
            {"name": "city",    "type": "string"},
            {"name": "state",   "type": "string"},
            {"name": "country", "type": "string"}
        ]
    },
    
    {
        "namespace": "amazon.avsc",
        "type": "enum",
        "name": "GenderType",
        "symbols": ["MALE", "FEMALE"]
    },
    
    {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "UserName",
        "fields": [
            {"name": "user_name", "type": "string"}
        ]
    },
    
    {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "UserProperty",
        "fields": [
            {"name": "userID","type": "UserID"},
            
            {
                "name": "property",
                "type": [
                    {
                        "type": "record",
                        "name": "UserPropertyValue1",
                        "fields": [{"name": "full_name", "type": "UserName"}]
                    },
                    {
                        "type": "record",
                        "name": "UserPropertyValue2",
                        "fields": [{"name": "gender",    "type": "GenderType"}]
                    },
                    {
                        "type": "record",
                        "name": "UserPropertyValue3",
                        "fields": [{"name": "location",  "type": "Location"}]
                    }
                ]
            }
        ]
    }]

In [19]:
ItemProperties = [
    {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "ItemName",
        "fields": [
            {"name": "item_name", "type": "string"}
        ]
    },
    
    {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "ItemDescription",
        "fields": [
            {"name": "item_description", "type": "string"}
        ]
    },
    {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "ItemProperty",
        "fields": [
            {
                "name": "item_id",
                "type": "ItemID"
            },
            {
                "name": "property",
                "type": [
                    {
                        "type": "record",
                        "name": "ItemPropertyValue1",
                        "fields": [{"name": "item_name", "type": "ItemName"}]
                    },
                    {
                        "type": "record",
                        "name": "ItemPropertyValue2",
                        "fields": [{"name": "description", "type": "ItemDescription"}]
                    }
                ]
            }
        ]
    }]

In [20]:
PurchaseEdge = {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "PurchaseEdge",
        "fields": [
            {"name": "userID", "type": "UserID"},
            {"name": "item_id", "type": "ItemID"},
            {"name": "purchase_time", "type": "int"}
        ]
    }

In [21]:
ReviewEdge = {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "ReviewEdge",
        "fields": [
            {"name": "userID",   "type": "UserID"},
            {"name": "item_id",   "type": "ItemID"},
            {"name": "review", "type": ["null", "string"]}
        ]
    }

In [22]:
Data = [
    {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "Pedigree",
        "fields": [{"name": "true_as_of_secs", "type": "int"}]
    },
    {
        "namespace": "amazon.avsc",
        "type": "record",
        "name": "Data",
        "fields": [
            {
                "name": "pedigree",
                "type": "Pedigree"
            },
            {
                "name": "dataunit",
                "type": [
                    {
                        "type": "record",
                        "name": "DataUnit1",
                        "fields": [{"name": "user_property", "type": "UserProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit2",
                        "fields": [{"name": "item_property", "type": "ItemProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit3",
                        "fields": [{"name": "purchase_edge", "type": "PurchaseEdge"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit4",
                        "fields": [{"name": "review_view", "type": "ReviewEdge"}]
                    }
                ]
            }
        ]
    }
]

##Need to create data structures for each different type of entry
    i.e.) userData, itemData, EdgeData, ...
    I think this will work, anyways

In [23]:
Nodes = UserID + ItemID
Edges = [PurchaseEdge, ReviewEdge]

In [24]:
schema = avro.schema.parse(json.dumps(Nodes + Edges +  UserProperties +  ItemProperties + Data))

In [25]:
from time import time
from nltk.corpus import names
import random
       
def gen_userID():  
    userID = 1
    while True:
        yield str(userID)
        userID +=1    
        
def gen_item_id():  
    user_id = 100
    while True:
        yield str(user_id)
        user_id +=1 
    

def get_gender():
    if random.random() > 0.51:
        return "MALE"
    else:
        return "FEMALE"        
        
def get_city_state():
    data = pd.read_csv("/Users/Alexander/Downloads/AdWords API Location Criteria 2015-05-29.csv")
    city_state_country = data[data["Country Code"] == "US"]["Canonical Name"].values
    random.shuffle(city_state_country)
    while True:
        random.choice(city_state_country)
        state = city_state_country[0].split(",")[-2]
        city = city_state_country[0].split(",")[-3]
        yield city, state        
        
def gen_Users(n):
    Names = names.words()
    random.shuffle(Names)
    userID = gen_userID()
    city_state = get_city_state()
    num = 0
    while num < n:
        city,state = city_state.next()
        user_id = userID.next()
        
        yield [{'pedigree': {'true_as_of_secs': int(time())},
            'dataunit': {'person_property': {'id': {'userID': user_id},
                                    'property': {'user_name': random.choice(Names)}}},
               },
            {"pedigree": {'true_as_of_secs': int(time())},
               "dataunit": {"person_property": {"id": {"userID": user_id},
                                                "property": {"gender": get_gender()}}}
               },
            {"pedigree": {'true_as_of_secs': int(time())},
               "dataunit": {"person_property": {"id": {"userID": user_id},
                                                "property": {"location": {"city" :  city, 
                                                                          "state": state,
                                                                         "country": "USA"}}}}
            }]
        num += 1

In [26]:
gen_users = gen_Users(5)

In [27]:
dat = [user for user in gen_users.next()]

In [28]:
gen_users.next()[0]

{'dataunit': {'person_property': {'id': {'userID': '2'},
   'property': {'user_name': u'Rene'}}},
 'pedigree': {'true_as_of_secs': 1442704617}}

In [29]:
def test_good_data(datum, schema=schema):
    return avro.io.validate(schema, datum)
map(test_good_data, dat)

[False, False, False]

In [30]:
DAT = {"pedigree": {'true_as_of_secs': 12344},
       "dataunit": {"person_property": {"id": {"userID": 727272},
                                  "property": {"gender": "MALE"}}}}

In [31]:
test_good_data(DAT)

False

##Note: just changed 'namespace' in schema

In [32]:
#schema = avro.schema.parse(open("user.avsc").read())

writer = DataFileWriter(open("amazon.avsc", "w"), DatumWriter(), schema)

writer.append({'dataunit': {'page_property': {'id': {'userID': '1'},
               'property': {'user_name': u'Ethelin'}}},
               'pedigree': {'true_as_of_secs': 1442011805}})
writer.close()

reader = DataFileReader(open("amazon.avsc", "r"), DatumReader())

for user in reader:
    print user
reader.close()

AvroTypeException: The datum {'dataunit': {'page_property': {'property': {'user_name': u'Ethelin'}, 'id': {'userID': '1'}}}, 'pedigree': {'true_as_of_secs': 1442011805}} is not an example of the schema [
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "UserID", 
    "fields": [
      {
        "type": "int", 
        "name": "userID"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "ItemID", 
    "fields": [
      {
        "type": "int", 
        "name": "item_id"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "PurchaseEdge", 
    "fields": [
      {
        "type": "amazon.avsc.UserID", 
        "name": "userID"
      }, 
      {
        "type": "amazon.avsc.ItemID", 
        "name": "item_id"
      }, 
      {
        "type": "int", 
        "name": "purchase_time"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "ReviewEdge", 
    "fields": [
      {
        "type": "amazon.avsc.UserID", 
        "name": "userID"
      }, 
      {
        "type": "amazon.avsc.ItemID", 
        "name": "item_id"
      }, 
      {
        "type": [
          "null", 
          "string"
        ], 
        "name": "review"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "Location", 
    "fields": [
      {
        "type": "string", 
        "name": "city"
      }, 
      {
        "type": "string", 
        "name": "state"
      }, 
      {
        "type": "string", 
        "name": "country"
      }
    ]
  }, 
  {
    "symbols": [
      "MALE", 
      "FEMALE"
    ], 
    "type": "enum", 
    "namespace": "amazon.avsc", 
    "name": "GenderType"
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "UserName", 
    "fields": [
      {
        "type": "string", 
        "name": "user_name"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "UserProperty", 
    "fields": [
      {
        "type": "amazon.avsc.UserID", 
        "name": "userID"
      }, 
      {
        "type": [
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "UserPropertyValue1", 
            "fields": [
              {
                "type": "amazon.avsc.UserName", 
                "name": "full_name"
              }
            ]
          }, 
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "UserPropertyValue2", 
            "fields": [
              {
                "type": "amazon.avsc.GenderType", 
                "name": "gender"
              }
            ]
          }, 
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "UserPropertyValue3", 
            "fields": [
              {
                "type": "amazon.avsc.Location", 
                "name": "location"
              }
            ]
          }
        ], 
        "name": "property"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "ItemName", 
    "fields": [
      {
        "type": "string", 
        "name": "item_name"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "ItemDescription", 
    "fields": [
      {
        "type": "string", 
        "name": "item_description"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "ItemProperty", 
    "fields": [
      {
        "type": "amazon.avsc.ItemID", 
        "name": "item_id"
      }, 
      {
        "type": [
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "ItemPropertyValue1", 
            "fields": [
              {
                "type": "amazon.avsc.ItemName", 
                "name": "item_name"
              }
            ]
          }, 
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "ItemPropertyValue2", 
            "fields": [
              {
                "type": "amazon.avsc.ItemDescription", 
                "name": "description"
              }
            ]
          }
        ], 
        "name": "property"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "Pedigree", 
    "fields": [
      {
        "type": "int", 
        "name": "true_as_of_secs"
      }
    ]
  }, 
  {
    "namespace": "amazon.avsc", 
    "type": "record", 
    "name": "Data", 
    "fields": [
      {
        "type": "amazon.avsc.Pedigree", 
        "name": "pedigree"
      }, 
      {
        "type": [
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "DataUnit1", 
            "fields": [
              {
                "type": "amazon.avsc.UserProperty", 
                "name": "user_property"
              }
            ]
          }, 
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "DataUnit2", 
            "fields": [
              {
                "type": "amazon.avsc.ItemProperty", 
                "name": "item_property"
              }
            ]
          }, 
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "DataUnit3", 
            "fields": [
              {
                "type": "amazon.avsc.PurchaseEdge", 
                "name": "purchase_edge"
              }
            ]
          }, 
          {
            "namespace": "amazon.avsc", 
            "type": "record", 
            "name": "DataUnit4", 
            "fields": [
              {
                "type": "amazon.avsc.ReviewEdge", 
                "name": "review_view"
              }
            ]
          }
        ], 
        "name": "dataunit"
      }
    ]
  }
]

##Generate Bank of User Names

In [None]:
from nltk.corpus import names
import random

In [None]:
Names = names.words()
random.shuffle(Names)

In [None]:
random.choice(Names)

In [None]:
def get_gender():
    if random.random() > 0.51:
        return "MALE"
    else:
        return "FEMALE"

In [None]:
data = pd.read_csv("/Users/Alexander/Downloads/AdWords API Location Criteria 2015-05-29.csv")

In [None]:
data.head()

In [None]:
city_state_country = data[data["Country Code"] == "US"]["Canonical Name"].values

In [None]:
city_state_country[0].split(",")

In [None]:
city_state_country.shape

In [None]:
random.shuffle(city_state_country)
random.choice(city_state_country)

In [None]:
state = city_state_country[0].split(",")[-2]

In [None]:
city_state_country[0].split(",")[-3]

In [None]:

def get_city_state(data):
    data = pd.read_csv("/Users/Alexander/Downloads/AdWords API Location Criteria 2015-05-29.csv")
    city_state_country = data[data["Country Code"] == "US"]["Canonical Name"].values
    random.shuffle(city_staet_country)
    while True:
        random.choice(city_staet_country)
        state = city_staet_country[0].split(",")[-2]
        city = city_state_country[0].split(",")[-3]
        yield city, state