In [1]:
# !wget -O yelp_business.json.gz http://thedataincubator.s3.amazonaws.com/coursedata/mldata/yelp_train_academic_dataset_business.json.gz

In [59]:
# importing modules
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pickle
import dill
from sklearn.feature_extraction import DictVectorizer
import time
import toolz
from sklearn.pipeline import FeatureUnion, Pipeline
import gzip, ujson, re, json
from sklearn import cross_validation
from sklearn import metrics, base, neighbors, grid_search
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy import sparse
from sklearn import linear_model

In [19]:
### Read json data ###
# Every line of the gzipped file is parsed as one JSON document; the parsed
# records become the rows of `data`.
with gzip.open('yelp_business.json.gz') as f:
    data = pd.DataFrame(list(map(json.loads, f)))
# X is the complete frame (it still contains the 'stars' column); y is the
# 'stars' column on its own.
X = data
y = data['stars']

In [20]:
### Customized transformers ###
class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    """Transformer that picks out the entries of its input named by ``keys``.

    Nothing is learned in ``fit``; ``transform`` simply indexes the input
    with the stored ``keys``.
    """

    def __init__(self, keys):
        # Kept under the same name as the constructor argument.
        self.keys = keys

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.keys``."""
        wanted = self.keys
        return X[wanted]


class CategoryTransformer(base.BaseEstimator, base.TransformerMixin):
    """Turn the first field of every row into a ``{item: 1}`` dictionary.

    The output is a list with one dict per row of ``X.values``; each dict has
    a key for every element found in that row's first field, all mapped to 1.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a list of ``{element: 1}`` dicts, one per row of ``X.values``."""
        return [dict.fromkeys(row[0], 1) for row in X.values]


class AttributeTransformer(base.BaseEstimator, base.TransformerMixin):
    """Flatten the (possibly nested) dict held in the first field of each row.

    ``transform`` returns a list with one flat dict per row of ``X.values``.
    Nested keys are joined with ``'_'`` and every leaf value is encoded as
    1 when it compares equal to ``True`` and 0 otherwise.
    """

    def __init__(self):
        pass

    def _flatten(self, d, parent_key='', sep='_'):
        """Flatten a nested dictionary into a single-level one.

        Parameters
        ----------
        d : mapping
            Dictionary to flatten; values that are mutable mappings are
            flattened recursively.
        parent_key : str
            Prefix for the keys produced at this level ('' at the top level).
        sep : str
            Separator placed between a parent key and a child key.

        Returns
        -------
        dict
            ``{joined_key: 0 or 1}``.
        """
        # `collections.MutableMapping` was removed in Python 3.10; take the
        # ABC from `collections.abc` when it exists and fall back to the old
        # location on interpreters that predate it.
        try:
            from collections.abc import MutableMapping
        except ImportError:
            from collections import MutableMapping

        items = []
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, MutableMapping):
                items.extend(self._flatten(v, new_key, sep=sep).items())
            else:
                # Deliberately `==` rather than `is`: any value equal to True
                # (e.g. the integer 1) is encoded as 1; everything else,
                # including strings and other numbers, is encoded as 0.
                new_v = 1 if v == True else 0
                items.append((new_key, new_v))
        return dict(items)

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return one flattened dict per row of ``X.values``."""
        return [self._flatten(record[0]) for record in X.values]


class ModelTransformer(base.BaseEstimator, base.TransformerMixin):
    """Adapter exposing a wrapped model's predictions through ``transform``.

    ``fit`` forwards its arguments to ``model.fit``; ``transform`` returns
    ``model.predict(X)`` wrapped in a DataFrame.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        """Fit the wrapped model with the given arguments and return ``self``."""
        wrapped = self.model
        wrapped.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the wrapped model's predictions for ``X`` as a DataFrame.

        ``transform_params`` is accepted but not used.
        """
        predictions = self.model.predict(X)
        return pd.DataFrame(predictions)

# Question 1

In [72]:
### Q1: city_model ###
class CityEstimator(base.BaseEstimator, base.RegressorMixin):
    """Predict the target of a row as the mean target of its city.

    ``fit`` stores the per-city mean of the target in ``mean_by_city``;
    ``predict`` looks every row's city up in that table and falls back to the
    mean of the per-city means for cities that were not seen during fitting.
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """Compute the per-city mean of ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``'city'`` column.
        y : pandas.Series
            Target values; concatenated column-wise with ``X``.

        Returns
        -------
        self
        """
        df = pd.concat([X, y], axis=1)
        self.mean_by_city = df.groupby('city').mean()
        return self

    def predict(self, X):
        """Return one prediction per row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``'city'`` column.

        Returns
        -------
        numpy.ndarray
            1-D float array with ``len(X)`` entries. A one-row input yields a
            length-1 array.
        """
        # First column of the fitted table; it is the only column when the
        # estimator is fitted on a frame that holds just 'city' plus the target.
        city_means = self.mean_by_city.iloc[:, 0]
        # Fallback for unseen (or missing) cities: the mean of the per-city means.
        fallback = float(city_means.mean())
        # Vectorised lookup: cities absent from the table map to NaN and are
        # then replaced by the fallback, so no exception handling is needed.
        predicted = X['city'].map(city_means).fillna(fallback)
        return np.asarray(predicted, dtype=float)

In [75]:
# Select the 'city' column and feed it to the per-city mean estimator.
city_pipeline = Pipeline([
    ('trans', ColumnSelectTransformer(['city'])),
    ('est', CityEstimator())
])
city_pipeline.fit(X, y)
# Serialize inside a `with` block so the file handle is closed (and the
# pickle flushed to disk) as soon as the dump finishes.
with open('city_model.txt', 'wb') as model_file:
    dill.dump(city_pipeline, model_file)
#dill.dump(city_pipeline, open('city_model.pkl', 'wb'))

In [76]:
### Q2: lat_long_model ###
# k-nearest-neighbours regression on (latitude, longitude); the number of
# neighbours is chosen by grid search over shuffled 80/20 splits.
lat_long_pipeline = Pipeline([
    ('trans', ColumnSelectTransformer(['latitude', 'longitude'])),
    ('est', neighbors.KNeighborsRegressor())
])
cv = cross_validation.ShuffleSplit(len(y), n_iter=20, test_size=0.2, random_state=42)
# A concrete list (rather than a lazy `range` object on Python 3) keeps the
# grid acceptable to the parameter-grid validation of the legacy grid_search module.
knn_param_grid = { "est__n_neighbors": list(range(4, 24, 4)) }
knn_regression_cv = grid_search.GridSearchCV(lat_long_pipeline,
                                             param_grid=knn_param_grid, cv=cv,
                                             scoring="mean_squared_error")
knn_regression_cv.fit(X, y)
# Serialize inside a `with` block so the file handle is always closed.
with open('lat_long_model.txt', 'wb') as model_file:
    dill.dump(knn_regression_cv.best_estimator_, model_file)
#dill.dump(knn_regression_cv.best_estimator_, open('lat_long_model.pkl','wb'))

In [77]:
### Q3: category_model ###
# Categories -> {category: 1} dicts -> sparse indicator matrix -> TF-IDF
# weighting -> ridge regression, with alpha chosen by grid search.
category_pipeline = Pipeline([
    ('trans', ColumnSelectTransformer(['categories'])),
    ('cat_trans', CategoryTransformer()),
    ('vect', DictVectorizer()),
    ('tfidf_vect', TfidfTransformer()),
    ('est', linear_model.Ridge())
])
cv = cross_validation.ShuffleSplit(len(y), n_iter=20, test_size=0.2, random_state=42)
param_grid = { "est__alpha": np.logspace(-1, 1, 5) }
ridge_cv = grid_search.GridSearchCV(category_pipeline,
                                    param_grid=param_grid, cv=cv,
                                    scoring="mean_squared_error")
ridge_cv.fit(X, y)
# Serialize inside a `with` block so the file handle is always closed.
with open('category_model_gridsearch.txt', 'wb') as model_file:
    dill.dump(ridge_cv.best_estimator_, model_file)
#dill.dump(ridge_cv.best_estimator_, open('category_model_gridsearch.pkl','wb'))

In [78]:
### Q4: attribute_knn_model ###
# NOTE: despite the "knn" in the model/file name, the estimator fitted here is
# a ridge regression on the flattened attribute indicators.
attribute_pipeline = Pipeline([
    ('trans', ColumnSelectTransformer(['attributes'])),
    ('cat_trans', AttributeTransformer()),
    ('vect', DictVectorizer()),
    ('est', linear_model.Ridge())
])
ridge_param_grid = { "est__alpha": np.logspace(-6., -.3, 20) }
# `cv` is the ShuffleSplit object created in the previous (Q3) cell.
attribute_ridge_cv = grid_search.GridSearchCV(attribute_pipeline,
                                              param_grid=ridge_param_grid, cv=cv,
                                              scoring="mean_squared_error")
attribute_ridge_cv.fit(X, y)
# Serialize inside a `with` block so the file handle is always closed.
with open('attribute_knn_model.txt', 'wb') as model_file:
    dill.dump(attribute_ridge_cv.best_estimator_, model_file)
#dill.dump(attribute_ridge_cv.best_estimator_, open('attribute_knn_model.pkl','wb'))

In [80]:
### Q5: full_model ###
# Three feature branches are computed side by side and concatenated:
#   * the prediction of a 20-neighbour KNN regressor on latitude/longitude,
#   * TF-IDF weighted category indicators,
#   * flattened attribute indicators.
# A ridge regression is then fitted on the concatenated features.
full_pipeline = Pipeline([
    ('feature_union', FeatureUnion([
        ('lat_long_feature', Pipeline([
            ('col_select', ColumnSelectTransformer(['latitude', 'longitude'])),
            ('knn', ModelTransformer(neighbors.KNeighborsRegressor(n_neighbors=20)))
            ])),
        ('category_feature', Pipeline([
            ('col_select', ColumnSelectTransformer(['categories'])),
            ('cat_trans', CategoryTransformer()),
            ('vect', DictVectorizer()),
            ('tfidf_vect', TfidfTransformer()),
            ])),
        ('attribute_feature', Pipeline([
            ('col_select', ColumnSelectTransformer(['attributes'])),
            ('attr_transformer', AttributeTransformer()),
            ('vectorizer', DictVectorizer())
            ]))
        ])),
    ('est', linear_model.Ridge())
])
ridge_param_grid = { "est__alpha": np.logspace(-6., -.3, 20) }
# `cv` is the ShuffleSplit object created in an earlier cell.
full_ridge_cv = grid_search.GridSearchCV(full_pipeline,
                                         param_grid=ridge_param_grid, cv=cv,
                                         scoring="mean_squared_error")
full_ridge_cv.fit(X, y)
# Serialize inside a `with` block so the file handle is always closed.
with open('full_model_gridsearch.txt', 'wb') as model_file:
    dill.dump(full_ridge_cv.best_estimator_, model_file)
#dill.dump(full_ridge_cv.best_estimator_, open('full_model_gridsearch.pkl','wb'))

In [82]:
!pwd

/home/adelard/mlearning


In [84]:
# Reload the pickled city model. The `with` block closes the file even if
# unpickling raises. NOTE: unpickling executes code stored in the file, so
# only load files produced by this notebook.
with open('city_model.txt', 'rb') as f:
    city_model = dill.load(f)

In [85]:
# Reload the pickled latitude/longitude model. The `with` block closes the
# file even if unpickling raises. NOTE: unpickling executes code stored in
# the file, so only load files produced by this notebook.
with open('lat_long_model.txt', 'rb') as f:
    lat_long_model = dill.load(f)

In [86]:
# Reload the pickled attribute model. The `with` block closes the file even
# if unpickling raises. NOTE: unpickling executes code stored in the file, so
# only load files produced by this notebook.
with open('attribute_knn_model.txt', 'rb') as f:
    attribute_knn_model = dill.load(f)

In [87]:
# Reload the pickled category model. The `with` block closes the file even
# if unpickling raises. NOTE: unpickling executes code stored in the file, so
# only load files produced by this notebook.
with open('category_model_gridsearch.txt', 'rb') as f:
    category_model_gridsearch_model = dill.load(f)

In [88]:
# Reload the pickled full model. The `with` block closes the file even if
# unpickling raises. NOTE: unpickling executes code stored in the file, so
# only load files produced by this notebook.
with open('full_model_gridsearch.txt', 'rb') as f:
    full_model_gridsearch = dill.load(f)

In [69]:
!pwd

/home/adelard/mlearning


In [70]:
!pwd

/home/adelard/mlearning
