# In [None]:
'''+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'''
'''Importing modules'''
import pandas
import numpy as np
import json
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pickle
import dill
import gzip
'''***************************************************************************************************************************'''

'''Initializing variables and empty lists'''
# Start with an empty record list and a zeroed line counter.
dict_list, lines_iter = [], 0

'''+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'''
'''Initializing class to create transformer selecting columns in dataframe'''
class trans_COLUMNSELECT(BaseEstimator, TransformerMixin):
    """Transformer that extracts the coordinate columns from one record.

    ``transform`` wraps a single record in a one-row DataFrame and returns
    its ``'longitude'`` and ``'latitude'`` columns, in that order, as a 2-D
    array. The transformer keeps no state.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Present so the class honours the scikit-learn transformer contract:
        ``TransformerMixin.fit_transform`` and ``Pipeline.fit`` both call
        ``fit`` before ``transform``.
        """
        return self

    def transform(self, record):
        """Return the longitude/latitude of ``record`` as a 2-D array.

        Parameters
        ----------
        record : mapping
            A single record providing at least the keys ``'longitude'`` and
            ``'latitude'``.

        Returns
        -------
        numpy.ndarray
            One row per wrapped record, columns ``[longitude, latitude]``.

        Raises
        ------
        KeyError
            If either coordinate key is missing from ``record``.
        """
        # The record is wrapped in a list so pandas builds exactly one row.
        record_transform = pandas.DataFrame([record])
        # ``.values`` replaces ``DataFrame.as_matrix()``, which was deprecated
        # in pandas 0.23 and removed in pandas 1.0.
        longitude_latitude_mx = record_transform[['longitude', 'latitude']].values
        return longitude_latitude_mx

'''Initializing class containing a machine learning model (KNN)'''
class mlm_KNN(BaseEstimator, RegressorMixin):
    """Thin scikit-learn style wrapper around ``KNeighborsRegressor``.

    Parameters
    ----------
    n_neighbors : int, default 120
        Number of neighbours used by the wrapped regressor. Exposed as a
        constructor argument (and stored under the same name) so that
        ``get_params`` / ``set_params`` / ``clone`` can see and tune it.

    Attributes
    ----------
    knn : KNeighborsRegressor
        The wrapped estimator that does the actual fitting and predicting.
    """

    def __init__(self, n_neighbors=120):
        self.n_neighbors = n_neighbors
        self.knn = KNeighborsRegressor(n_neighbors=n_neighbors)

    def fit(self, X, y):
        """Fit the wrapped regressor on ``X`` / ``y`` and return ``self``."""
        # Re-sync first so a value changed through set_params() after
        # construction is the one actually used for fitting.
        self.knn.set_params(n_neighbors=self.n_neighbors)
        self.knn.fit(X, y)
        return self

    def best_params_(self):
        """Return the hyper-parameters of the wrapped estimator as a dict.

        A plain ``KNeighborsRegressor`` has no ``best_params_`` attribute
        (only search estimators such as ``GridSearchCV`` expose one), so
        reading it unconditionally always raised ``AttributeError``. The
        attribute is returned when the wrapped object provides it;
        otherwise the estimator's configured parameters are returned.
        """
        if hasattr(self.knn, 'best_params_'):
            return self.knn.best_params_
        return self.knn.get_params()

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        return self.knn.predict(X)
'''***************************************************************************************************************************'''

'''Opening gzipped JSON file containing Yelp data (one record per line) and reading lines'''
# Read every line of the gzipped Yelp dump. Plain 'rb' is sufficient: the file
# is only read, and 'rb+' would open the underlying file for writing as well.
# The with-block closes the handle even if reading fails.
with gzip.open('yelp_dataset.json.gz', 'rb') as yelp_dataset_Raw_txt:
    lines = yelp_dataset_Raw_txt.readlines()

# Output file for persisting the fitted model with dill (currently disabled).
#knn_model_txt = open('ml_Q2_knn_model.txt', 'wb+')

# Parse one JSON document per line. Lines returned by readlines() keep their
# trailing newline, so a bare length check never filters anything; strip() is
# what actually skips blank lines, which json.loads would reject.
dict_list = [json.loads(line) for line in lines if line.strip()]

'''Initializing list with Yelp data into a pandas data frame for manipulation'''
# Load the parsed records into a DataFrame for column-wise manipulation.
data_frame = pandas.DataFrame(dict_list)

# Features are the longitude/latitude columns; the target is the 'stars' column.
X = data_frame[['longitude', 'latitude']]
y = data_frame['stars']

# Hold out 40% of the rows as a test set. This is a single train/test split,
# not k-fold cross-validation; random_state pins the shuffle so runs repeat.
# NOTE(review): train_test_split is imported from sklearn.cross_validation,
# which newer scikit-learn releases replace with sklearn.model_selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fit the KNN wrapper on the training rows only.
knn_model = mlm_KNN()
knn_model = knn_model.fit(X_train, y_train)

#print(knn_model.best_params_())
# score() is inherited from RegressorMixin. Calling print with one
# parenthesised argument gives the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(knn_model.score(X_train, y_train))
print(knn_model.score(X_test, y_test))

'''Dumping machine learning model object into txt file'''
#dill.dump(knn_model, knn_model_txt)

'''Closing txt files'''
#knn_model_txt.close()