# In [3]:
'''+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'''
'''Importing modules'''
import pandas
import numpy as np
import json
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
'''***************************************************************************************************************************'''

# Load the raw Yelp records; each non-blank line is parsed as one JSON document.
# The file is only read, so it is opened read-only, and the context manager
# guarantees the handle is closed even if reading fails.
with open('yelp_dataset_Raw.txt', 'r') as f:
    line = f.readlines()

# Lines returned by readlines() keep their trailing newline, so a plain length
# check never filters anything; strip() is what actually skips blank lines
# (which json.loads would otherwise reject).
dict_list = [json.loads(raw_line) for raw_line in line if raw_line.strip()]

# Build a pandas data frame from the parsed records for manipulation
data_frame = pandas.DataFrame(dict_list)

# Extract the target ('stars') and the single feature ('latitude') as arrays.
# `.values` replaces `as_matrix()`, which is deprecated and was removed in
# pandas 1.0; `.values` is available on old and new pandas alike.
stars_mx = data_frame['stars'].values
latitude_mx = data_frame['latitude'].values

# Hold out 40% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
stars_train, stars_test, latitude_train, latitude_test = train_test_split(
    stars_mx, latitude_mx, test_size=0.4, random_state=42)

# Turn the 1-D latitude arrays into single-column 2-D arrays before they are
# handed to the estimators below.
latitude_train = latitude_train.reshape(-1, 1)
latitude_test = latitude_test.reshape(-1, 1)

'''+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'''
'''Initializing class wrapping a machine learning model (KNN); the GridSearchCV parameter search is currently commented out'''
class mlm_KNN(BaseEstimator, RegressorMixin):
    """Wrapper that delegates fitting and prediction to a KNeighborsRegressor.

    The wrapped estimator is stored in ``self.mlm``.  No ``score`` method is
    defined here; callers rely on the one inherited from the base classes.
    """

    def __init__(self):
        # Plain regressor with library defaults.  The grid-search variant is
        # disabled; it was:
        # GridSearchCV(KNeighborsRegressor(), {'n_neighbors': range(100, 200, 10)})
        self.mlm = KNeighborsRegressor()

    def fit(self, x, y):
        """Fit the wrapped model on features ``x`` and targets ``y``.

        Returns ``self`` so calls can be chained.
        """
        model = self.mlm
        model.fit(x, y)
        return self

    def best_params_(self):
        """Return the ``best_params_`` attribute of the wrapped model.

        NOTE(review): the call site marks this as only usable with the
        GridSearchCV variant; with the plain regressor the attribute lookup
        presumably raises AttributeError -- confirm before calling.
        """
        model = self.mlm
        return model.best_params_

    def predict(self, x):
        """Return the wrapped model's predictions for features ``x``."""
        predictions = self.mlm.predict(x)
        return predictions
'''***************************************************************************************************************************'''

'''+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'''
'''Initializing class wrapping a machine learning model (RFR); the GridSearchCV parameter search is currently commented out'''
class mlm_RFR(BaseEstimator, RegressorMixin):
    """Wrapper that delegates fitting and prediction to a RandomForestRegressor.

    The wrapped estimator is stored in ``self.mlm``.  No ``score`` method is
    defined here; any scoring comes from the inherited base classes.
    """

    def __init__(self):
        # Plain forest with library defaults.  The grid-search variant is
        # disabled; it was:
        # GridSearchCV(RandomForestRegressor(),
        #              {'n_estimators' : range(100, 1300, 100), 'max_depth' : range(1, 3, 1)})
        self.mlm = RandomForestRegressor()

    def fit(self, x, y):
        """Fit the wrapped model on features ``x`` and targets ``y``.

        Returns ``self`` so calls can be chained.
        """
        model = self.mlm
        model.fit(x, y)
        return self

    def best_params_(self):
        """Return the ``best_params_`` attribute of the wrapped model.

        NOTE(review): presumably only available when ``self.mlm`` is the
        GridSearchCV variant; with the plain regressor the attribute lookup
        likely raises AttributeError -- confirm before calling.
        """
        model = self.mlm
        return model.best_params_

    def predict(self, x):
        """Return the wrapped model's predictions for features ``x``."""
        predictions = self.mlm.predict(x)
        return predictions
'''***************************************************************************************************************************'''

'''+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'''
'''Initializing class finding the mean star rating of cities'''
class answer_Q1:
    """Helper that computes the mean star rating per city."""

    def groupby_mean(self, data_frame):
        """Group ``data_frame`` by its 'city' column and average 'stars'.

        Returns the result of ``mean()`` on the grouped 'stars' column, i.e.
        one mean value per distinct city.
        """
        stars_by_city = data_frame.groupby(['city'])['stars']
        return stars_by_city.mean()
'''***************************************************************************************************************************'''

# Fit the KNN wrapper on the training split, then print its score on the
# training split followed by the test split (R^2, per the original note).
knn = mlm_KNN()
knn.fit(latitude_train, stars_train)
# A parenthesised single-argument print produces the same output under
# Python 2 and is also valid Python 3 syntax.
print(knn.score(latitude_train, stars_train))
print(knn.score(latitude_test, stars_test))
#print(knn.best_params_()) #Only used with GridSearchCV in class

# Disabled code, kept verbatim as a bare string.  As the last expression of
# the notebook cell its value is echoed in the cell output.
'''Calling answer_Q1 to find the mean star rating for each city grouping by city
city_mean = answer_Q1()
print city_mean.groupby_mean(data_frame)'''

0.217586135501
-0.149116985219


'Calling answer_Q1 to find the mean star rating for each city grouping by city\ncity_mean = answer_Q1()\nprint city_mean.groupby_mean(data_frame)'