# In [None]:
'''Importing modules'''
import pandas
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
import json
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
from sklearn.linear_model import Ridge, LinearRegression, Lasso, SGDRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pickle
import dill
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from nltk.corpus import stopwords
from sys import getsizeof
import re
from sklearn.pipeline import Pipeline, FeatureUnion

'''Setting up the loop counters and accumulator lists used further down'''
# The three counters all start at zero; ints are immutable, so a single
# chained assignment is equivalent to three separate ones.
iter_lines = iter_lower_split = a = 0
# Each accumulator must be its own list object, so they are created one by one.
dict_list = list()
text_list_lower_split = list()

'''Defining a scikit-learn compatible wrapper around a Ridge regression model'''
class mlm_RidR(BaseEstimator, RegressorMixin):
    '''Ridge regression wrapped as a scikit-learn regressor.

    Parameters
    ----------
    alpha : float, optional (default 150)
        Regularization strength passed to the wrapped ``Ridge`` model.

    Attributes
    ----------
    rr : Ridge
        The wrapped model. It is created in ``__init__`` and fitted by
        ``fit``.
    '''
    def __init__(self, alpha = 150):
        # The value is stored under the constructor argument's own name so
        # that BaseEstimator.get_params()/set_params() -- and therefore
        # clone() and GridSearchCV -- can see and tune it.
        self.alpha = alpha
        self.rr = Ridge(alpha = alpha)

    def fit(self, X, y):
        '''Fit the wrapped Ridge model on X, y and return self.'''
        # set_params() on this wrapper only rebinds self.alpha, so push the
        # current value into the wrapped model before fitting.
        self.rr.set_params(alpha = self.alpha)
        self.rr.fit(X, y)
        return self

    def best_params_(self):
        '''Return the hyper-parameters in use as a dict.

        A plain ``Ridge`` model has no ``best_params_`` attribute, so
        reading it unconditionally always raised AttributeError. The
        wrapped attribute is still returned if it exists; otherwise the
        alpha this wrapper was configured with is reported.
        '''
        return getattr(self.rr, 'best_params_', {'alpha': self.alpha})

    def predict(self, X):
        '''Return the wrapped model's predictions for X.'''
        return self.rr.predict(X)

'''Opening txt file containing Yelp review data and reading all lines'''
# Nothing is written back to the file, so it is opened read-only; the
# previous 'rb+' mode additionally demanded write permission. The with-block
# closes the file even if reading raises.
with open('yelp_train_academic_dataset_review.txt', 'rb') as yelp_dataset_txt:
    raw_lines = yelp_dataset_txt.readlines()

'''Writing raw_lines to a list using json.loads to parse'''
# readlines() keeps each line terminator, so a blank line still has a
# non-zero length. Test the stripped line instead, otherwise json.loads
# would fail on blank lines.
dict_list = [json.loads(raw_line) for raw_line in raw_lines if raw_line.strip()]

'''Initializing list with Yelp data into a pandas data frame for manipulation'''
review_frame = pandas.DataFrame(dict_list)
# data_frame keeps its previous shape: a {column: {row index: value}} dict.
data_frame = review_frame.to_dict()

'''Initializing stars and text lists'''
# Read straight from the frame's columns: this yields real lists in row
# order, instead of relying on the ordering (and, on Python 3, the missing
# indexability) of dict.values().
stars_list = review_frame['stars'].tolist()
text_list = review_frame['text'].tolist()

'''Initializing stopwords and adding new stopwords to the list'''
stop_words = stopwords.words("english")
stop_words_additions = ['ll', 'adwmuxsza', 'zu', 'abc', 'aac', 'aardbark', 'aabc', 'aab', 'aaa', 'aa', 'that', 'youre', 'zzcrkebcrfxbb', 'zse', 'i\'m', 'he\'s', 'i\'ve', 'it\'s', 'id', 'im', 'hes', 'ive', 'its']
stop_words.extend(stop_words_additions)

# Membership is tested once per token of every review. A list is scanned
# entry by entry on each test, whereas a frozenset gives the same answer
# with a hash lookup.
stop_word_lookup = frozenset(stop_words)
# Compiled once here rather than looked up on every loop iteration.
term_pattern = re.compile('[a-z]{2,}')

'''Looping through text_list converting to lower case and splitting into individual terms'''
text_list_lower_split = []
for review_text in text_list:
    # Stop words are removed from the whitespace-separated, lower-cased
    # tokens first ...
    kept_words = [w for w in review_text.lower().split() if w not in stop_word_lookup]
    # ... and only afterwards are runs of two or more a-z letters extracted,
    # which drops digits, punctuation and single letters.
    terms = term_pattern.findall(' '.join(kept_words))
    text_list_lower_split.append(' '.join(terms))
    
'''Initializing analytical model axes'''
# Features are the cleaned review strings, targets are the star values.
X, y = text_list_lower_split, stars_list

'''Splitting axes variables into train and test sets'''
# 40% of the samples are held out for testing; the fixed random_state makes
# the split repeatable between runs.
split_parts = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42
)
X_train, X_test, y_train, y_test = split_parts
    
'''Building the text feature extractors'''
# Both vectorizers share the same settings. With float values, min_df and
# max_df are document-frequency proportions: terms found in fewer than 0.1%
# or in more than 40% of the documents are ignored.
vectorizer_options = dict(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, min_df = 0.001, max_df = 0.4)
count_vectorizer = CountVectorizer(**vectorizer_options)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

# Raw counts and tf-idf weights are concatenated side by side.
vect_union = FeatureUnion([('count_vectorizer', count_vectorizer), ('tfidf_vectorizer', tfidf_vectorizer)])

# Pipeline.fit() fits every step itself, so fitting the union beforehand
# only vectorized the training set a second time. fit() returned the union
# object itself, so the name simply aliases it now.
text_features = vect_union

ridge = mlm_RidR()

'''Chaining feature extraction and regression, then fitting on the training set'''
pipe_line = Pipeline([('text_features', text_features), ('ridge_regression', ridge)])

pipe_line.fit(X_train, y_train)

'''Reporting the score on the training set and on the held-out test set'''
# Single-argument print(...) produces the same output under the Python 2
# print statement and the Python 3 print function.
print(pipe_line.score(X_train, y_train))
print(pipe_line.score(X_test, y_test))