In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from datetime import datetime as dt
from sklearn.feature_extraction.text import HashingVectorizer
import re
import joblib

In [2]:
os.listdir()

['04-create-server-model.ipynb',
 '01-preprocessing.ipynb',
 '.ipynb_checkpoints',
 '02-prototype.ipynb',
 '03-final-model.ipynb']

In [8]:
def tokenizer(text):
    """Removing http link and non word character at the beginning"""
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)
    text = re.sub(r'http[s]?://\S+', '', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text.split()

In [3]:
def data_reader(path):
    """Load processed data and split in to train and test."""
    df = pd.read_csv(path)
    return df

In [4]:
def train_test_split(df, pct):
    train_rows = int(df.shape[0]*(1-pct))
    x_train = df.loc[:train_rows, 'review_text'].values
    y_train = df.loc[:train_rows, 'pruned_rating'].values
    x_test = df.loc[train_rows:, 'review_text'].values
    y_test = df.loc[train_rows:, 'pruned_rating'].values
    return x_train, y_train, x_test, y_test

In [18]:
def version_by_date(path: str, extension: str):
    """Select the latest file form a dictionary.
    :parameter
        path: root path
        extension: file extension
    """
    file_pattern = "." + extension
    dir_list = [
        item.split("_")[1] for item in os.listdir(path) if "version" in item
    ]
    versions = [dt.strptime(item.replace(file_pattern, ""), '%Y-%m-%d').date() for item in dir_list]
    latest_version = "version_{}.{}".format(max(versions), extension)
    return os.path.join(path, latest_version)

In [19]:
latest_data = version_by_date("../datalake/feed/", "csv")

In [21]:
df = data_reader(latest_data)

In [28]:
X_train, y_train, X_test, y_test = train_test_split(df, 0.7)

In [25]:
X_train[:1]

array(['easily installed on my parlor guitar the oval shape helps retain the strap and looks kewl installation is not rocket science '],
      dtype=object)

In [35]:
def trainer(doc, label):
    tf_idf = TfidfVectorizer(
        strip_accents=None,
        lowercase=False,
        preprocessor=None,
        ngram_range=(1, 2),
        stop_words=None,
        tokenizer=tokenizer

    )
    param_space = {
        "random_state": 1,
        "solver": 'liblinear',
        "C": 100.0,
        "penalty": 'l2',
        "n_jobs":-1
    }
    pipe = Pipeline([
        ('vect', tf_idf),
        ('clf', LogisticRegression(**param_space))
    ])
    pipe.fit(doc, label)
    return pipe

In [36]:
classifier = trainer(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


In [37]:
classifier

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...b\\w\\w+\\b',
                                 tokenizer=<function tokenizer at 0x7f1762b42320>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=100.0, class_weight=None, dual=False,
                                    fit_

In [38]:
classifier.score(X_test, y_test)

0.9669871117684344

In [42]:
joblib.dump(classifier, os.path.join("../modellake/temp/", "lr_classifier.pkl"))

['../modellake/temp/lr_classifier.pkl']

In [43]:
classifier = joblib.load(os.path.join("../modellake/temp/", "lr_classifier.pkl"))

In [44]:
classifier.score(X_test, y_test)

0.9669871117684344

In [51]:
classifier.predict(["low quality"])

array([2])

In [66]:
for x, y in zip(X_test[:10], y_test[:10]):
    print("{} ---> {} ---> {}".format(x[:200], classifier.predict([x])[0], y ) ) 

update 15 july 2013i still really love how these assist in quick tuning but man are they flimsy all of my other musician friends who also love this tuner s ability to grab the right pitch also have em ---> 1 ---> 1
for any usable tones while playing live the best bet is to plug it into a tube power amp direct to mixer or running it through a solid state amp is very uninspiring i didn t like the recorded tones ei ---> 0 ---> 0
this is a great capo clamps well and quickly the clamp handle is easy to reach and is not going to easily break aluminum not plastic i like it and it was well under the price of others  ---> 2 ---> 2
this is a really solid table mic stand and you re not going to find a better price i looked the base is solid and heavy and the actual stand is a sturdy metal tube i liked the first one enough that i  ---> 2 ---> 2
this seems to have been a rare case but still i had my ukulele on is and it snapped i m worried what might happen with a guitar or a bass ---> 0 ---> 0
i l

# Data Loader and Data Saver

In [4]:
def version_by_number(path):
    """Select the latest file form a dictionary.
    :parameter
        path: root path
    """
    dir_list = [
        item.split("_")[1] for item in os.listdir(path) if "model" in item
    ]
    if len(dir_list) > 0:
        versions = [int(item) for item in dir_list]
        latest_version = "model_{}".format(max(versions))
        return latest_version
    else:
        raise Exception("There is no model")

In [74]:
def saver(path, model):
    files = [item for item in os.listdir(path) if item != "temp"]
    try:
        if len(files) > 0:
            versions = [int(item.split("_")[1]) for item in files]
            new_version = "model_{}".format(max(versions) + 1)
            file_path = os.path.join(model_lake, new_version)
            os.mkdir(file_path)
            joblib.dump(model, os.path.join(file_path, "classifier.pkl"))
            print("Model Saved: ".format(file_path))
        else:
            file_path = os.path.join(model_lake, "model_1")
            os.mkdir(file_path)
            joblib.dump(model, os.path.join(file_path, "classifier.pkl"))
    except Exception as e:
        raise "Error Occurs: {}".format(e)

In [75]:
saver("../modellake/")

In [76]:
os.listdir("../modellake/")

['model_1', 'temp']

In [2]:
def loader(path, version=None):
    if version:
        model_path = os.path.join(path, "model_".format(version))
        model = joblib.load(os.path.join(model_path, "classifier.pkl"))
        return model
    else:
        latest_model = os.path.join(path, version_by_number(path))
        model = joblib.load(os.path.join(latest_model, "classifier.pkl"))
        return model

In [9]:
classifier = loader("../modellake/")

In [10]:
classifier.score(X_test, y_test)

NameError: name 'X_test' is not defined