In [1]:
import gzip,json
import pandas as pd
import numpy as np

In [2]:
with open('AMAZON_FASHION.npy', 'rb') as f:
    a = np.load(f, allow_pickle=True)

In [3]:
df_data=pd.DataFrame(a,columns=['asin', 'overall', 'reviewText', 'reviewTime', 'reviewerID',
       'reviewerName', 'summary', 'unixReviewTime', 'verified', 'vote',
       'style', 'image'
])
df_data.head()

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,verified,vote,style,image
0,7106116521,5,Exactly what I needed.,"10 20, 2014",A1D4G1SNUZWQOT,Tracy,perfect replacements!!,1413760000.0,1,,,
1,7106116521,2,"I agree with the other review, the opening is ...","09 28, 2014",A3DDWDH9PX2YX2,Sonja Lau,"I agree with the other review, the opening is ...",1411860000.0,1,3.0,,
2,7106116521,4,Love these... I am going to order another pack...,"08 25, 2014",A2MWC41EW7XL15,Kathleen,My New 'Friends' !!,1408920000.0,0,,,
3,7106116521,2,too tiny an opening,"08 24, 2014",A2UH2QQ275NV45,Jodi Stoner,Two Stars,1408840000.0,1,,,
4,7106116521,3,Okay,"07 27, 2014",A89F3LQADZBS5,Alexander D.,Three Stars,1406420000.0,0,,,


In [4]:
def dataHandler(df,row):
    return (str(df['reviewText'].iloc[row]),{'cats': {'neg': (3-int(df['overall'].iloc[row]))/3, 'pos': (int(df['overall'].iloc[row])-3)/3}})

from sklearn.utils import shuffle
def load_training_data(df,split):
    # Load from files
    df = shuffle(df)
    datas=[]
    for i in range(len(df)):
        datas.append(dataHandler(df_data,i))

    split = int(len(datas) * split)
    return datas[:split], datas[split:]

training_data, testing_data =load_training_data(df_data,0.8)

In [5]:
import random
import spacy
from spacy.util import minibatch, compounding

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en") # Change this for loading medium or large models
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers
        for i in range(iterations):
            print(f"Training iteration {i}")
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )

    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")

In [6]:
def evaluate_model(
    tokenizer, textcat, test_data: list
) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]['cats']
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels. You can get all
            # the info you need with just the pos label.
            if (
                predicted_label == "neg"
            ):
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [8]:
train_model(training_data, testing_data, 12)#<--pls uncomment if need to do training
#from IPython.display import Image 
#pil_img = Image(filename='training_steps.jpeg')
#display(pil_img)

Beginning training
Loss	Precision	Recall	F-score
Training iteration 0
30.07132751822064	0.9999999999998764	0.9999999999998764	0.9999999999998764
Training iteration 1
8.248354661598569	0.9999999999998772	0.9999999999998772	0.9999999999998772
Training iteration 2
8.194442044317839	0.9999999999998792	0.9999999999998792	0.9999999999998792
Training iteration 3
8.168050613559899	0.9999999999998783	0.9999999999998783	0.9999999999998783
Training iteration 4
8.152934108133195	0.9999999999998777	0.9999999999998777	0.9999999999998777
Training iteration 5
8.142768362420611	0.9999999999998769	0.9999999999998769	0.9999999999998769
Training iteration 6
8.136201839210116	0.9999999999998763	0.9999999999998763	0.9999999999998763
Training iteration 7
8.132240991064464	0.9999999999998753	0.9999999999998753	0.9999999999998753
Training iteration 8
8.128194896766217	0.9999999999998759	0.9999999999998759	0.9999999999998759
Training iteration 9
8.127085898100631	0.9999999999998755	0.9999999999998755	0.99999999

In [9]:
# Load saved model
loaded_model = spacy.load("model_artifacts") 
TEST_REVIEW = "Exactly what I needed"

# Test predictions for reviews
def test_model(input_data: str = TEST_REVIEW):
    #  Load saved trained model
    loaded_model = spacy.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
        overall =score*3+3
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
        overall =3-score*3
    print(f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}")
    
    print('overall:%s'%overall)
    
test_model(TEST_REVIEW)

Review text: Exactly what I needed
Predicted sentiment: Positive	Score: 0.6154553294181824
overall:4.846365988254547
