# ML Pipeline & Flask Deployment

In [1]:
import joblib
import numpy as np 
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import time
import string
import flask

In [2]:
# Load the saved model once; result() calls lgbm_model.predict on the engineered features.
# (The name suggests a LightGBM model — confirm against the training notebook.)
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so only
# load model artifacts that come from a trusted source.
lgbm_model = joblib.load("false_review_model.sav")

In [3]:
def punc_remove(text):
    """
    Strip punctuation from a piece of text.

    Every character of ``text`` that appears in ``string.punctuation`` is
    dropped; all other characters (letters, digits, whitespace) are kept in
    their original order and returned as a single string.
    """
    unwanted = frozenset(string.punctuation)
    kept_chars = (ch for ch in text if ch not in unwanted)
    return "".join(kept_chars)

def tokenise(text):
    """
    Lower-case the text and split it into word tokens with nltk's word_tokenize.

    Returns whatever word_tokenize returns for the lower-cased text.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

def pos_tokens(tokens):
    """
    POS-tag a list of tokens and convert each tag to the form the lemmatizer takes.

    The tokens are tagged with nltk.pos_tag, then every tag is replaced by the
    matching WordNet constant based on its first letter:
    J -> wordnet.ADJ, V -> wordnet.VERB, N -> wordnet.NOUN, R -> wordnet.ADV.
    Any other tag becomes the empty string.

    Returns a list of (word, tag) tuples, one per input token.
    """
    # First letter of the nltk tag -> tag accepted by the lemmatizer
    wordnet_tag_for = {
        "J": wordnet.ADJ,   # adjectives
        "V": wordnet.VERB,  # verbs
        "N": wordnet.NOUN,  # nouns
        "R": wordnet.ADV,   # adverbs
    }

    tagged = nltk.pos_tag(tokens)

    return [(word, wordnet_tag_for.get(tag[:1], "")) for word, tag in tagged]

# Module-level lemmatizer instance; get_all_lemmas calls lm.lemmatize(word, tag) on it.
lm = WordNetLemmatizer()

def get_all_lemmas(df, pos_tokens):
    """
    Lemmatize every POS-tagged sentence and store the result in df["lemmatized"].

    Stop words (nltk's English list) are dropped. Words tagged "a", "r", "n"
    or "v" are replaced by their lemma; every other non-stop word is kept
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new "lemmatized" column (modified in place).
    pos_tokens : iterable
        One entry per row of df, each an iterable of (word, tag) pairs.
        Note: inside this function the name refers to this argument, not to
        the module-level pos_tokens() function.

    Returns
    -------
    pandas.Series
        A random sample of at most 5 entries of df["lemmatized"], intended
        for a quick visual check.
    """
    # A set gives constant-time membership tests; the check runs once per token.
    stop = set(stopwords.words("english"))
    # Only adjectives, adverbs, nouns and verbs have a lemma form
    lemma_tags = {"a", "r", "n", "v"}

    lemmatized_list = []

    # Loop through all tagged sentences
    for sentence in pos_tokens:
        sent = []
        for word, tag in sentence:
            if word in stop:
                continue
            if tag in lemma_tags:
                sent.append(lm.lemmatize(word, tag))
            else:
                # Keeping non-stop words as they are
                sent.append(word)

        lemmatized_list.append(sent)

    # Adding lemmas to dataframe
    df["lemmatized"] = lemmatized_list

    # sample(5) raises ValueError when the frame has fewer than 5 rows,
    # so cap the sample size at the number of rows available.
    lemmas = df["lemmatized"]
    return lemmas.sample(min(5, len(lemmas)))

In [4]:
def cleaning(df):
    """
    Run all text-cleaning stages on df, adding one column per stage.

    Raw_text -> raw_sentence (punctuation removed)
             -> token_sentence (lower-cased word tokens)
             -> pos_tagged_sentence ((word, tag) pairs)
             -> lemmatized (written by get_all_lemmas)

    df is modified in place and nothing is returned.
    """
    # Punctuation-free copy of the raw review text
    df["raw_sentence"] = df["Raw_text"].apply(punc_remove)

    # Word tokens for each review
    df["token_sentence"] = df["raw_sentence"].apply(tokenise)

    # POS tags in the form the lemmatizer accepts
    df["pos_tagged_sentence"] = df["token_sentence"].apply(pos_tokens)

    # Adds the "lemmatized" column; the returned sample is not needed here
    get_all_lemmas(df, df["pos_tagged_sentence"])

In [5]:
def feature_engineering(df):
    """
    Build the feature table passed to the model from a cleaned review frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "lemmatized", "Rating" and "Verification".

    Returns
    -------
    pandas.DataFrame
        New frame on df's index with the columns, in this order:
        "word_counts", "char_counts", "rating", "verified".
    """
    feature_df = pd.DataFrame()

    # Both counts are taken on str(x), i.e. the printed form of each
    # "lemmatized" entry (for a list that includes brackets, quotes and commas).
    # NOTE(review): presumably this mirrors how the model's training features
    # were computed — confirm before changing either formula.

    # Word count: whitespace-separated pieces of the printed form
    feature_df["word_counts"] = df["lemmatized"].apply(lambda x: len(str(x).split()))

    # Char count + whitespace
    feature_df["char_counts"] = df["lemmatized"].apply(lambda x: len(str(x)))

    feature_df["rating"] = df["Rating"]

    feature_df["verified"] = df["Verification"]

    return feature_df

In [6]:
def result(df):
    """
    Classify a cleaned review frame and reduce the predictions to one label.

    The features from feature_engineering(df) are passed to
    lgbm_model.predict; the per-row predictions are stored in df["Class"]
    (df is modified in place) and their mean decides the overall label.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned review frame accepted by feature_engineering.

    Returns
    -------
    str
        "Real" when the mean prediction is >= 0.5, otherwise "Fake".
        NOTE(review): this presumably relies on the model emitting 1 for
        genuine reviews — confirm against the training labels.

    Raises
    ------
    ValueError
        If the model returned no predictions, so no mean can be formed.
    """
    feature_df = feature_engineering(df)
    y_pred = lgbm_model.predict(feature_df)

    # Without this guard the mean of zero predictions is NaN, which matches
    # neither comparison and previously left the label unassigned.
    if len(y_pred) == 0:
        raise ValueError("No reviews were available to classify.")

    mean_prediction = y_pred.mean()
    df["Class"] = y_pred

    # A single if/else guarantees a label is always produced
    if mean_prediction >= 0.5:
        detection = "Real"
    else:
        detection = "Fake"

    return detection

In [7]:
# Execute the scraper notebook inside this notebook's namespace (-i), so the
# names it defines become available here. get_data() calls scraper_main, which
# is presumably defined by that notebook — confirm.
%run -i Amazon_Webscraper.ipynb

In [8]:
def get_data(given_url, start_page=1, end_page=3, delay=3, max_attempts=None):
    """
    Scrape reviews for a product URL and return them as a DataFrame.

    scraper_main is called repeatedly until it yields a non-empty result.

    Parameters
    ----------
    given_url : str
        URL handed to scraper_main.
    start_page, end_page : int
        Page range forwarded to scraper_main (defaults 1 and 3).
    delay : int or float
        Forwarded to scraper_main as ``delay`` (default 3); presumably the
        pause between requests in seconds — confirm in the scraper notebook.
    max_attempts : int or None
        Maximum number of scraping attempts. None (the default) keeps
        retrying until data arrives, which can loop forever if the scraper
        never returns any rows.

    Returns
    -------
    pandas.DataFrame
        Non-empty frame with the columns
        "Rating", "Title", "Date", "Verification", "Raw_text".

    Raises
    ------
    RuntimeError
        If max_attempts is set and every attempt produced an empty frame.
    """
    columns = ["Rating", "Title", "Date", "Verification", "Raw_text"]

    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        attempt += 1
        data = scraper_main(given_url = given_url,
                            start_page = start_page,
                            end_page = end_page,
                            delay = delay)

        review_data = pd.DataFrame(data, columns = columns)
        if not review_data.empty:
            return review_data

    raise RuntimeError(
        "No reviews could be scraped after {} attempt(s).".format(attempt)
    )

In [9]:
# Fetch the reviews for one sample product URL to try get_data on its own.
data = get_data("https://www.amazon.com/Magnetic-Pieces-Refrigerator-Whiteboard-Magnets/dp/B07GNC3JKN/?_encoding=UTF8&pd_rd_w=C1kL0&content-id=amzn1.sym.345ade0e-e021-4dab-a1e1-d299481e27a3&pf_rd_p=345ade0e-e021-4dab-a1e1-d299481e27a3&pf_rd_r=CVPH8BVWK571QFE671HM&pd_rd_wg=I64K6&pd_rd_r=c7bb3c30-774c-4fec-94e3-5cb9b57c7c6b&ref_=pd_gw_exports_top_sellers_unrec")

https://www.amazon.com/Magnetic-Pieces-Refrigerator-Whiteboard-Magnets/dp/B07GNC3JKN/?_encoding=UTF8&pd_rd_w=C1kL0&content-id=amzn1.sym.345ade0e-e021-4dab-a1e1-d299481e27a3&pf_rd_p=345ade0e-e021-4dab-a1e1-d299481e27a3&pf_rd_r=CVPH8BVWK571QFE671HM&pd_rd_wg=I64K6&pd_rd_r=c7bb3c30-774c-4fec-94e3-5cb9b57c7c6b&ref_=pd_gw_exports_top_sellers_unrec


In [10]:
# Display the scraped frame to check the columns the later stages read
# (Rating, Verification, Raw_text).
data

Unnamed: 0,Rating,Title,Date,Verification,Raw_text
0,5,Great product,"August 24, 2022",1,These magnets were great for the cruise ship r...
1,5,Strong magnets great for holding pencils or he...,"August 22, 2022",1,I used these in my classroom to hold pencils a...
2,5,Cute &amp; handy,"August 28, 2022",1,Cute magnets. Super handy for hanging stuff o...
3,5,Good basic usage clips!,"August 19, 2022",1,Work well for what I bought them for: hanging ...
4,5,Great,"August 30, 2022",1,Really sturdy.
5,5,Teacher must have!!,"August 8, 2022",1,"Perfect blend of colors, sturdy and able to ha..."
6,5,Great classroom clips!,"July 27, 2022",1,These classroom clips are sturdy and the magne...
7,5,Great clips!,"July 21, 2022",1,I am so impressed with the quality of these cl...
8,5,Great product,"August 24, 2022",1,These magnets were great for the cruise ship r...
9,5,Strong magnets great for holding pencils or he...,"August 22, 2022",1,I used these in my classroom to hold pencils a...


In [14]:
def pipeline(given_url):
    """
    Main pipeline for data extraction, cleaning, feature engineering, modelling and output.

    Parameters
    ----------
    given_url : str
        Product URL passed to get_data.

    Returns
    -------
    str
        The label returned by result() for the scraped reviews.

    Side effects
    ------------
    Prints a blank line followed by the elapsed time in seconds.
    """
    # perf_counter is a monotonic clock intended for measuring intervals
    start = time.perf_counter()

    review_data = get_data(given_url)

    # Adds the cleaning columns to review_data in place
    cleaning(review_data)

    detection = result(review_data)

    elapsed = time.perf_counter() - start

    print("\n", elapsed)

    return detection

In [15]:
# Run the whole pipeline on the sample product URL; it prints the elapsed
# seconds and the cell displays the returned label.
pipeline("https://www.amazon.com/Magnetic-Pieces-Refrigerator-Whiteboard-Magnets/dp/B07GNC3JKN/?_encoding=UTF8&pd_rd_w=C1kL0&content-id=amzn1.sym.345ade0e-e021-4dab-a1e1-d299481e27a3&pf_rd_p=345ade0e-e021-4dab-a1e1-d299481e27a3&pf_rd_r=CVPH8BVWK571QFE671HM&pd_rd_wg=I64K6&pd_rd_r=c7bb3c30-774c-4fec-94e3-5cb9b57c7c6b&ref_=pd_gw_exports_top_sellers_unrec")

https://www.amazon.com/Magnetic-Pieces-Refrigerator-Whiteboard-Magnets/dp/B07GNC3JKN/?_encoding=UTF8&pd_rd_w=C1kL0&content-id=amzn1.sym.345ade0e-e021-4dab-a1e1-d299481e27a3&pf_rd_p=345ade0e-e021-4dab-a1e1-d299481e27a3&pf_rd_r=CVPH8BVWK571QFE671HM&pd_rd_wg=I64K6&pd_rd_r=c7bb3c30-774c-4fec-94e3-5cb9b57c7c6b&ref_=pd_gw_exports_top_sellers_unrec

 11.114157438278198


'Real'

In [16]:
# start flask
app = flask.Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def main_page():
    """
    Landing page with the URL form.

    GET renders index.html. POST reads the 'url' form field; when it holds a
    non-blank value the browser is redirected to the prediction page with the
    URL in the query string, otherwise index.html is rendered again.
    """
    if flask.request.method == 'POST':
        given_url = (flask.request.form.get('url') or '').strip()
        if given_url:
            # The URL travels with the request instead of a module-level
            # global, so concurrent users cannot overwrite each other's input.
            return flask.redirect(flask.url_for('prediction', url=given_url))
        # TODO: tell the user the URL could not be read (e.g. via flask.flash)
    return flask.render_template('index.html')

@app.route('/prediction/')
def prediction():
    """
    Run the pipeline for the URL given in the 'url' query parameter and
    render prediction.html with the resulting label as ``prediction``.

    Requests without a usable 'url' parameter are redirected to the main page.
    """
    given_url = (flask.request.args.get('url') or '').strip()
    if not given_url:
        # Page opened directly, without submitting the form first
        return flask.redirect(flask.url_for('main_page'))

    # NOTE(review): the URL is user supplied and handed to the scraper as is;
    # consider restricting it to the expected host(s) before fetching.
    detection = pipeline(given_url)
    return flask.render_template('prediction.html', prediction = detection)

In [None]:
# Start Flask's built-in development server; the cell keeps running until the
# server is stopped.
# use_reloader=False: presumably set because the auto-reloader does not work
# from inside a notebook kernel — confirm.
# NOTE(review): debug=True enables the interactive debugger, which must never
# be exposed outside local development.
app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Running on http://127.0.0.1:5000 (Press CTRL+C to quit)
127.0.0.1 - - [21/Sep/2022 00:52:33] "GET / HTTP/1.1" 200 -


# Sample links 
## Fake 
1. https://www.amazon.com/Curing-Jewelry-Casting-Setting-Powered/product-reviews/B07X48MKZF/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2

2. https://www.amazon.com/300-FPS-Airsoft-Powered-Digital/product-reviews/B075MSNJXZ/ref=cm_cr_unknown?ie=UTF8&reviewerType=all_reviews&pageNumber=1&filterByStar=five_star

3. https://www.amazon.com/BAOFENG-uv-5r-BaoFeng-UV-5R-Ham-Radio-BaoFeng-Radio-with-Extra-1800mAh-Battery-and-TIDRADIO-771-Antenna-Dual-Band-Ham-Radio-Handheld-Includes-Full-Kit-BaoFeng-Walkie-Talkie/product-reviews/B0925XWVS8/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&pageNumber=1&filterByStar=five_star