## Importing Libraries

In [8]:
import pandas as pd
import numpy as np
import re
import string
import dill
import swifter

%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

## Importing Dataset

In [14]:
# df = pd.read_csv("data.csv.gz", compression="gzip")
test = pd.read_csv("incentivized_examples.csv")

<IPython.core.display.Javascript object>

In [10]:
data = df.copy()

<IPython.core.display.Javascript object>

## Importing NLTK library and defining Lemmatization function

In [11]:
import nltk

nltk.download("averaged_perceptron_tagger")
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()


def lemmatize_sentence(sentence):
    nltk_tagged = nltk.word_tokenize(sentence)
    res_words = []
    for word in nltk_tagged:
        res_words.append(lemmatizer.lemmatize(word))
    return " ".join(res_words)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dlwss\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


<IPython.core.display.Javascript object>

## Defining preprocessing function
### Lowercasing the characters
### Removing leading and trailing white spaces
### Lemmatize the Review body

In [12]:
def preprocess(var):
    var = var.dropna(subset=["review_body"])
    var["review_body"] = var["review_body"].str.lower()
    var["review_body"] = var["review_body"].str.strip()
    var["review_body"] = var["review_body"].swifter.apply(
        lambda x: (lemmatize_sentence(x))
    )
    return var

<IPython.core.display.Javascript object>

## Running preprocessing of main data set and incentivized examples set

In [15]:
# data = preprocess(data)
test = preprocess(test)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1022, style=ProgressStyle(description_widt…




<IPython.core.display.Javascript object>

In [113]:
test.to_csv("incentivized_lemma.csv")

<IPython.core.display.Javascript object>

## List of combined Regular Expressions

In [59]:
regex_list = "|".join(
    [
        r"(\btest\b|\breceiv(e|ed|es)\b|suppli)[^\.;!]{0,90}(evaluat(e|ion)|review|sample|(?<![-])\bfree\b(?![-])|\bno (cost|charge)\b)",
        r"\Awow !",
        r"^as usual \bi\b receive[^\.]*?(?<![-])\bfree\b(?![-])",
        r"exchange[^\.!]*?(review|opinion|assessment)",
        r"(honest|unbias|asked(?!\b his\b|\b her\b|\b him\b))[^\.!]{0,40}(opinion|review|(evaluat(e|ion)))",
        r"(honest|unbias)[^\.!]{0,10}results",
        r"(provid(e|ed)|\bsen(d|t)\b|offered|\breceiv(e|ed|es)\b)[^\.!?]{0,50}(evaluat(e|ion)|review|sample|\btest(|ing)\b)",
        r"unu \bsen(d|t)\b me .*?",
        r"reach[^\.!]{0,40}offer[^\.!]{0,40}(\btry\b|\btest\b|trial|sample|review|evaluat(e|ion))",
        r"(sample)[^\.!]{0,40}(\btest\b|\breceive(|d|s)\b|provide|give|offer|\bsen(d|t)\b|supplied)[^\.!]{0,40}(review)",
        r"(review)[^\.!]{0,40}(\btest\b|\breceive(|d|s)\b|provide|give|offer|\bsen(d|t)\b|supplied)[^\.!]{0,40}(sample)",
        r"(disclaimer|disclosure)[^\.!]*?(review|sample|(?<![-])\bfree\b(?![-])|\btest(|ing)\b|program)",
        r"((?<![-])\bfree\b(?![-])|offe(r|red)|\bno (cost|charge|obligation)\b)[^\.!\d]{0,20}(review|sample|\btest(|ing)\b|trial|\bpromo(|tion)\b)",
        r"(review|sample|\btest\b|trial|\bpromo(|tion)\b)[^\.!]{0,20}((?<![-])\bfree\b(?![-])|offe(r|red)|\bno (cost|charge|obligation)\b)",
        r"(?<![-])\bfree\b(?![-])[^\.!]{0,40}thank[^\.!]{0,20}unu",
        r"(review(ed|)|(\bget chance try\b))[^\.!]{0,30}(sample)",
        r"sample[^\.!]{0,20}(provid(ed|e)|review)",
        r"thank[^\.!]*?(\breview\b|provid(ed|e|ing)|\bsen(d|t|ding)\b)[^\.!]*?(product|sample|(?<![-])\bfree\b(?![-])|\btes\bt|program|\bunit\b|case|pack)",
        r"(g(e|o)t|give|\btry\b)[^\.!]{0,40}(sample|stipulation)",
        r"(g(e|o)t|give|\btry\b|provide)[^\.!]{0,20}(\bitem\b|device|product|chance)[^\.!]{0,20}(\breview\b|\btest(|ing)\b|from|\btry\b|(?<![-])\bfree\b(?![-]))",
        r"(g(e|o)t|give|\btry\b|provide)[^\.!]{0,20}\bfree (from|to try)\b",
        r"(professional reviewer|utry|product tester|amazon vine)",
        r"\bsample from\b",
        r"compl(i|e)mentary[^\.!]{0,20}(\bitem\b|product|review|sample|(?<![-])\bfree\b(?![-])|\btest\b|program|\bunit\b)",
        r"\breceive(|d|s)\b[^\.!]{0,40}(\bfrom\b|through|marketing)[^\.!]{0,30}(program|company|campaign|maxboost)",
    ]
)

<IPython.core.display.Javascript object>

In [60]:
print(regex_list)

(\btest\b|\breceiv(e|ed|es)\b|suppli)[^\.;!]{0,90}(evaluat(e|ion)|review|sample|(?<![-])\bfree\b(?![-])|\bno (cost|charge)\b)|\Awow !|^as usual \bi\b receive[^\.]*?(?<![-])\bfree\b(?![-])|exchange[^\.!]*?(review|opinion|assessment)|(honest|unbias|asked(?!\b his\b|\b her\b|\b him\b))[^\.!]{0,40}(opinion|review|(evaluat(e|ion)))|(honest|unbias)[^\.!]{0,10}results|(provid(e|ed)|\bsen(d|t)\b|offered|\breceiv(e|ed|es)\b)[^\.!?]{0,50}(evaluat(e|ion)|review|sample|\btest(|ing)\b)|unu \bsen(d|t)\b me .*?|reach[^\.!]{0,40}offer[^\.!]{0,40}(\btry\b|\btest\b|trial|sample|review|evaluat(e|ion))|(sample)[^\.!]{0,40}(\btest\b|\breceive(|d|s)\b|provide|give|offer|\bsen(d|t)\b|supplied)[^\.!]{0,40}(review)|(review)[^\.!]{0,40}(\btest\b|\breceive(|d|s)\b|provide|give|offer|\bsen(d|t)\b|supplied)[^\.!]{0,40}(sample)|(disclaimer|disclosure)[^\.!]*?(review|sample|(?<![-])\bfree\b(?![-])|\btest(|ing)\b|program)|((?<![-])\bfree\b(?![-])|offe(r|red)|\bno (cost|charge|obligation)\b)[^\.!\d]{0,20}(review|sampl

<IPython.core.display.Javascript object>

## Percentage of Incentivized Reviews detected in data set

In [62]:
incent = data[data["review_body"].str.contains(regex_list)]
incent["discount_text"] = incent["review_body"].swifter.apply(
    lambda x: re.search(regex_list, x).group(0)
)
print(
    "Percentage of Incentivized Reviews in data set",
    (len(incent) / len(data)) * 100,
    "%",
)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=51067, style=ProgressStyle(description_wid…




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Percentage of Incentivized Reviews in data set 2.553392130970161 %


<IPython.core.display.Javascript object>

## Percentage of Incentivized Reviews detected from incentivized_examples.csv

In [63]:
incent_test = test[test["review_body"].str.contains(regex_list)]
incent_test_fail = test[~test["review_body"].str.contains(regex_list)]
print(
    "Percentage of Incentivized Reviews detected from incentivized_examples.csv:",
    (len(incent_test) / len(test)) * 100,
    "%",
)

Percentage of Incentivized Reviews detected from incentivized_examples.csv: 98.53228962818004 %


<IPython.core.display.Javascript object>

## Saving Results File

In [64]:
incent_test_fail.to_csv("Incentivised Reviews Missed.csv", index=False)
incent.to_csv("Incentivised Reviews.csv", index=False)

<IPython.core.display.Javascript object>

In [31]:
dill.dump_session("dump1.db")

<IPython.core.display.Javascript object>