
# Feature engineering

#### Feature engineering
Since we are doing NLP with a Naive Bayes model, we first need to clean and normalize the review text: convert it to lowercase, strip punctuation, and remove emojis.

In [13]:
import pandas as pd
from collections import Counter
import string
import re

def deEmojify(text):
    """Return ``text`` with characters from the listed emoji blocks removed.

    Only the four Unicode ranges spelled out below are matched; any other
    character (including emoji that live in other blocks) is left untouched.
    """
    emoji_blocks = "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    ))
    # One character class; "+" lets a run of consecutive emoji go in one match.
    return re.sub("[" + emoji_blocks + "]+", "", text, flags = re.UNICODE)


def process_text(text):
    """Normalise a single review so it can be tokenised downstream.

    The steps run in this order: lowercase the text, delete every character
    listed in ``string.punctuation`` (ASCII punctuation only), then strip
    emoji with ``deEmojify``.

    Parameters
    ----------
    text : str, None or float NaN
        The raw review. ``None`` and NaN are treated as a missing review.

    Returns
    -------
    str
        The cleaned text, or an empty string when the review is missing.
    """
    # A missing value (None, or the float NaN that pandas uses for empty CSV
    # cells) has no ``.lower()`` and would abort the whole ``.apply`` call.
    # ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Text to lowercase
    text = text.lower()

    # Remove punctuation. The characters are deleted, not replaced by a
    # space, so "don't" becomes "dont".
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove emojis and special characters
    text = deEmojify(text)

    return text



In [14]:
# Load the interim reviews and clean the free-text column in a single chain,
# so the raw and the cleaned frame never share one variable name.
total_data = (
    pd.read_csv("../data/interim/playstore_reviews.csv")
    .assign(review = lambda frame: frame["review"].apply(process_text))
)
total_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,messenger issues ever since the last update i...,0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [15]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors, then hold out 20% of the rows as
# the test sample. The fixed random_state keeps the split reproducible.
y = total_data["polarity"]
X = total_data.drop(columns = ["polarity"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 71,
)

In [16]:
# Put the target back beside its features (matched by position, not by index
# label) and write each split as one self-contained CSV without the index.
X_train["polarity"] = y_train.tolist()
X_train.to_csv("../data/processed/clean_train.csv", index = False)

X_test["polarity"] = y_test.tolist()
X_test.to_csv("../data/processed/clean_test.csv", index = False)