# Feature Extraction

In [1]:
# Loading Modules
import pandas as pd 
import numpy as np
import collections
import matplotlib.pyplot as plt
import nltk 
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.feature_extraction.text import TfidfTransformer
import seaborn as sns 

%matplotlib inline  
plt.rcParams["figure.figsize"] = (10, 7)

## Reading in Data

In [2]:
df = pd.read_pickle("labelled_data_preprocessed.pkl")
df.sample(5)

Unnamed: 0,class,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,raw_sentence,pos_tagged_sentence,lemmatized
17117,1,5,1,3,I go thru boxes of these at work I have been g...,"[(i, n), (go, v), (thru, a), (boxes, n), (of, ...","[go, thru, box, work, get, office, depot, year..."
20888,1,5,1,23,It came wrapped up in the cutest little bag an...,"[(it, ), (came, v), (wrapped, v), (up, r), (in...","[come, wrap, cut, little, bag, wrap, hairnet, ..."
20982,1,2,1,19,These run I would say two sizes smaller than w...,"[(these, ), (run, v), (i, n), (would, ), (say,...","[run, would, say, two, size, small, true, size..."
6557,0,5,0,16,Awesome movie and very creative The dancing is...,"[(awesome, a), (movie, n), (and, ), (very, r),...","[awesome, movie, creative, dancing, nice, exci..."
2746,0,4,0,13,Its a beautiful necklace I have received many ...,"[(its, ), (a, ), (beautiful, a), (necklace, n)...","[beautiful, necklace, receive, many, complimen..."


## Columns keys

**`class`** = class label the sentence (**this is our target label**).

    0 - Fake
    1 - Real

**`RATING`** = rating by customer (range 1 - 5)

**`VERIFIED_PURCHASE`** = binary label confirming if the product was truly purchased or not

**`PRODUCT_CATEGORY`** = numerical categories (range 1 - 19)
   
**`raw_sentence`** = raw sentence text

**`pos_tagged_sentence`** = an annotated version of each sentence obtained by:

    - lowercasing the sentence
    - tokenizing the text (roughly, splitting the into morphological units such as words and punctuation marks)
    - annotating the grammatical category (a.k.a. Part of Speech: PoS) of each token

**`lemmatized`** = annotated version of each sentence obtained by using the annotation available in the `pos_tagged_sentence` representation in order to tag the base form (e.g. "*be*" for the inflected word "*am*") of each token.

## Feature Extraction

In [3]:
# Core Feature Dataframe
feature_df = pd.DataFrame()

### Binary and Numerical Features

In [4]:
feature_df["rating"] = df["RATING"]

feature_df["category"] = df["PRODUCT_CATEGORY"]

feature_df["verified"] = df["VERIFIED_PURCHASE"]

### Word and Character Counts

In [5]:
# Word count
feature_df["word_counts"] = df["lemmatized"].apply(lambda x: len(str(x).split()))

# Char count + whitespace
feature_df["char_counts"] = df["lemmatized"].apply(lambda x: len(str(x)))

### N-Grams

In [6]:
# Formatting text into usable format
lemma_list = df["lemmatized"].tolist()
lemma_list = list(np.concatenate(lemma_list).flat)

In [7]:
def ngram_counter(lemmas, n):
    """
    Used to count the number of ngrams, finding the 200 most common
    """
    ngram_counts = collections.Counter(nltk.ngrams(lemmas,n))
    ngrams_200 = ngram_counts.most_common(200)
    
    return ngrams_200

In [8]:
# Bigrams - Most common 2 words in a row
bigrams_200 = ngram_counter(lemma_list, 2)

# Unigrams - Most common word
unigrams_200 = ngram_counter(lemma_list, 1)

In [9]:
def n_gram_counter(ngram_type, n):
    """
    Counting frequency each n-gram appears in a review
    """
    ngram_frequencies = []

    for ngram in tqdm(ngram_type):
        # 1 ngram 
        ngram_frequency = []
        for j in range(len(df)):
            # All dataframe
            # Count 
            ngram_freq = list(nltk.ngrams(df["lemmatized"][j],n)).count(ngram[0])
            # Append frequency in sentence
            ngram_frequency.append(ngram_freq)
        # Append frequency in all sentences
        ngram_frequencies.append(ngram_frequency)
        
    ngrams = pd.DataFrame(ngram_frequencies)
    return ngrams

In [10]:
bigrams = n_gram_counter(bigrams_200, 2)
unigrams = n_gram_counter(unigrams_200, 1)

# Combining both sets of n-grams
ngrams_df = pd.concat([unigrams, bigrams])
ngrams_df

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20990,20991,20992,20993,20994,20995,20996,20997,20998,20999
0,0,0,0,0,4,2,0,1,2,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,2,0
2,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
3,0,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,2,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Using TfidTransformer on existing count matrix
ngrams2_matrix_tfidf = TfidfTransformer(norm="l1").fit_transform(ngrams_df).todense()

### Combining All Feature Sets

In [12]:
print(feature_df.shape, ngrams2_matrix_tfidf.shape)
features_set_ngrams = np.concatenate((feature_df.T, ngrams2_matrix_tfidf))

# Adding y-label (to withstand later shuffling)
fs_ngram_df = pd.DataFrame(features_set_ngrams.T)
fs_ngram_df["y"] = df["class"]
fs_ngram_df.head()

(21000, 5) (400, 21000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,396,397,398,399,400,401,402,403,404,y
0,4.0,0.0,0.0,10.0,91.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,4.0,1.0,1.0,40.0,400.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3.0,2.0,0.0,22.0,198.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,4.0,3.0,0.0,19.0,173.0,0.0,0.0,0.00016,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4.0,4.0,0.0,27.0,248.0,0.000577,0.0,0.0,0.000172,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [13]:
fs_ngram_df.to_pickle("features_as_vectors2.pkl")