In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data and artifact path used later
# in this notebook lives under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the review CSV from Drive and keep only the review text and its
# sentiment label; the remaining columns are not used in this notebook.
reviews_csv = '/content/drive/MyDrive/Colab Notebooks/amazon/FshionProductReviews_V5_NLP.csv'
columns_of_interest = ['reviewText', 'sentiment']
data = pd.read_csv(reviews_csv)[columns_of_interest]
data.head()

Unnamed: 0,reviewText,sentiment
0,agree opening small bent hook expensive earrin...,0
1,tiny opening,0
2,little plastic back work great loosing hook ea...,1
3,mother law wanted present sister d work,0
4,look stretched carefully push case finger...,0


In [4]:
# Summarize the frame: row count, per-column non-null counts, and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658184 entries, 0 to 658183
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   reviewText  655715 non-null  object
 1   sentiment   658184 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.0+ MB


In [5]:
# Count missing values per column before cleaning.
data.isna().sum()

reviewText    2469
sentiment        0
dtype: int64

In [6]:
# Drop every row that has a missing value (here: reviews with no text).
# `data` was produced by a column selection, so mutating it with inplace=True
# can raise SettingWithCopyWarning; rebinding the name to the filtered frame
# avoids that and yields the same rows with the same index.
data = data.dropna(axis=0)

In [7]:
# Re-check missing values per column; both counts should now be zero.
data.isna().sum()

reviewText    0
sentiment     0
dtype: int64

In [8]:
# Class distribution of the sentiment label across the cleaned reviews.
data['sentiment'].value_counts()

1    444251
0    211464
Name: sentiment, dtype: int64

# Create a Balanced Dev Set

In [9]:
# Build the pieces of a class-balanced dev set: for each label take the first
# and the last 10,000 rows of that class (20,000 rows per class in total).
negative_mask = data['sentiment'].eq(0)
positive_mask = data['sentiment'].eq(1)

devSet1 = data[negative_mask].head(10000)
devSet2 = data[positive_mask].head(10000)
devSet3 = data[negative_mask].tail(10000)
devSet4 = data[positive_mask].tail(10000)

In [10]:
# Stack the four slices row-wise into one frame and renumber the rows 0..N-1
# so the original (non-contiguous) index labels are discarded.
devSet = pd.concat([devSet1, devSet2, devSet3, devSet4], axis=0, ignore_index=True)

In [11]:
# Summarize the dev set: row count, per-column non-null counts, and dtypes.
devSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  40000 non-null  object
 1   sentiment   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [12]:
# Confirm the dev set holds the same number of rows for each sentiment label.
devSet['sentiment'].value_counts()

0    20000
1    20000
Name: sentiment, dtype: int64

In [13]:
# Drop the names of the four intermediate slices; they are no longer needed
# once the combined dev set has been built.
del devSet1, devSet2
del devSet3, devSet4

# Splitting Data into Train and Test

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 10% of the dev set for testing. The text is passed as a one-column
# DataFrame (so the splits keep the `reviewText` column) and the label as a
# Series; the fixed random_state makes the shuffled split reproducible.
(train_tokens, test_tokens,
 train_sentiment, test_sentiment) = train_test_split(
    devSet[['reviewText']],
    devSet['sentiment'],
    shuffle=True,
    random_state=33,
    test_size=0.1,
)

In [16]:
# Peek at the first training reviews; the index shows the shuffled dev-set row labels.
train_tokens.head()

Unnamed: 0,reviewText
13593,find shoe hot foot hot foot seriously arent ai...
15785,perfect every super casual anywhere sandal nev...
37514,exactly picture show love fit size fit great m...
27594,tickle skin little little long van fix
29695,terrible close suit size needed owned suit m...


In [17]:
# Peek at the first held-out test reviews.
test_tokens.head()

Unnamed: 0,reviewText
5502,break half try really bad chance
36357,year like look great quality comfortable brea...
23479,advertised bikini sent bathing suit x big
19736,high arch work well expensive medical orthotic...
29026,care hot jogger company


In [18]:
# Show (rows, columns) for the train and test text frames to verify the 90/10 split.
train_tokens.shape, test_tokens.shape

((36000, 1), (4000, 1))

# **Feature Extraction**

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# TF-IDF featurizer with the vocabulary capped at 100 terms, so every review
# is encoded as a 100-dimensional vector.
vectorizer = TfidfVectorizer(max_features=100)

In [21]:
# Learn the vocabulary and IDF weights from the training reviews only, then
# encode the test reviews with that same fitted vectorizer.
# Calling fit_transform on the test split would re-fit the vectorizer: the test
# matrix would be built from a different vocabulary/IDF (so column j would not
# mean the same term in train and test), test statistics would leak into the
# features, and `vectorizer` itself would end up fitted on the test data.
train_embeddings = vectorizer.fit_transform(train_tokens.reviewText).toarray()
test_embeddings = vectorizer.transform(test_tokens.reviewText).toarray()

In [22]:
# Dense training matrix shape: (number of training reviews, vocabulary size).
train_embeddings.shape

(36000, 100)

In [23]:
# Dense test matrix shape: (number of test reviews, vocabulary size).
test_embeddings.shape

(4000, 100)

In [24]:
# Largest TF-IDF weight in each training row (one value per review).
np.max(train_embeddings, axis=1)

array([0.84150072, 0.66499625, 0.4858737 , ..., 0.7703328 , 1.        ,
       0.77462271])

# Save Vectorizer

In [25]:
from joblib import dump

In [26]:
# Persist the vectorizer to Drive with joblib (light compression, level 1) so
# the same text-to-feature mapping can be reloaded later.
vectorizer_path = '/content/drive/MyDrive/Colab Notebooks/amazon/TfIdfVectorizer.pkl'
dump(vectorizer, vectorizer_path, compress=1)

['/content/drive/MyDrive/Colab Notebooks/amazon/TfIdfVectorizer.pkl']

# Save to feature store

In [27]:
# Write the train/test feature matrices and their labels into a single .npz
# archive on Drive; each keyword becomes the array's key inside the archive.
feature_store_path = '/content/drive/MyDrive/Colab Notebooks/amazon/dataset_V6.npz'
arrays_to_store = {
    'train_embeddings': train_embeddings,
    'train_sentiment': train_sentiment,
    'test_embeddings': test_embeddings,
    'test_sentiment': test_sentiment,
}
np.savez(feature_store_path, **arrays_to_store)