In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.corpus import names  

from string import punctuation
import pandas as pd

In [2]:
eng_stopwords = stopwords.words('english')
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [3]:
#load the data from the Womens Clothing E-Commerce Reviews.csv file
filepath = r"C:\Users\Motunrayo Adubi\OneDrive\Desktop\5_week_Immersive\Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(filepath, encoding = "latin-1") 

# show the first 5 rows in data frame
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
# number of rows and columns in data frame
df.shape

(23486, 11)

In [5]:
# extract relevant column into a new data frame (df_rel)
data = [df["Review Text"]]

headers = ["Review Text"]

df_rel = pd.concat(data, axis=1, keys=headers)

In [6]:
df_rel

Unnamed: 0,Review Text
0,Absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...
...,...
23481,I was very happy to snag this dress at such a ...
23482,"It reminds me of maternity clothes. soft, stre..."
23483,"This fit well, but the top was very see throug..."
23484,I bought this dress for a wedding i have this ...


In [7]:
df_rel.shape

(23486, 1)

In [8]:
# drop missing values from data frame
clean_df = df_rel.dropna() 
clean_df

Unnamed: 0,Review Text
0,Absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...
...,...
23481,I was very happy to snag this dress at such a ...
23482,"It reminds me of maternity clothes. soft, stre..."
23483,"This fit well, but the top was very see throug..."
23484,I bought this dress for a wedding i have this ...


In [9]:
# 845 values in the Review Text column of original data frame
df.isna().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [10]:
#initilize function to do sentiment analysis
sid = SentimentIntensityAnalyzer()

In [11]:
#create a function to clean up each review
#then it will analyze and assign a sentiment polarity
def reviewSentiment(review):
    
    #make text lowercase
    review = review.lower()
    
    #tokenize the review
    #tknz_review is a list
    tknz_review = word_tokenize(review)
    
    #remove puntuation
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
    
    #empty list to hold "cleaned" tokens
    clean_tokens = []
    
    #remove filler words
    for token in tknz_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    #put sentence back together with remaining clean words
    clean_review = ' '.join(clean_tokens)
    
    #get the polarity scores dictionary
    sid_rev = sid.polarity_scores(clean_review)
    
    #get sentiment polarity from the "compound" key in the sid_rev dictionary
    r_comp = sid_rev['compound']
    
    #return the sentiment value
    return r_comp

In [12]:
#create a new column to hold sentiment value from function
clean_df['review_sentiment'] = clean_df['Review Text'].apply(reviewSentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
#erify sentiment values in new column
clean_df.head()

Unnamed: 0,Review Text,review_sentiment
0,Absolutely wonderful - silky and sexy and comf...,0.8991
1,Love this dress! it's sooo pretty. i happene...,0.971
2,I had such high hopes for this dress and reall...,0.9062
3,"I love, love, love this jumpsuit. it's fun, fl...",0.9464
4,This shirt is very flattering to all due to th...,0.9117


In [14]:
clean_df.dtypes

Review Text          object
review_sentiment    float64
dtype: object

In [15]:
#create a function to assign a polarity category to the sentiment
def sentimentCategory(sent_num):
    if sent_num >= 0.2:
        return "positive"
    if sent_num <= -0.2:
        return "negative"
    else:
        return "neutral"

In [16]:
#create a new column to hold sentiment category
clean_df['sentiment_category'] = clean_df['review_sentiment'].apply(sentimentCategory)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
clean_df.head()

Unnamed: 0,Review Text,review_sentiment,sentiment_category
0,Absolutely wonderful - silky and sexy and comf...,0.8991,positive
1,Love this dress! it's sooo pretty. i happene...,0.971,positive
2,I had such high hopes for this dress and reall...,0.9062,positive
3,"I love, love, love this jumpsuit. it's fun, fl...",0.9464,positive
4,This shirt is very flattering to all due to th...,0.9117,positive


In [18]:
clean_df.tail()

Unnamed: 0,Review Text,review_sentiment,sentiment_category
23481,I was very happy to snag this dress at such a ...,0.8979,positive
23482,"It reminds me of maternity clothes. soft, stre...",0.7579,positive
23483,"This fit well, but the top was very see throug...",0.91,positive
23484,I bought this dress for a wedding i have this ...,0.8272,positive
23485,This dress in a lovely platinum is feminine an...,0.9286,positive


In [19]:
clean_df.loc[25,'sentiment_category']

'positive'

In [20]:
#compare frequency of positive, negative, and neutral reviews
clean_df['sentiment_category'].value_counts()

positive    21380
neutral       717
negative      544
Name: sentiment_category, dtype: int64

In [21]:
clean_df['Review Text'].iloc[0]

'Absolutely wonderful - silky and sexy and comfortable'

In [22]:
clean_df['review_sentiment'].iloc[0]

0.8991

In [23]:
clean_df['sentiment_category'].iloc[0]

'positive'

In [24]:
clean_df['Review Text'].iloc[4]

'This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!!'

In [25]:
clean_df['review_sentiment'].iloc[4]

0.9117

In [26]:
clean_df['sentiment_category'].iloc[4]

'positive'

In [27]:
clean_df[['sentiment_category']][101:120]

Unnamed: 0,sentiment_category
104,neutral
105,positive
106,positive
107,positive
108,positive
109,positive
110,negative
111,positive
112,positive
113,positive
