In [26]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from textblob import TextBlob
import json
import plotly.express as px

[nltk_data] Downloading package wordnet to /Users/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# Read the US state-boundary GeoJSON; the parsed dict is used by the map cells further down.
with open('gz_2010_us_040_00_500k.json') as geojson_file:
    geo_json = json.loads(geojson_file.read())

In [59]:
# Two-letter postal code -> upper-case name for the 50 states plus DC and Puerto Rico.
# Names must be spelled exactly like the state names they are later compared with,
# otherwise the affected state silently drops out of the location analysis.
states_abbreviation = {
    'AL': 'ALABAMA', 'AK': 'ALASKA', 'AZ': 'ARIZONA', 'AR': 'ARKANSAS', 'CA': 'CALIFORNIA',
    'CO': 'COLORADO', 'CT': 'CONNECTICUT', 'DE': 'DELAWARE', 'DC': 'DISTRICT OF COLUMBIA',
    'FL': 'FLORIDA', 'GA': 'GEORGIA', 'HI': 'HAWAII', 'ID': 'IDAHO', 'IL': 'ILLINOIS',
    'IA': 'IOWA', 'IN': 'INDIANA', 'KS': 'KANSAS', 'KY': 'KENTUCKY', 'LA': 'LOUISIANA',
    'ME': 'MAINE', 'MD': 'MARYLAND', 'MA': 'MASSACHUSETTS', 'MI': 'MICHIGAN',
    'MN': 'MINNESOTA', 'MS': 'MISSISSIPPI', 'MO': 'MISSOURI', 'MT': 'MONTANA',
    'NE': 'NEBRASKA', 'NV': 'NEVADA', 'NH': 'NEW HAMPSHIRE', 'NJ': 'NEW JERSEY',
    'NM': 'NEW MEXICO', 'NY': 'NEW YORK', 'NC': 'NORTH CAROLINA', 'ND': 'NORTH DAKOTA',
    'OH': 'OHIO', 'OK': 'OKLAHOMA', 'OR': 'OREGON', 'PA': 'PENNSYLVANIA',
    'PR': 'PUERTO RICO', 'RI': 'RHODE ISLAND', 'SC': 'SOUTH CAROLINA', 'SD': 'SOUTH DAKOTA',
    'TN': 'TENNESSEE', 'TX': 'TEXAS', 'UT': 'UTAH', 'VT': 'VERMONT', 'VA': 'VIRGINIA',
    'WA': 'WASHINGTON', 'WV': 'WEST VIRGINIA', 'WI': 'WISCONSIN', 'WY': 'WYOMING'
}

In [2]:
# Load the cleaned review data and discard the index column that was written out with the CSV.
df = (
    pd.read_csv('Final_cleaned_data_2.csv', lineterminator='\n')
    .drop(columns='Unnamed: 0')
)

In [3]:
# Preview the first rows of the freshly loaded review frame.
df.head()

Unnamed: 0,brand,model,rating,review,review_date,review_location,reviewer_name,trial_nights,warranty_years
0,purple,The Purple Mattress,5.0,I am a hard worker with way more to physically...,2021-02-22,United States,Jessica G.,100.0,10.0
1,purple,The Purple Mattress,5.0,First off I am a disabled veteran with insomni...,2017-09-19,"Warrenton, Virginia",Jared B.,100.0,10.0
2,purple,The Purple Mattress,1.0,This mattress is well made and the process of ...,2021-03-05,United States,LeAnn M.,100.0,10.0
3,purple,The Purple Mattress,3.0,The regular twin foam mattress was marked as a...,2021-03-04,United States,Kathryn W.,100.0,10.0
4,purple,The Purple Mattress,4.0,We had a premium mattress for years (Sterns an...,2021-03-04,United States,Adam R.,100.0,10.0


In [10]:
# How often each raw (free-text) reviewer location occurs.
df['review_location'].value_counts()

 United States       5994
  US                 1798
  Undisclosed         502
                      180
 Los Angeles           93
                     ... 
Fort Worth, TX          1
 St Pete Beach          1
 San Rafael             1
 Port Orange FL         1
  Glen Cove, NY         1
Name: review_location, Length: 6595, dtype: int64

In [None]:
# Keep the stop-word collection under its own name. Rebinding `stopwords` would
# shadow the imported nltk corpus reader, so re-running this cell would fail with
# "'list' object has no attribute 'words'". A set makes each membership test O(1).
english_stopwords = set(stopwords.words('english'))


def preprocess_text(text, lemmatization=True, stemming=True):
    """Tokenise ``text`` after removing punctuation and English stop words.

    Parameters
    ----------
    text : str
        Raw text. Case is preserved in the returned tokens; only the
        stop-word comparison is case-insensitive.
    lemmatization : bool, default True
        If True, pass every kept token through ``WordNetLemmatizer.lemmatize``.
    stemming : bool, default True
        If True, pass every token through ``LancasterStemmer.stem``
        (applied after lemmatization when both are enabled).

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Punctuation is deleted (not replaced by a space), so "don't" becomes "dont".
    no_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    words = [word for word in no_punctuation.split() if word.lower() not in english_stopwords]

    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    if stemming:
        lancaster = LancasterStemmer()
        words = [lancaster.stem(word) for word in words]

    return words

In [None]:
# Lower-case every review, then tokenise it with lemmatization on and stemming off.
df['tokenized_text'] = df['review'].apply(
    lambda review_text: preprocess_text(review_text.lower(), True, False)
)

## Count Vectorizers

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def word_counts(text, brand, most_frequent=True, total_words=10, n_grams=1, remove_stop_words=True):
    """Rank the top n-grams of a collection of documents.

    Parameters
    ----------
    text : iterable of str
        The documents (one string per review).
    brand : str
        Label written to the ``brand`` column of the result.
    most_frequent : bool, default True
        True ranks n-grams by raw occurrence count (``CountVectorizer``);
        False ranks them by summed TF-IDF weight (``TfidfVectorizer``).
    total_words : int, default 10
        Number of n-grams to return.
    n_grams : int, default 1
        Size of the n-grams (1 = unigrams, 2 = bigrams, ...).
    remove_stop_words : bool, default True
        Drop scikit-learn's built-in English stop words before counting.

    Returns
    -------
    pandas.DataFrame
        Columns ``Words``, ``Count`` (occurrences, or summed TF-IDF weight when
        ``most_frequent`` is False) and ``brand``, sorted by ``Count`` descending.
    """
    vectorizer_kwargs = dict(
        stop_words='english' if remove_stop_words else None,
        ngram_range=(n_grams, n_grams),
        lowercase=True,
        strip_accents='unicode',
    )
    if most_frequent:
        # max_features keeps only the `total_words` most frequent terms.
        vectorizer = CountVectorizer(max_features=total_words, **vectorizer_kwargs)
    else:
        vectorizer = TfidfVectorizer(**vectorizer_kwargs)

    term_matrix = vectorizer.fit_transform(text)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_words = vectorizer.get_feature_names_out()
    else:
        feature_words = vectorizer.get_feature_names()

    # Sum over documents on the sparse matrix directly instead of densifying it first.
    totals = np.asarray(term_matrix.sum(axis=0)).ravel()

    word_counts_df = pd.DataFrame({'Words': np.asarray(feature_words), 'Count': totals})
    word_counts_df = word_counts_df.sort_values('Count', ascending=False)
    if not most_frequent:
        # The TF-IDF vocabulary is not capped by max_features, so trim it after ranking.
        word_counts_df = word_counts_df.head(total_words)
    word_counts_df['brand'] = brand

    return word_counts_df

In [None]:
# Top-10 unigrams (stop words removed) per brand, computed on the tokenised reviews.
# Per-brand frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
brand_frames = []

for brand in df['brand'].unique():
    brand_df = df[df['brand'] == brand]
    # word_counts expects one string per document, so re-join each token list.
    brand_reviews = [' '.join(brand_text) for brand_text in brand_df['tokenized_text']]
    bwc_df = word_counts(brand_reviews, brand, True, 10, 1, True)
    print(brand)
    print(bwc_df)
    print('\n')
    brand_frames.append(bwc_df)

wc_df = pd.concat(brand_frames) if brand_frames else pd.DataFrame()

In [None]:
# Grouped bar chart of the per-brand top words collected in wc_df.
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.barplot(x='Words', y='Count', hue='brand', data=wc_df, ax=ax)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
ax.set_title('Top 10 words count for each brand')
plt.show()

## Feature Engineering

In [None]:
# Sample sentence containing both side-sleeper phrasings ("side sleeper", "sleep on my side").
# NOTE(review): not referenced by any visible cell — presumably a scratch input for the regexes below.
text_test = 'I am a side sleeper. That means I like to sleep on my side.'

In [None]:
# Find the reviews that mention a sleeping position (side, back, stomach) so that
# flag columns can be built from the matching index labels.
def _matching_review_index(pattern):
    """Return the index labels of reviews whose text matches ``pattern``.

    Matching is case-insensitive so sentence-initial mentions ("Side sleeper here")
    are found too; missing reviews count as "no match" instead of raising.
    """
    mask = df['review'].str.contains(pattern, case=False, regex=True, na=False)
    return df.index[mask].tolist()


side_sleeper_idx = _matching_review_index('sleep on my side|side sleep')
back_sleeper_idx = _matching_review_index('sleep on my back|back sleep')
# NOTE(review): any mention of "stomach"/"belly" counts, not only sleeping-position phrases.
stomach_sleeper_idx = _matching_review_index('stomach|belly')

In [None]:
# Create each flag column up front (0 everywhere) and set the matched rows to 1 in
# one vectorised assignment. Creating the column unconditionally matters: with the
# row-by-row loop a category without any match never got a column, and the next
# cell then failed with a KeyError.
for flag_col, matched_idx in (
    ('side_sleeper', side_sleeper_idx),
    ('back_sleeper', back_sleeper_idx),
    ('stomach_sleeper', stomach_sleeper_idx),
):
    df[flag_col] = 0
    df.loc[matched_idx, flag_col] = 1

In [None]:
# Normalise the sleeper flags to 0/1 integers: anything that is not exactly 1
# (including NaN) becomes 0. A flag column that does not exist yet (no review
# matched that position) is created as all zeros instead of raising KeyError.
for flag_col in ('side_sleeper', 'back_sleeper', 'stomach_sleeper'):
    if flag_col in df.columns:
        df[flag_col] = df[flag_col].eq(1).astype(int)
    else:
        df[flag_col] = 0

In [None]:
# Per-position subsets. `.copy()` makes them independent frames, so columns added to
# them later do not trigger SettingWithCopyWarning or write into a temporary view.
side_sleepers = df[df['side_sleeper'] == 1].copy()
back_sleepers = df[df['back_sleeper'] == 1].copy()
stomach_sleepers = df[df['stomach_sleeper'] == 1].copy()

# One panel per sleeping position: number of matching reviews for each brand.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
panels = (
    ('side_sleeper', side_sleepers, 'Side Sleepers by Brand'),
    ('back_sleeper', back_sleepers, 'Back Sleepers by Brand'),
    ('stomach_sleeper', stomach_sleepers, 'Stomach Sleepers by Brand'),
)
for ax, (flag_col, sleepers, panel_title) in zip(axes, panels):
    sns.countplot(x=flag_col, hue='brand', data=sleepers, ax=ax)
    ax.set_title(panel_title)

plt.show()

In [None]:
# Review counts per mattress model, one stacked panel per sleeping position.
fig, axes = plt.subplots(3, 1, figsize=(15, 20))

model_panels = [
    ('side_sleeper', side_sleepers, 'Side Sleeper Count per Mattress Model'),
    ('back_sleeper', back_sleepers, 'Back Sleeper Count per Mattress Model'),
    ('stomach_sleeper', stomach_sleepers, 'Stomach Sleeper Count per Mattress Model'),
]

for ax, (flag_col, sleepers, panel_title) in zip(axes, model_panels):
    sns.countplot(x=flag_col, hue='model', data=sleepers, palette='Paired', ax=ax)
    # Park the (long) model legend just outside the right edge of the axes.
    ax.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)
    ax.set_title(panel_title)

plt.show()

In [None]:
# TextBlob polarity score for every review in each sleeper subset.
# `assign` returns a new frame, so nothing is written into a slice of `df`
# (which would raise SettingWithCopyWarning and may not persist).
def _review_polarity(review_text):
    """Return the TextBlob sentiment polarity of one review."""
    return TextBlob(review_text).sentiment.polarity


side_sleepers = side_sleepers.assign(polarity=side_sleepers['review'].apply(_review_polarity))
back_sleepers = back_sleepers.assign(polarity=back_sleepers['review'].apply(_review_polarity))
stomach_sleepers = stomach_sleepers.assign(polarity=stomach_sleepers['review'].apply(_review_polarity))

In [None]:
# Mean polarity of side-sleeper reviews per brand and per model.
# The column is selected before aggregating: averaging the whole frame also tries to
# average the text columns, which raises TypeError in pandas >= 2.0.
avg_ss_brand_polarity = side_sleepers.groupby('brand')['polarity'].mean().reset_index()
avg_ss_model_polarity = side_sleepers.groupby('model')['polarity'].mean().reset_index()

In [None]:
# Side sleepers: average polarity per brand (left) and per model (right).
fig, (brand_ax, model_ax) = plt.subplots(1, 2, figsize=(20, 5))

sns.barplot(x='brand', y='polarity', data=avg_ss_brand_polarity, ax=brand_ax)
brand_ax.set_title('Average Polarity Score for Side Sleepers per Mattress Brand')

g = sns.barplot(x='model', y='polarity', data=avg_ss_model_polarity, ax=model_ax)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
model_ax.set_title('Average Polarity Score for Side Sleepers per Mattress model')
plt.show()

In [None]:
# Mean polarity of back-sleeper reviews per brand and per model.
# The column is selected before aggregating: averaging the whole frame also tries to
# average the text columns, which raises TypeError in pandas >= 2.0.
avg_bs_brand_polarity = back_sleepers.groupby('brand')['polarity'].mean().reset_index()
avg_bs_model_polarity = back_sleepers.groupby('model')['polarity'].mean().reset_index()

In [None]:
# Back sleepers: average polarity per brand (left) and per model (right).
fig, (brand_ax, model_ax) = plt.subplots(1, 2, figsize=(20, 5))

sns.barplot(x='brand', y='polarity', data=avg_bs_brand_polarity, ax=brand_ax)
brand_ax.set_title('Average Polarity Score for Back Sleepers per Mattress Brand')

g = sns.barplot(x='model', y='polarity', data=avg_bs_model_polarity, ax=model_ax)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
model_ax.set_title('Average Polarity Score for Back Sleepers per Mattress model')
plt.show()

In [None]:
# Mean polarity of stomach-sleeper reviews per brand and per model.
# The column is selected before aggregating: averaging the whole frame also tries to
# average the text columns, which raises TypeError in pandas >= 2.0.
avg_stomach_brand_polarity = stomach_sleepers.groupby('brand')['polarity'].mean().reset_index()
avg_stomach_model_polarity = stomach_sleepers.groupby('model')['polarity'].mean().reset_index()

In [None]:
# Stomach sleepers: average polarity per brand (left) and per model (right).
# The titles previously said "Back Sleepers" although the stomach-sleeper averages are plotted.
plt.figure(figsize=(20, 5))
plt.subplot(1,2,1)
sns.barplot(x='brand', y='polarity', data=avg_stomach_brand_polarity)
plt.title('Average Polarity Score for Stomach Sleepers per Mattress Brand')

plt.subplot(1,2,2)
g = sns.barplot(x='model', y='polarity', data=avg_stomach_model_polarity)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
plt.title('Average Polarity Score for Stomach Sleepers per Mattress model')
plt.show()

In [None]:
# Mean polarity over all reviews, per brand and per model.
# The polarity column is computed here if it does not exist yet, so this cell also
# works on a fresh kernel run from top to bottom.
if 'polarity' not in df.columns:
    df['polarity'] = df['review'].apply(lambda review_text: TextBlob(review_text).sentiment.polarity)

# Select the column before aggregating: averaging the whole frame also tries to
# average the text columns, which raises TypeError in pandas >= 2.0.
avg_brand_polarity = df.groupby('brand')['polarity'].mean().reset_index()
avg_model_polarity = df.groupby('model')['polarity'].mean().reset_index()

In [None]:
# All reviews: average polarity per brand (left) and per model (right).
fig, (brand_ax, model_ax) = plt.subplots(1, 2, figsize=(20, 5))

sns.barplot(x='brand', y='polarity', data=avg_brand_polarity, ax=brand_ax)
brand_ax.set_title('Average Polarity Score for All Sleepers per Mattress Brand')

g = sns.barplot(x='model', y='polarity', data=avg_model_polarity, ax=model_ax)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
model_ax.set_title('Average Polarity Score for All Sleepers per Mattress model')
plt.show()

## Parsing the Reviews

In [None]:
# Chunking Noun Phrases
# Grammar for a noun-phrase (NP) chunk over POS tags: an optional determiner (<DT>?),
# any number of adjectives (<JJ>*), then a single noun (<NN>).

chunk_grammar = 'NP: {<DT>?<JJ>*<NN>}'
chunk_parser = nltk.RegexpParser(chunk_grammar)

In [None]:
# Print noun-phrase chunk trees for a few reviews of every brand.
# Printing a tree for every review in the data set floods the notebook output, so the
# number of reviews shown per brand is capped; set the constant to None to print all.
MAX_CHUNKED_REVIEWS_PER_BRAND = 3

for brand in df['brand'].unique():
    print(brand)
    brand_reviews = df.loc[df['brand'] == brand, 'review']
    for review in brand_reviews.iloc[:MAX_CHUNKED_REVIEWS_PER_BRAND]:
        # Whitespace tokenisation: punctuation stays attached to the neighbouring word.
        sentence_list = review.split()
        sentence_pos_tag = nltk.pos_tag(sentence_list)
        chunked = chunk_parser.parse(sentence_pos_tag)
        print(chunked)

In [None]:
# Per-review features derived from the raw text:
#   polarity          - TextBlob sentiment polarity
#   review_length     - number of characters
#   review_word_count - number of whitespace-separated tokens
df['polarity'] = df['review'].map(lambda review_text: TextBlob(review_text).sentiment.polarity)
df['review_length'] = df['review'].map(len)
df['review_word_count'] = df['review'].map(lambda review_text: len(review_text.split()))


In [None]:
# Spot-check that the most positive polarity scores look right on up to 5 random samples.
print('5 reviews with the highest polarity: \n')
high_polarity_reviews = df.loc[df['polarity'] == 1, 'review']
# Sample at most as many rows as exist; .sample(5) raises ValueError on fewer than 5 rows.
hp = high_polarity_reviews.sample(min(5, len(high_polarity_reviews))).values
for r in hp:
    print(r)

In [None]:
# Spot-check up to 5 random reviews whose polarity is exactly 0.
print('5 reviews with the neutral polarity: \n')
neutral_polarity_reviews = df.loc[df['polarity'] == 0, 'review']
# The sample is stored under its own name: binding it to `np` replaced the numpy
# module alias and broke every later `np.` call until numpy was imported again.
# Sample at most as many rows as exist; .sample(5) raises ValueError on fewer than 5 rows.
neutral_reviews = neutral_polarity_reviews.sample(min(5, len(neutral_polarity_reviews))).values

for r in neutral_reviews:
    print(r)

In [None]:
# Spot-check up to 5 random reviews with clearly negative polarity (<= -0.5).
print('5 reviews with the negative polarity: \n')
low_polarity_reviews = df.loc[df['polarity'] <= -.5, 'review']
# Sample at most as many rows as exist; .sample(5) raises ValueError on fewer than 5 rows.
lp = low_polarity_reviews.sample(min(5, len(low_polarity_reviews))).values

for r in lp:
    print(r)

In [None]:
# Histogram of polarity scores with the mean and median marked.
# sns.histplot replaces the deprecated sns.distplot(..., kde=False).
plt.figure(figsize=(10, 5))
sns.histplot(df['polarity'], kde=False)
plt.title('Distribution of Polarity Scores')
plt.axvline(df['polarity'].mean(), ls='--', c='r', label='mean')
plt.axvline(df['polarity'].median(), c='black', label='median')
plt.ylabel('Count')
plt.legend()
plt.show()

Most of the sentiment scores are above 0, which indicates that a majority of the reviews are positive.

In [None]:
# Number of reviews for each star rating.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='rating', data=df, ax=ax)
ax.set_title('Distribution of Mattress Ratings')
plt.show()

The distribution of the ratings shows that a majority of people rated the mattresses very highly. That is in line with a majority of the reviews being positive.

In [None]:
# Histogram of review lengths (in characters).
# sns.histplot replaces the deprecated sns.distplot(..., kde=False).
plt.figure(figsize=(10, 5))
sns.histplot(df['review_length'], kde=False, color='red')
plt.title('Review Lengths')
plt.show()

In [None]:
# The review lengths look log-normally or exponentially distributed.
# Left:  normal probability plot of log(review length) - a straight line supports log-normal.
# Right: log of the empirical complementary CDF - a straight line supports exponential.
import numpy as np  # kept so this cell works even if `np` was rebound earlier in the session
from scipy.stats import norm

n_reviews = len(df)

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
# Theoretical standard-normal quantiles at the plotting positions (i - 0.5) / n.
# Unlike a random normal sample, these are deterministic, so the plot is
# reproducible and carries no sampling noise of its own.
plotting_positions = (np.arange(1, n_reviews + 1) - 0.5) / n_reviews
theoretical_quantiles = norm.ppf(plotting_positions)
plt.plot(theoretical_quantiles, np.sort(np.log(df['review_length'])))
plt.xlabel('Standard Normal Values')
plt.ylabel('Log of Review Lengths')
plt.title('Normal Probability Plot of Log of Review Lengths')

plt.subplot(1, 2, 2)
sorted_values = np.sort(df['review_length'].to_numpy())
# Empirical P(X >= x_i) runs from 1 down to 1/n and never reaches 0,
# so the logarithm below stays finite for every point.
ccdf = 1 - np.arange(n_reviews) / n_reviews
plt.plot(sorted_values, np.log(ccdf))
plt.title('Complementary CDF')
plt.xlabel('Review Lengths')
plt.ylabel('Log-CCDF')
plt.show()

The review-length distribution loosely fits both a log-normal and an exponential distribution, but visually the log-normal distribution looks like the better fit.

In [None]:
# Histogram of review word counts.
# sns.histplot replaces the deprecated sns.distplot(..., kde=False).
plt.figure(figsize=(10, 5))
sns.histplot(df['review_word_count'], kde=False)
plt.title('Review Word Counts')
plt.show()

In [None]:
# Warranty length (left) and trial period (right) per brand.
fig, (warranty_ax, trial_ax) = plt.subplots(1, 2, figsize=(15, 5))

sns.barplot(x='brand', y='warranty_years', data=df, ax=warranty_ax)
warranty_ax.set_title('Warranty Years per Brand')

sns.barplot(x='brand', y='trial_nights', data=df, ax=trial_ax)
trial_ax.set_title('Trial Nights per Brand')

plt.show()

In [None]:
# Connect to the PostgreSQL database named 'Mattresses' and define the query that
# selects every row of the `mattressinfo` table.
# NOTE(review): only the database name is given, so host/user/password presumably come
# from the libpq defaults or environment - confirm. The connection is not closed in
# any visible cell.
import psycopg2
conn = psycopg2.connect('dbname=Mattresses')
cur = conn.cursor()  # NOTE(review): `cur` is not used by any visible cell
sql = 'SELECT * FROM mattressinfo'

In [None]:
# Rename Sealy's "Chill Memory Foam Mattress" to "Cocoon Chill".
# NOTE(review): presumably this is the model name used in the database table - confirm.
# The brand is compared case-insensitively, so the rename works whether or not the
# brand column has already been title-cased.
is_sealy_chill = (df['brand'].str.lower() == 'sealy') & (df['model'] == 'Chill Memory Foam Mattress')
df.loc[is_sealy_chill, 'model'] = 'Cocoon Chill'

In [None]:
# Run the query held in `sql` on the open connection and load the result into a DataFrame.
mattress_info_df = pd.read_sql_query(sql, conn)

In [None]:
# Title-case the brand names (e.g. 'purple' -> 'Purple').
df['brand'] = df['brand'].map(str.title)

In [None]:
# Left-join the mattress metadata onto the reviews by brand and model; every review row is kept.
# NOTE(review): this rebinds `df`, so running the cell a second time merges again and
# pandas suffixes the overlapping columns - run it once per fresh kernel.
df = df.merge(mattress_info_df, on=['brand', 'model'], how='left')

In [None]:
# Price against warranty length (left) and against trial period (right), coloured by brand.
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.scatterplot(x='warranty_years', y='price', hue='brand', data=df)
plt.title('Warranty vs. Price per Brand')

plt.subplot(1, 2, 2)
sns.scatterplot(x='trial_nights', y='price', hue='brand', data=df)
# This panel shows trial nights against price; the old title omitted the price axis.
plt.title('Trial Nights vs. Price per Brand')

plt.show()

In [9]:
# Map each free-text reviewer location to a US state and to that state's GeoJSON id.

# State name -> id lookup from the GeoJSON. It is built here because the id assignment
# at the end of this cell needs it. Each feature also gets a top-level 'id' copied from
# its STATE property.
state_id_map = {}
for feature in geo_json['features']:
    feature['id'] = feature['properties']['STATE']
    state_id_map[feature['properties']['NAME']] = feature['id']

# Upper-case keys make the lookup case-insensitive: str.title() would turn
# 'DISTRICT OF COLUMBIA' into 'District Of Columbia', which differs from the
# conventional 'District of Columbia' spelling.
state_id_by_upper_name = {name.upper(): state_id for name, state_id in state_id_map.items()}

# Longest names first, so 'WEST VIRGINIA' is tried before 'VIRGINIA'.
state_names_longest_first = sorted(set(states_abbreviation.values()), key=len, reverse=True)


def _to_state_name(location):
    """Return the upper-case state name found in ``location``, else the stripped text.

    Only whole tokens are matched. Plain substring tests are unsafe here: 'AL' is
    contained in 'CALIFORNIA', so every California review would become Alabama.
    """
    # For "City, State[, ...]" keep only the part after the first comma.
    region = location.split(',')[1] if ',' in location else location
    region_upper = ' '.join(region.upper().split())

    # 1) The whole field is a postal abbreviation ("TX").
    if region_upper in states_abbreviation:
        return states_abbreviation[region_upper]

    # 2) A trailing postal abbreviation ("Port Orange FL", "Kansas City MO").
    tokens = re.findall(r'[A-Z]+', region_upper)
    if len(tokens) > 1 and tokens[-1] in states_abbreviation:
        return states_abbreviation[tokens[-1]]

    # 3) A full state name appearing as whole words.
    for state_name in state_names_longest_first:
        if re.search(r'\b{}\b'.format(re.escape(state_name)), region_upper):
            return state_name

    return region.strip()


df_for_locations = df.dropna(subset=['review_location']).copy()
df_for_locations['review_location'] = df_for_locations['review_location'].apply(_to_state_name)

# Locations that are not a known state (e.g. 'United States', 'Undisclosed') get NaN.
df_for_locations['id'] = df_for_locations['review_location'].str.upper().map(state_id_by_upper_name)

In [None]:
# Give every GeoJSON feature a top-level 'id' (its STATE property) and record
# a state NAME -> id lookup alongside.
state_id_map = {}
for state_feature in geo_json['features']:
    props = state_feature['properties']
    state_feature['id'] = props['STATE']
    state_id_map[props['NAME']] = state_feature['id']

In [None]:
# Number of reviews whose location has no state id.
df_for_locations['id'].isna().sum()

In [None]:
# Keep only the reviews that were matched to a state id.
df_for_locations = df_for_locations.dropna(subset=['id'])

In [None]:
# Reviews per state, plus a natural-log-scaled copy of the count for colouring the map.
location_counts = (
    df_for_locations
    .groupby(['review_location', 'id'])['review']
    .count()
    .reset_index()
    .rename(columns={'review': 'counts'})
)
location_counts['count_scaled'] = np.log(location_counts['counts'])

location_counts.head()

In [None]:
# US choropleth coloured by the log-scaled review count; hovering shows the raw count.
fig = px.choropleth(
    location_counts,
    locations='id',
    geojson=geo_json,
    color='count_scaled',
    scope='usa',
    hover_name='review_location',
    hover_data=['counts'],
)
fig.show()

In [None]:
# Plot out location of most reviewers on a map (tile-map version of the choropleth).
# This uses the per-state review counts and the state GeoJSON already loaded in this
# notebook. The previous arguments (`counties`, 'fips', 'unemp') do not exist here, so
# the cell raised NameError.
fig = px.choropleth_mapbox(location_counts, geojson=geo_json, locations='id', color='count_scaled',
                           color_continuous_scale="Viridis",
                           mapbox_style="carto-positron",
                           zoom=3, center={"lat": 37.0902, "lon": -95.7129},
                           opacity=0.5,
                           hover_name='review_location', hover_data=['counts'],
                           labels={'count_scaled': 'log(review count)'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## Text EDA

### Top 10 Unigram Words per Brand including Stopwords

In [None]:
# Top 10 unigrams per brand on the raw reviews, stop words kept. One panel per brand (2 x 3 grid).
fig = plt.figure(figsize=(20, 10))

for panel_no, brand in enumerate(df['brand'].unique(), start=1):
    ax = fig.add_subplot(2, 3, panel_no)
    brand_reviews = df.loc[df['brand'] == brand, 'review']
    brand_top_review_words = word_counts(brand_reviews, brand, True, 10, 1, False)
    sns.barplot(x='Words', y='Count', data=brand_top_review_words, ax=ax)
    ax.set_title('{} Mattress top 10 Unigram words including Stopwords'.format(brand.title()))

### Top 10 Unigram Words per Brand not including Stopwords

In [None]:
# Top 10 unigrams per brand with English stop words removed. One panel per brand (2 x 3 grid).
plt.figure(figsize=(20, 10))

for i, brand in enumerate(df['brand'].unique(), start=1):
    plt.subplot(2, 3, i)
    brand_top_review_words = word_counts(df.loc[df['brand'] == brand, 'review'], brand, True, 10, 1, True)
    g = sns.barplot(x='Words', y='Count', data=brand_top_review_words)
    g.set_xticklabels(g.get_xticklabels(), rotation=45)
    # Stop words are removed here (last argument is True), so the title must not say "including".
    plt.title('{} Mattress top 10 Unigram words not including Stopwords'.format(brand.title()))

# Spacing is a figure-level setting; applying it once after the loop is enough.
plt.subplots_adjust(hspace=0.5)

### Top 10 Bigram Words per Brand including Stopwords

In [None]:
# Top 10 bigrams per brand, stop words kept. One panel per brand (2 x 3 grid).
fig = plt.figure(figsize=(20, 10))

for panel_no, brand in enumerate(df['brand'].unique(), start=1):
    ax = fig.add_subplot(2, 3, panel_no)
    brand_reviews = df.loc[df['brand'] == brand, 'review']
    brand_top_review_words = word_counts(brand_reviews, brand, True, 10, 2, False)
    g = sns.barplot(x='Words', y='Count', data=brand_top_review_words, ax=ax)
    g.set_xticklabels(g.get_xticklabels(), rotation=45)
    fig.subplots_adjust(hspace=0.5)
    ax.set_title('{} Mattress top 10 Bigram words including Stopwords'.format(brand.title()))

### Top 10 Bigram Words per Brand not including Stopwords

In [None]:
# Top 10 bigrams per brand with English stop words removed. One panel per brand (2 x 3 grid).
plt.figure(figsize=(20, 10))

for i, brand in enumerate(df['brand'].unique(), start=1):
    plt.subplot(2, 3, i)
    brand_top_review_words = word_counts(df.loc[df['brand'] == brand, 'review'], brand, True, 10, 2, True)
    g = sns.barplot(x='Words', y='Count', data=brand_top_review_words)
    g.set_xticklabels(g.get_xticklabels(), rotation=90)
    # Stop words are removed here (last argument is True), so the title must not say "including".
    plt.title('{} Mattress top 10 Bigram words not including Stopwords'.format(brand.title()))

# Spacing is a figure-level setting; applying it once after the loop is enough.
plt.subplots_adjust(hspace=0.9)

### Top 10 Trigram Words per Brand including Stopwords

In [None]:
# Top 10 trigrams per brand, stop words kept. One panel per brand (2 x 3 grid).
fig = plt.figure(figsize=(20, 10))

for panel_no, brand in enumerate(df['brand'].unique(), start=1):
    ax = fig.add_subplot(2, 3, panel_no)
    brand_reviews = df.loc[df['brand'] == brand, 'review']
    brand_top_review_words = word_counts(brand_reviews, brand, True, 10, 3, False)
    g = sns.barplot(x='Words', y='Count', data=brand_top_review_words, ax=ax)
    g.set_xticklabels(g.get_xticklabels(), rotation=75)
    fig.subplots_adjust(hspace=0.9)
    ax.set_title('{} Mattress top 10 Trigram words including Stopwords'.format(brand.title()))

### Top 10 Trigram Words per Brand not including Stopwords

In [None]:
# Top 10 trigrams per brand with English stop words removed. One panel per brand (2 x 3 grid).
plt.figure(figsize=(20, 10))

for i, brand in enumerate(df['brand'].unique(), start=1):
    plt.subplot(2, 3, i)
    brand_top_review_words = word_counts(df.loc[df['brand'] == brand, 'review'], brand, True, 10, 3, True)
    g = sns.barplot(x='Words', y='Count', data=brand_top_review_words)
    g.set_xticklabels(g.get_xticklabels(), rotation=75)
    # Stop words are removed here (last argument is True), so the title must not say "including".
    plt.title('{} Mattress top 10 Trigram words not including Stopwords'.format(brand.title()))

# Spacing is a figure-level setting; applying it once after the loop is enough.
plt.subplots_adjust(hspace=0.9)

# Part of Speech Tagging

In [None]:
# Frequency of the 20 most common part-of-speech tags in each brand's reviews.
plt.figure(figsize=(20, 10))
for i, brand in enumerate(df['brand'].unique(), start=1):
    plt.subplot(2, 3, i)

    # Join the actual review texts. str(Series) is only the truncated printed repr
    # (index numbers, '...' ellipses, the 'Name: review, dtype: object' footer), so
    # tagging it analysed a handful of clipped lines instead of the reviews.
    brand_text = ' '.join(df.loc[df['brand'] == brand, 'review'])
    pos_blob = TextBlob(brand_text)
    pos_df = pd.DataFrame(pos_blob.tags, columns=['word', 'pos'])

    # Build an explicit two-column frame (pos, count). This does not depend on what
    # name value_counts() gives its result, which changed in pandas 2.0.
    pos_counts = (
        pos_df['pos']
        .value_counts()
        .head(20)
        .rename_axis('pos')
        .reset_index(name='count')
    )
    g = sns.barplot(x='pos', y='count', data=pos_counts)
    g.set_xticklabels(g.get_xticklabels(), rotation=45)
    plt.title('{} Mattress Review Part of Speech Frequency'.format(brand.title()))

In [None]:
# Boxplots of rating, polarity, review length and review word count per brand follow.
# This one: review polarity per brand.
# x/y are passed by keyword together with `data`: recent seaborn versions treat the
# first positional argument as `data`, not as x.
plt.figure(figsize=(10, 5))
sns.boxplot(x='polarity', y='brand', data=df)
plt.title('Boxplots of Review Polarity per Brand')
plt.show()

In [None]:
# Boxplots of the star ratings for each brand.
# x/y are passed by keyword together with `data`: recent seaborn versions treat the
# first positional argument as `data`, not as x.
plt.figure(figsize=(10, 5))
sns.boxplot(x='rating', y='brand', data=df)
plt.title('Boxplots of Review Ratings per Brand')
plt.show()

In [None]:
# Boxplots of the review lengths (in characters) for each brand.
# x/y are passed by keyword together with `data`: recent seaborn versions treat the
# first positional argument as `data`, not as x.
plt.figure(figsize=(10, 5))
sns.boxplot(x='review_length', y='brand', data=df)
# The title now names the plotted quantity (it previously said "Review Ratings").
plt.title('Boxplots of Review Lengths per Brand')
plt.show()

In [None]:
# Boxplots of the review word counts for each brand.
# x/y are passed by keyword together with `data`: recent seaborn versions treat the
# first positional argument as `data`, not as x.
plt.figure(figsize=(10, 5))
sns.boxplot(x='review_word_count', y='brand', data=df)
# The title now names the plotted quantity (it previously said "Review Ratings").
plt.title('Boxplots of Review Word Counts per Brand')
plt.show()

In [None]:
# Preview the review frame after the columns added in the cells above.
df.head()