## Sentiment Analysis for wine shop reviews

In [1]:
import re

# to handle datasets
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt


In [2]:
# Filesystem root, and the Drive folder under it that holds the dataset CSVs.
root_dir = "/content/"
base_dir = f"{root_dir}drive/My Drive/Knight ML Assignment/Data/"
base_dir

'/content/drive/My Drive/Knight ML Assignment/Data/'

In [3]:
# Initialization: reload edited modules automatically and render plots inline
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Importing the fastai library
from fastai import *
from fastai.text import *

In [4]:
# Wrap the data directory in a Path so files can be addressed with the `/` operator.
# NOTE(review): `Path` is not imported explicitly; presumably it comes from the
# fastai star-imports above — confirm.
path = Path(base_dir)

In [5]:
# List the files in the data directory.
# NOTE(review): `ls()` is not a standard pathlib method; presumably it is added
# by the fastai imports — confirm.
path.ls()

[PosixPath('/content/drive/My Drive/Knight ML Assignment/Data/test.csv'),
 PosixPath('/content/drive/My Drive/Knight ML Assignment/Data/train.csv')]

In [6]:
# Load the training split into a DataFrame and preview its first rows.
train_csv = path / 'train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,user_name,country,review_title,review_description,designation,points,price,province,region_1,region_2,winery,variety
0,,Australia,Andrew Peace 2007 Peace Family Vineyard Chardo...,"Classic Chardonnay aromas of apple, pear and h...",Peace Family Vineyard,83,10.0,Australia Other,South Eastern Australia,,Andrew Peace,Chardonnay
1,@wawinereport,US,North by Northwest 2014 Red (Columbia Valley (...,This wine is near equal parts Syrah and Merlot...,,89,15.0,Washington,Columbia Valley (WA),Columbia Valley,North by Northwest,Red Blend
2,,Italy,Renato Ratti 2007 Conca (Barolo),Barolo Conca opens with inky dark concentratio...,Conca,94,80.0,Piedmont,Barolo,,Renato Ratti,Nebbiolo
3,@vossroger,France,Domaine l'Ancienne Cure 2010 L'Abbaye White (B...,It's impressive what a small addition of Sauvi...,L'Abbaye,87,22.0,Southwest France,Bergerac Sec,,Domaine l'Ancienne Cure,Bordeaux-style White Blend
4,@vossroger,France,Château du Cèdre 2012 Le Cèdre Vintage Malbec ...,"This ripe, sweet wine is rich and full of drie...",Le Cèdre Vintage,88,33.0,France Other,Vin de Liqueur,,Château du Cèdre,Malbec


In [7]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82657 entries, 0 to 82656
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_name           63264 non-null  object 
 1   country             82622 non-null  object 
 2   review_title        82657 non-null  object 
 3   review_description  82657 non-null  object 
 4   designation         59010 non-null  object 
 5   points              82657 non-null  int64  
 6   price               77088 non-null  float64
 7   province            82622 non-null  object 
 8   region_1            69903 non-null  object 
 9   region_2            35949 non-null  object 
 10  winery              82657 non-null  object 
 11  variety             82657 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 7.6+ MB


In [8]:
# Full (untruncated) title of the first review.
df.loc[0, 'review_title']

'Andrew Peace 2007 Peace Family Vineyard Chardonnay (South Eastern Australia)'

In [9]:
# Full (untruncated) description text of the first review.
df.loc[0, 'review_description']

'Classic Chardonnay aromas of apple, pear and hay lead into a palate marked by decent intensity but also a bit of sweetness. Orange and candy notes run through the rather short finish.'

In [10]:
# Build the working corpus: one row per review, holding its title and description.
data = df[['review_title', 'review_description']].copy()
data.head()

Unnamed: 0,review_title,review_description
0,Andrew Peace 2007 Peace Family Vineyard Chardo...,"Classic Chardonnay aromas of apple, pear and h..."
1,North by Northwest 2014 Red (Columbia Valley (...,This wine is near equal parts Syrah and Merlot...
2,Renato Ratti 2007 Conca (Barolo),Barolo Conca opens with inky dark concentratio...
3,Domaine l'Ancienne Cure 2010 L'Abbaye White (B...,It's impressive what a small addition of Sauvi...
4,Château du Cèdre 2012 Le Cèdre Vintage Malbec ...,"This ripe, sweet wine is rich and full of drie..."


In [11]:
# Apply a first round of text cleaning techniques
import re
import string

# Patterns are written as raw strings (the original non-raw '\[' and '\w'
# literals are invalid escape sequences that newer Python versions warn about)
# and compiled once instead of being re-resolved on every call.
_SQUARE_BRACKETED = re.compile(r'\[.*?\]')
_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT = re.compile(r'\w*\d\w*')


def clean_text_round1(text):
    '''Apply the first round of text cleaning to a single string.

    Steps, in order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets (non-greedy);
      3. drop every character in ``string.punctuation``;
      4. drop every word that contains at least one digit.

    Removed spans are replaced by the empty string, so surrounding whitespace
    is left untouched (runs of spaces may remain).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()
    text = _SQUARE_BRACKETED.sub('', text)
    text = _PUNCTUATION.sub('', text)
    text = _WORD_WITH_DIGIT.sub('', text)
    return text


def round1(x):
    '''Element-wise wrapper around ``clean_text_round1`` for use with ``Series.apply``.'''
    return clean_text_round1(x)

In [12]:
# Replace the corpus with the round-1 cleaned descriptions and preview the result.
data = data['review_description'].apply(round1).to_frame()
data.head()

Unnamed: 0,review_description
0,classic chardonnay aromas of apple pear and ha...
1,this wine is near equal parts syrah and merlot...
2,barolo conca opens with inky dark concentratio...
3,its impressive what a small addition of sauvig...
4,this ripe sweet wine is rich and full of dried...


In [13]:
# Persist the cleaned corpus; the relative path resolves against the current working directory.
data.to_pickle("corpus.pkl")

In [14]:
# Score the polarity and subjectivity of each review with TextBlob.
# Terminal / Anaconda Navigator: conda install -c conda-forge textblob
from textblob import TextBlob


def pol(x):
    """Return the TextBlob polarity score of the text `x`."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob subjectivity score of the text `x`."""
    return TextBlob(x).sentiment.subjectivity


# Analyse every description once and read both scores from the same result;
# applying `pol` and `sub` separately would build and analyse a TextBlob twice
# per review.
review_sentiments = [TextBlob(text).sentiment for text in data['review_description']]
data['polarity'] = [score.polarity for score in review_sentiments]
data['subjectivity'] = [score.subjectivity for score in review_sentiments]
data

Unnamed: 0,review_description,polarity,subjectivity
0,classic chardonnay aromas of apple pear and ha...,0.108333,0.433333
1,this wine is near equal parts syrah and merlot...,-0.047917,0.322917
2,barolo conca opens with inky dark concentratio...,-0.204167,0.583333
3,its impressive what a small addition of sauvig...,0.416667,0.800000
4,this ripe sweet wine is rich and full of dried...,0.265476,0.623810
...,...,...,...
82652,hot earth baked cherry and mild vanilla aromas...,0.190556,0.577500
82653,pungent highpitched notes of turmeric and drie...,-0.050000,0.444444
82654,this is a wine thats not just lively and fruit...,0.278788,0.588889
82655,an impressive wine it already shows signs of i...,0.263889,0.755556


In [15]:
# Re-attach the titles, which were dropped when `data` was rebuilt from the
# cleaned descriptions only; the assignment aligns on the index shared with `df`.
data['review_title'] = df['review_title']
data.head()

Unnamed: 0,review_description,polarity,subjectivity,review_title
0,classic chardonnay aromas of apple pear and ha...,0.108333,0.433333,Andrew Peace 2007 Peace Family Vineyard Chardo...
1,this wine is near equal parts syrah and merlot...,-0.047917,0.322917,North by Northwest 2014 Red (Columbia Valley (...
2,barolo conca opens with inky dark concentratio...,-0.204167,0.583333,Renato Ratti 2007 Conca (Barolo)
3,its impressive what a small addition of sauvig...,0.416667,0.8,Domaine l'Ancienne Cure 2010 L'Abbaye White (B...
4,this ripe sweet wine is rich and full of dried...,0.265476,0.62381,Château du Cèdre 2012 Le Cèdre Vintage Malbec ...


In [20]:
def pos_or_neg(pol, x='Neutral'):
    """Turn a polarity score into a sentiment label.

    Parameters
    ----------
    pol : number
        Polarity score to classify.
    x : str, optional
        Label returned when `pol` is neither above nor below zero
        (defaults to 'Neutral').

    Returns
    -------
    str
        'Positive' if `pol` > 0, 'Negative' if `pol` < 0, otherwise `x`.
    """
    if pol > 0:
        return 'Positive'
    if pol < 0:
        return 'Negative'
    # Zero, or any value that compares neither greater nor less than zero.
    return x


def sentiment(x):
    """Label a single polarity score using the default neutral label."""
    return pos_or_neg(x)

In [21]:
# Label every polarity score as Positive / Negative / Neutral.
s = data['polarity'].apply(sentiment)
s

0        Positive
1        Negative
2        Negative
3        Positive
4        Positive
           ...   
82652    Positive
82653    Negative
82654    Positive
82655    Positive
82656    Positive
Name: polarity, Length: 82657, dtype: object

In [22]:
# Store the sentiment labels as a new column; `s` was derived from `data`, so the indexes match.
data['sentiment'] = s

In [23]:
# Preview the frame, now including the sentiment column.
data.head()

Unnamed: 0,review_description,polarity,subjectivity,review_title,sentiment
0,classic chardonnay aromas of apple pear and ha...,0.108333,0.433333,Andrew Peace 2007 Peace Family Vineyard Chardo...,Positive
1,this wine is near equal parts syrah and merlot...,-0.047917,0.322917,North by Northwest 2014 Red (Columbia Valley (...,Negative
2,barolo conca opens with inky dark concentratio...,-0.204167,0.583333,Renato Ratti 2007 Conca (Barolo),Negative
3,its impressive what a small addition of sauvig...,0.416667,0.8,Domaine l'Ancienne Cure 2010 L'Abbaye White (B...,Positive
4,this ripe sweet wine is rich and full of dried...,0.265476,0.62381,Château du Cèdre 2012 Le Cèdre Vintage Malbec ...,Positive
