## Machine learning project
This project is about the wine reviews dataset found on Kaggle.

Project team members: Annely Liivas, Velda Lauringson, Eduard Rudi

#### Importing packages

In [1]:
import pandas as pd

import gensim
from gensim.models import Word2Vec
from gensim.models import FastText
from autocorrect import Speller
spell = Speller(lang='en')

from statistics import fmean

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.utils import shuffle

#### Reading data

In [2]:
# Shared text-processing objects used by the preprocessing cells below.
lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer()

# Previously saved embedding models, loaded from the working directory.
# fastText_model is only referenced from commented-out code in the visible cells.
word2vec_model = gensim.models.Word2Vec.load('test_word2vec.model')
fastText_model = FastText.load('fastText.model')

# Kept as a set: the preprocessing loops only test membership
# (`w not in stop_words`) once per token, which is a constant-time lookup
# on a set instead of a linear scan over the stopword list.
stop_words = set(stopwords.words('english'))

In [3]:
data_loc = 'data/'

In [4]:
# Load the two review files from data_loc; index_col=0 uses each file's first
# column as the row index instead of keeping it as a data column.
data1 = pd.read_csv(data_loc + 'winemag-data_first150k.csv', index_col=0)
data2 = pd.read_csv(data_loc + 'winemag-data-130k-v2.csv', index_col=0)

In [5]:
# Stack the two review sets row-wise into a single frame.
# NOTE(review): if the two files share reviews, duplicates could end up on both
# sides of the train/test split — consider checking with drop_duplicates().
dataset = pd.concat([data1, data2])

In [6]:
dataset = shuffle(dataset, random_state=123)

In [7]:
dataset = dataset.reset_index(drop=True)
dataset

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,taster_name,taster_twitter_handle,title
0,US,"Sparkling wine is all about texture, and the m...",Brut,85,22.0,California,Russian River Valley,Sonoma,Sparkling Blend,River Road,,,
1,France,"This fruity wine, free from any oak aging, rev...",,85,15.0,Burgundy,Mâcon-Villages,,Chardonnay,Labouré-Roi,Roger Voss,@vossroger,Labouré-Roi 2015 Mâcon-Villages
2,US,"A rich, ripe and complex Cabernet, with classi...",Estate Bottled,90,35.0,California,Napa Valley,Napa,Cabernet Sauvignon,St. Supéry,,,
3,Italy,Here's a cheerful white blend from Tuscany (wi...,Fonte delle Donne,87,18.0,Tuscany,Toscana,,White Blend,Fattoria Fibbiano,,,
4,New Zealand,"This medium-weight, supple Pinot Noir is a bit...",Estate Series,87,24.0,Central Otago,,,Pinot Noir,Matua Valley,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
280896,Portugal,"Ripe, rich and creamy, this is a dense wine fr...",Reserva,88,12.0,Douro,,,Portuguese White,Borges,Roger Voss,@vossroger,Borges 2015 Reserva White (Douro)
280897,US,"Pretty much a copy of the '03, which I found t...",Monte Rosso Vineyard Gnarly Vine,84,50.0,California,Sonoma Valley,Sonoma,Zinfandel,Louis M. Martini,,,
280898,US,"The blend is Zinfandel, Syrah, Cabernet Sauvig...",Essential,83,11.0,California,California,California Other,Red Blend,Bogle,,,
280899,US,"The aromas of plum, blackberry, vanilla, dried...",,87,14.0,Washington,Columbia Valley (WA),Columbia Valley,Syrah,Waterbrook,Sean P. Sullivan,@wawinereport,Waterbrook 2014 Syrah (Columbia Valley (WA))


In [8]:
# Positional split of the shuffled data: the first 205000 rows become the
# training set, every remaining row goes to the test set.
train = dataset.iloc[:205000]
test = dataset.iloc[205000:]

In [9]:
# Reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [10]:
# For testing
# train = train.iloc[:10000]
# test = test.iloc[:2500]

In [11]:
# Take a look at what the training split contains
train.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,taster_name,taster_twitter_handle,title
0,US,"Sparkling wine is all about texture, and the m...",Brut,85,22.0,California,Russian River Valley,Sonoma,Sparkling Blend,River Road,,,
1,France,"This fruity wine, free from any oak aging, rev...",,85,15.0,Burgundy,Mâcon-Villages,,Chardonnay,Labouré-Roi,Roger Voss,@vossroger,Labouré-Roi 2015 Mâcon-Villages
2,US,"A rich, ripe and complex Cabernet, with classi...",Estate Bottled,90,35.0,California,Napa Valley,Napa,Cabernet Sauvignon,St. Supéry,,,
3,Italy,Here's a cheerful white blend from Tuscany (wi...,Fonte delle Donne,87,18.0,Tuscany,Toscana,,White Blend,Fattoria Fibbiano,,,
4,New Zealand,"This medium-weight, supple Pinot Noir is a bit...",Estate Series,87,24.0,Central Otago,,,Pinot Noir,Matua Valley,,,


#### Preprocessing train description

In [12]:
reviews_lower = [x.lower() for x in train.description]

In [13]:
# Replace every non-letter character with a space, then collapse each run of
# whitespace into a single space.
cleaned_reviews = [
    re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', review))
    for review in reviews_lower
]

In [14]:
# Split each cleaned review into a list of sentences.
# NOTE(review): cleaned_reviews already had every non-letter (including
# sentence punctuation) replaced by spaces, so presumably each review comes
# back as at most one sentence here — confirm whether this step is still needed.
reviews = [nltk.sent_tokenize(x) for x in cleaned_reviews]

In [15]:
# Collapse each review's sentence list back into ONE string per review.
# Flattening all sentences into a single list (the previous approach) only
# keeps the list aligned with the DataFrame rows when every review yields
# exactly one sentence; a review yielding zero or several sentences would
# shift every later score onto the wrong row. Joining per review keeps the
# one-entry-per-row invariant and gives the same result in the
# one-sentence case.
reviews = [' '.join(sentences) for sentences in reviews]

In [16]:
words = [nltk.word_tokenize(x) for x in reviews]

In [17]:
# Drop stopwords, lemmatize the remaining tokens and keep only lemmas longer
# than one character. Each token is lemmatized once (the inner generator)
# rather than once for the length check and again for the stored value.
words = [
    [
        lemma
        for lemma in (
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        )
        if len(lemma) > 1
    ]
    for tokens in words
]

In [18]:
# Fit the TF-IDF vectorizer on the training reviews (tokens re-joined into
# space-separated strings). The fitted vectorizer's idf_ weights are read in
# the next cell; X itself is not used by the other visible cells.
X = vectorizer.fit_transform([' '.join(x) for x in words])

In [19]:
# Map every vocabulary term to its IDF weight from the fitted vectorizer.
# (Despite the name, `freq` holds inverse-document-frequency weights.)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
freq = dict(zip(feature_names, vectorizer.idf_))



In [20]:
# Sanity check on a single word: scale the 'wine' embedding vector by its
# IDF weight and average the components into one scalar.
fmean(word2vec_model.wv['wine'] * freq['wine'])

-0.12229394380003214

In [21]:
# Score every training review: each word contributes the mean of its
# word2vec vector scaled by the word's IDF weight, and the review score is
# the mean of those per-word values.
desc_score = []
vocab = word2vec_model.wv.key_to_index  # dict, so membership is a hash lookup
for desc in words:
    # Words missing from either the embedding vocabulary or the TF-IDF
    # vocabulary are skipped explicitly; this replaces a bare `except:`
    # that also hid unrelated errors.
    word_score = [
        fmean(word2vec_model.wv[w] * freq[w])
        for w in desc
        if w in vocab and w in freq
    ]
    # fmean() raises StatisticsError on an empty sequence, so a review with
    # no scorable word gets NaN; the later fillna() imputation cell fills it.
    desc_score.append(fmean(word_score) if word_score else float('nan'))

In [22]:
train['desc_score'] = desc_score

In [23]:
train.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,taster_name,taster_twitter_handle,title,desc_score
0,US,"Sparkling wine is all about texture, and the m...",Brut,85,22.0,California,Russian River Valley,Sonoma,Sparkling Blend,River Road,,,,0.186345
1,France,"This fruity wine, free from any oak aging, rev...",,85,15.0,Burgundy,Mâcon-Villages,,Chardonnay,Labouré-Roi,Roger Voss,@vossroger,Labouré-Roi 2015 Mâcon-Villages,-0.28678
2,US,"A rich, ripe and complex Cabernet, with classi...",Estate Bottled,90,35.0,California,Napa Valley,Napa,Cabernet Sauvignon,St. Supéry,,,,0.099817
3,Italy,Here's a cheerful white blend from Tuscany (wi...,Fonte delle Donne,87,18.0,Tuscany,Toscana,,White Blend,Fattoria Fibbiano,,,,-0.207998
4,New Zealand,"This medium-weight, supple Pinot Noir is a bit...",Estate Series,87,24.0,Central Otago,,,Pinot Noir,Matua Valley,,,,-0.122666


#### Preprocessing test description

In [24]:
reviews_lower = [x.lower() for x in test.description]

In [25]:
# Replace every non-letter character with a space, then collapse each run of
# whitespace into a single space.
cleaned_reviews = [
    re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', review))
    for review in reviews_lower
]

In [26]:
# Sentence-split each cleaned review and join the pieces back into ONE string
# per review. Flattening all sentences into a single list (the previous
# approach) only stays aligned with the test rows when every review yields
# exactly one sentence; joining per review keeps one entry per row in every
# case and gives the same result in the one-sentence case.
reviews = [' '.join(nltk.sent_tokenize(x)) for x in cleaned_reviews]

In [27]:
words = [nltk.word_tokenize(x) for x in reviews]

In [28]:
# Drop stopwords, lemmatize the remaining tokens and keep only lemmas longer
# than one character. Each token is lemmatized once (the inner generator)
# rather than once for the length check and again for the stored value.
words = [
    [
        lemma
        for lemma in (
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        )
        if len(lemma) > 1
    ]
    for tokens in words
]

In [29]:
# Score every test review the same way as the training reviews: each word
# contributes the mean of its word2vec vector scaled by the word's IDF
# weight, and the review score is the mean of those per-word values.
desc_score = []
vocab = set(word2vec_model.wv.key_to_index.keys())
for desc in words:
    word_score = []
    for w in desc:
        token = w
        if token not in vocab or token not in freq:
            # Unknown word: try its spell-corrected form. spell() is called
            # once per unknown word and the result reused, instead of being
            # recomputed for every lookup.
            token = spell(w)
            if token not in vocab or token not in freq:
                continue  # still unknown, so the word contributes nothing
        word_score.append(fmean(word2vec_model.wv[token] * freq[token]))
    # fmean() raises StatisticsError on an empty sequence, so a review with
    # no scorable word gets NaN; the later fillna() imputation cell fills it.
    desc_score.append(fmean(word_score) if word_score else float('nan'))

In [30]:
test['desc_score'] = desc_score

In [31]:
test.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,taster_name,taster_twitter_handle,title,desc_score
0,US,"This is straightforward and easy, with sweet M...",,85,16.0,California,Napa Valley,Napa,Sauvignon Blanc,Lola Kay,,,Lola Kay 2011 Sauvignon Blanc (Napa Valley),0.226332
1,Italy,"Upfront and generous, this is a bright and foo...",Donia,86,18.0,Piedmont,Barbera d'Alba,,Barbera,Ponchione Maurizio,,,Ponchione Maurizio 2007 Donia (Barbera d'Alba),-0.117231
2,Italy,"Robust but informal, this opens with aromas of...",Rosso,88,12.0,Southern Italy,Cirò,,Gaglioppo,Librandi,Kerin O’Keefe,@kerinokeefe,Librandi 2013 Rosso (Cirò),-0.129068
3,Spain,"Dusty berry, resin and bell-pepper aromas sugg...",Crianza,89,20.0,Northern Spain,Ribera del Duero,,Tempranillo,Tamaral,,,,0.001262
4,Austria,"Beautiful crushed red cherry and lifted, polis...",Vom Kiesel,92,,Burgenland,,,St. Laurent,Strehn,,,,-0.365299


## Encoding columns

In [32]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler

In [33]:
# Ordinal encoder for the categorical columns; categories that were not seen
# during fit are encoded as -1 at transform time.
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# Earlier column selection, which also included 'designation':
# categories = ['country', 'variety', 'designation', 'province', 'winery']
categories = ['country', 'variety', 'province', 'winery']

In [34]:
enc.fit(train[categories])

In [35]:
transformed = enc.transform(train[categories])

In [36]:
transformed.shape

(205000, 4)

In [37]:
enc_train = train.copy()

In [38]:
enc_data = pd.DataFrame(transformed, index=enc_train.index, columns=categories)

In [39]:
enc_data

Unnamed: 0,country,variety,province,winery
0,46.0,573.0,53.0,14392.0
1,16.0,128.0,49.0,10422.0
2,46.0,80.0,53.0,15599.0
3,23.0,713.0,423.0,7738.0
4,33.0,458.0,71.0,11734.0
...,...,...,...,...
204995,46.0,128.0,53.0,14861.0
204996,46.0,582.0,463.0,11511.0
204997,46.0,582.0,463.0,12457.0
204998,46.0,128.0,282.0,17695.0


In [40]:
enc_train = pd.concat([enc_train[['price', 'desc_score', 'points']], enc_data], axis=1)

In [41]:
enc_train.head()

Unnamed: 0,price,desc_score,points,country,variety,province,winery
0,22.0,0.186345,85,46.0,573.0,53.0,14392.0
1,15.0,-0.28678,85,16.0,128.0,49.0,10422.0
2,35.0,0.099817,90,46.0,80.0,53.0,15599.0
3,18.0,-0.207998,87,23.0,713.0,423.0,7738.0
4,24.0,-0.122666,87,33.0,458.0,71.0,11734.0


In [42]:
# Impute missing values column by column with that column's mean, truncated
# to an integer.
# NOTE(review): int() truncates toward zero, so float columns such as
# desc_score and price receive a whole-number fill value — confirm this is
# intended rather than filling with the exact mean.
enc_train = enc_train.apply(lambda x: x.fillna(int(x.mean())),axis=0)

In [43]:
test_transformed = enc.transform(test[categories])

In [44]:
enc_test = test.copy()

In [45]:
test_enc_data = pd.DataFrame(test_transformed, index=test.index, columns=categories)

In [46]:
enc_test = pd.concat([enc_test[['price', 'desc_score', 'points']], test_enc_data], axis=1)

In [47]:
# Impute missing values column by column with that column's mean, truncated
# to an integer.
# NOTE(review): the fill values come from the test set's own column means,
# not the training set's — confirm this is intended. int() also truncates
# toward zero, so float columns receive a whole-number fill value.
enc_test = enc_test.apply(lambda x: x.fillna(int(x.mean())),axis=0)

In [64]:
# Persist the encoded training features next to the raw data. The path is
# built from data_loc (as the read_csv cell does) so the data directory is
# configured in one place.
enc_train.to_csv(data_loc + 'enc_train.csv', index=False)

In [63]:
# Persist the encoded test features next to the raw data. The path is built
# from data_loc (as the read_csv cell does) so the data directory is
# configured in one place.
enc_test.to_csv(data_loc + 'enc_test.csv', index=False)