## HW4 Task2 

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures,scale
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, LinearRegression,Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer,IterativeImputer
from category_encoders.target_encoder import TargetEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import re
from nltk.corpus import stopwords
from gensim.corpora import WikiCorpus
from gensim.models import word2vec
import spacy

### Data Cleansing

In [3]:
path1 = "/Users/racheltan/Desktop/QMSS/AML/winemag-data-130k-v2.csv"
path2 = "/Users/ellen/Documents/GitHub/assignment-4-rachel_ellen/wine-reviews/winemag-data-130k-v2.csv"

wine_raw = pd.read_csv(path2, index_col = 0).drop(['taster_name', 'taster_twitter_handle'], axis = 1)

In [4]:

wine_usa = wine_raw[wine_raw.country == 'US']
wine_usa_skew = wine_usa[wine_usa.price < 250].drop_duplicates('description') #using wine only from US, remove duplicate rows
wine_us = wine_usa_skew.sample(n = 20000, random_state = 123) #subsample 20k rows 


In [5]:

wine_us=wine_us.reset_index()

 Use a pretrained word-embedding (word2vec, glove or fasttext) for featurization instead of the
bag-of-words model. Does this improve classification? How about combining the embedded
words with the BoW model?

In [6]:
wine_us["description_punc"]=0
wine_us["title_punc"]=0

for i in range(len(wine_us)):
    wine_us.loc[i,"description_punc"] = re.sub("[^a-zA-Z']+", ' ', wine_us['description'][i]) 
    wine_us.loc[i,"title_punc"] = re.sub("[^a-zA-Z']+", ' ', wine_us['description'][i])
    wine_us.loc[i,"description_punc"] = wine_us.loc[i,"description_punc"].lower()
    wine_us.loc[i,"title_punc"] = wine_us.loc[i,"title_punc"].lower()


In [7]:
wine_us["description_punc"].head()

0    this is a relatively thick and dense wine grip...
1    a routine although entirely drinkable merlot i...
2    tremendously rich and oaky with butterscotch i...
3    white pepper dominates the smell and the flavo...
4    this yakima valley bottling highlights fresh h...
Name: description_punc, dtype: object

### Using word2vec

In [8]:
X_full = wine_us[['description','description_punc','designation', 'price', 'province', 'region_1', 'region_2','variety','winery', 'title',"title_punc"]] #take out title 
y_full = wine_us['points']
X_full_train, X_full_test, y_full_train, y_full_test = train_test_split(X_full,y_full,test_size=0.2,random_state=30)
X_full_train = X_full_train.reset_index().drop('index',axis=1)
y_full_train = y_full_train.reset_index().drop('index',axis=1)
X_full_test = X_full_test.reset_index().drop('index',axis=1)
y_full_test = y_full_test.reset_index().drop('index',axis=1)

In [9]:
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner"])
docs_train = [nlp(d).vector for d in X_full_train["description_punc"]]
X_train = np.vstack(docs_train)
X_train.shape

(16000, 300)

In [10]:
#train
Ridge_w2v = Ridge().fit(X_train, y_full_train)
Ridge_w2v.score(X_train, y_full_train)

0.5528940232729389

In [11]:
#test
docs_val = [nlp(d).vector for d in X_full_test["description_punc"]]
X_val = np.vstack(docs_val)
Ridge_w2v.score(X_val, y_full_test)

0.5280946595663794

When we combined description and title:

In [12]:
title_train = [nlp(d).vector for d in X_full_train["title_punc"]]
X_title = np.vstack(title_train)
X_train2 = np.hstack((X_train,X_title))
X_train2.shape


(16000, 600)

In [13]:
#train
Ridge_w2v = Ridge().fit(X_train2, y_full_train)
Ridge_w2v.score(X_train2, y_full_train)

0.5538889576141489

In [14]:
#test
title_val = [nlp(d).vector for d in X_full_test["title_punc"]]
X_titleval = np.vstack(title_val)
X_val2 = np.hstack((X_val,X_titleval))

Ridge_w2v.score(X_val2, y_full_test)

0.5278713969428191

We reached a score 0.5529 for train data and 0.528 for test data. When we add both description and title, the score hasn't changed much.

### Combine the embedded words and the BOW words together

In [15]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
my_stopwords = set(ENGLISH_STOP_WORDS)
my_stopwords.remove("well")
my_stopwords.remove("not")
my_stopwords.add("ve")

In [27]:
title = [nlp(d).vector for d in X_full["title_punc"]]
description = [nlp(d).vector for d in X_full["description_punc"]]

X_Tfidf = np.hstack((title,description))
X_Tfidf = pd.DataFrame(X_train_Tfidf)

In [29]:
X_Tfidf['description_bow'] = X_full["description_punc"]
X_Tfidf['title_bow'] = X_full["title_punc"]

In [30]:
X_tfi_train, X_tfi_test, y_tfi_train, y_tfi_test = train_test_split(X_Tfidf,y_full,test_size=0.2,random_state=30)

In [31]:
#using characters instead of words 

tfidf_vect2 = TfidfVectorizer(stop_words = my_stopwords, ngram_range = (2,5), min_df = 3, analyzer="char")
preprocess2 = make_column_transformer(    
    (tfidf_vect2, 'description_bow'), 
    (tfidf_vect2, 'title_bow'))

tfidf_ridge2 = make_pipeline(preprocess2, Ridge(alpha = 1.0))
score_tfidf_ridge2 = cross_val_score(tfidf_ridge2, X_tfi_train, y_tfi_train)
np.mean(score_tfidf_ridge2) 


0.693898121413947

In this part we combine the BOW words with embedded word together. The score goes up to 0.69

### Original BOW with re text prepoccessing 

In [None]:
X_full = wine_us[['description','description_punc','designation', 'price', 'province', 'region_1', 'region_2','variety','winery', 'title',"title_punc"]] #take out title 
y_full = wine_us['points']
X_full_train, X_full_test, y_full_train, y_full_test = train_test_split(X_full,y_full,test_size=0.2,random_state=30)
X_full_train = X_full_train.reset_index().drop('index',axis=1)
y_full_train = y_full_train.reset_index().drop('index',axis=1)
X_full_test = X_full_test.reset_index().drop('index',axis=1)
y_full_test = y_full_test.reset_index().drop('index',axis=1)

In [51]:
tfidf_vect = TfidfVectorizer(stop_words = my_stopwords, ngram_range = (2,5), min_df = 2, analyzer="char") #tfidf vectorizer is the same as count vec + tfidf transformer


In [52]:
X_bow_train = tfidf_vect.fit_transform(X_full_train['description_punc'])


In [66]:
ridge3 = Ridge(alpha = 1.0).fit(X_bow_train, y_full_train)
ridge3.score(X_bow_train, y_full_train)

0.8431906240825124

with the help of re prepocessing, the bow score goes up to 0.84