In [1]:
import numpy as np
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, GridSearchCV, KFold, cross_validate
from sklearn.linear_model import Ridge, LogisticRegression, LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.dummy import DummyClassifier
from xgboost import XGBRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

In [2]:
wine_df = pd.read_csv('winemag-data-130k-v2.csv', index_col = 0)
wine_df

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss


In [3]:
wine_df.points.unique()

array([ 87,  86,  85,  88,  92,  91,  90,  89,  83,  82,  81,  80, 100,
        98,  97,  96,  95,  93,  94,  84,  99], dtype=int64)

In [4]:
wine_df.isna().sum()

country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [5]:
# wine_df[['region_1','region_2']]
x = zip(wine_df.region_1.tolist(), wine_df.region_2.tolist())
for i in list(x):
    print(i)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




("Montepulciano d'Abruzzo", nan)
(nan, nan)
('Juliénas', nan)
('Haut-Médoc', nan)
('Bordeaux Blanc', nan)
('Sauternes', nan)
('Margaux', nan)
('Lalande de Pomerol', nan)
(nan, nan)
('Santa Ynez Valley', 'Central Coast')
('Santa Clara Valley', 'Central Coast')
('Washington', 'Washington Other')
('Horse Heaven Hills', 'Columbia Valley')
('Dundee Hills', 'Willamette Valley')
('Mendoza', nan)
('Mendoza', nan)
(nan, nan)
('Mendocino', nan)
(nan, nan)
('Morgon', nan)
('Willamette Valley', 'Willamette Valley')
(nan, nan)
(nan, nan)
(nan, nan)
('Western Australia', nan)
('Valtellina Superiore', nan)
('Paso Robles', 'Central Coast')
('Côtes de Provence', nan)
(nan, nan)
(nan, nan)
('Russian River Valley', 'Sonoma')
('Dry Creek Valley', 'Sonoma')
(nan, nan)
(nan, nan)
('Templeton Gap District', 'Central Coast')
('Paso Robles', 'Central Coast')
('Horse Heaven Hills', 'Columbia Valley')
(nan, nan)
(nan, nan)
('Napa Valley', 'Napa')
('Côtes de Provence', nan)
('Columbia Valley (WA)', 'Columbia Val

In [6]:
drop_columns = ['region_2','taster_twitter_handle','taster_name','region_1']
wine_df.drop(columns = drop_columns, inplace = True)
wine_df

Unnamed: 0,country,description,designation,points,price,province,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss


In [7]:
wine_df.price = wine_df.price.fillna(wine_df.price.mean())

In [8]:
drop_data = ['country','designation','province','variety']
for i in wine_df:
    wine_df = wine_df[wine_df[i].notna()]

In [9]:
print('total null value in all columns')
wine_df.isna().sum()

total null value in all columns


country        0
description    0
designation    0
points         0
price          0
province       0
title          0
variety        0
winery         0
dtype: int64

In [10]:
print("all columns unique elements")
for i in wine_df.columns:
    print(i, wine_df[i].nunique())

all columns unique elements
country 42
description 85379
designation 37954
points 21
price 331
province 406
title 84502
variety 641
winery 12538


In [11]:
ordinal_col = ['country','province','variety']
wine_df[ordinal_col] = wine_df[ordinal_col].apply(LabelEncoder().fit_transform)

In [12]:
wine_df

Unnamed: 0,country,description,designation,points,price,province,title,variety,winery
0,22,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,35.363389,316,Nicosia 2013 Vulkà Bianco (Etna),627,Nicosia
1,31,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.000000,104,Quinta dos Avidagos 2011 Avidagos Red (Douro),407,Quinta dos Avidagos
3,39,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.000000,208,St. Julian 2013 Reserve Late Harvest Riesling ...,434,St. Julian
4,39,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.000000,256,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,398,Sweet Cheeks
5,36,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.000000,250,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,538,Tandem
...,...,...,...,...,...,...,...,...,...
129964,15,"Initially quite muted, this wine slowly develo...",Domaine Saint-Rémy Herrenweg,90,35.363389,11,Domaine Ehrhart 2013 Domaine Saint-Rémy Herren...,194,Domaine Ehrhart
129965,15,"While it's rich, this beautiful dry wine also ...",Seppi Landmann Vallée Noble,90,28.000000,11,Domaine Rieflé-Landmann 2013 Seppi Landmann Va...,394,Domaine Rieflé-Landmann
129966,17,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.000000,222,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,434,Dr. H. Thanisch (Erben Müller-Burggraef)
129968,15,Well-drained gravel soil gives this wine its c...,Kritt,90,30.000000,11,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,194,Domaine Gresser


In [13]:
# numeric_data = wine_df[['country','price','province','variety','points']]
numeric_data = wine_df[['country','price','province','variety','points']]
x = numeric_data.drop(columns = 'points')
y = numeric_data['points']

In [14]:
linear = LinearRegression()
score = cross_val_score(linear, x, y, cv = 5).mean()
print(f'with only numeric data score is {score}')

with only numeric data score is 0.19459515770582686


In [15]:
CountVec = CountVectorizer(stop_words='english')
Count_data = CountVec.fit_transform(wine_df.description)

# cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names())
print()




In [16]:
x = Count_data
y = numeric_data['points']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.80)

In [18]:
linear = LinearRegression()
linear.fit(X_train, y_train)
linear_score = linear.score(X_test, y_test)
print(f'with Count Vectorizer with not ngram hyperParameter score is {linear_score}')

with Count Vectorizer with not ngram hyperParameter score is 0.5594071496265465


In [19]:
Tf_Vec = CountVectorizer(stop_words='english', ngram_range=(1, 3))
Tf_data = CountVec.fit_transform(wine_df.description)

In [20]:
x = Tf_data
y = numeric_data['points']
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.80)

In [21]:
regulized_linear_model = {
    'liner': LinearRegression(),
    'ridge': Ridge(),
    'Lasso': Lasso()
}

In [22]:
model_score = {}
for i in regulized_linear_model:
    linear = regulized_linear_model[i]
    linear.fit(X_train, y_train)
    model_score[i] = linear.score(X_test, y_test)
print(print(f'CountVectorizer with ngram hyperParameter score is \n{model_score}'))

CountVectorizer with ngram hyperParameter score is 
{'liner': 0.5694240215359093, 'ridge': 0.6734025397737551, 'Lasso': -6.670501608851964e-07}
None


In [23]:
tf_vec = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 3))
Tf_vsc = CountVec.fit_transform(wine_df.description)

In [24]:
x = Tf_vsc
y = numeric_data['points']
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.80)

In [25]:
tf_score = {}
for i in regulized_linear_model:
    linear = regulized_linear_model[i]
    linear.fit(X_train, y_train)
    tf_score[i] = linear.score(X_test, y_test)
print(print(f'TfidfVectorizer with ngram hyperParameter score linearmodels score is \n{tf_score}'))

TfidfVectorizer with ngram hyperParameter score linearmodels score is 
{'liner': 0.5566794115838426, 'ridge': 0.6686426052717533, 'Lasso': -0.0006866869085992988}
None


In [26]:
main_df = wine_df[['country', 'price', 'province', 'variety','description']]
main_points = wine_df['points']

In [27]:
tf_vec = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 3))
main_df = CountVec.fit_transform(wine_df.description)

In [28]:
x = main_df
y = main_points
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.80)

In [29]:
main_model_score = {}
for i in regulized_linear_model:
    linear = regulized_linear_model[i]
    linear.fit(X_train, y_train)
    main_model_score[i] = linear.score(X_test, y_test)
print(f'all numeric data + description columns\nTfidfVectorizer with ngram hyperParameter score in linear models score is \n{main_model_score}')

all numeric data + description columns
TfidfVectorizer with ngram hyperParameter score in linear models score is 
{'liner': 0.5467624712347319, 'ridge': 0.6663270373155953, 'Lasso': -7.558444930122477e-05}


# TASK 2

In [30]:
main_df = wine_df[['country', 'price', 'province', 'variety','description']]
main_points = wine_df['points']

In [31]:
word2Vec = Word2Vec(main_df)
# main_df = word2Vec.fit_transform(main_df['description'])
word2Vec

<gensim.models.word2vec.Word2Vec at 0x13dddcb1e80>

In [32]:
df = pd.DataFrame(word2Vec)
df

ValueError: DataFrame constructor not properly called!

In [None]:
x = word2Vec
y = main_points
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.80)

In [None]:
linear = LinearRegression()
linear.fit(X_train, y_train)
linear_score = linear.score(X_test, y_test)
linear_score