In [None]:
# Adapted from https://www.kaggle.com/amukho33/exploring-wine-reviews

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)


In [None]:
# Read the review table, using the first CSV column as the row index.
wine_csv_path = '/kaggle/input/wine-reviews/winemag-data-130k-v2.csv'
winedata = pd.read_csv(wine_csv_path, index_col=0)

# Summary statistics, transposed so that each described column becomes a row.
winedata.describe().transpose()

In [None]:
# Cleanup

# Keep only rows that have both a price and a variety. The explicit copy makes
# the filtered frame independent of the original, so adding a column below does
# not raise SettingWithCopyWarning.
winedata = winedata.dropna(subset=["price", "variety"]).copy()

# Feature extraction

# Add a year column: the first run of four digits found in the title, kept as a
# string (NaN when the title contains none). expand=False makes str.extract
# return a Series instead of a one-column DataFrame.
# NOTE(review): the first four-digit run is presumably the vintage, but a number
# in the winery or wine name would match as well -- confirm against the data.
winedata["year"] = winedata["title"].str.extract(r'(\d{4})', expand=False)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
%matplotlib inline

winedata.plot(kind="scatter", x="points", y="price")

In [None]:
# Number of rows (reviews) per wine variety, most frequent first.
value_counts = winedata.loc[:, "variety"].value_counts()
# Show the five most frequent varieties.
value_counts.iloc[:5]

In [None]:
# Wine reviews by country
# Turn off pandas' chained-assignment warning and allow up to 999 columns to be
# displayed.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", 999)

# Bar chart with one bar per country, showing how many rows mention it.
fig, ax = plt.subplots(figsize=(20, 7))
sns.countplot(x="country", data=winedata, ax=ax)
ax.set_ylabel("Review Count", fontsize=12)
ax.set_xlabel("Country", fontsize=12)
plt.xticks(rotation=90)
ax.set_title("Count of Reviews by country", fontsize=15)
plt.show()

In [None]:
# Per-country row counts as a one-column frame, followed by summary statistics
# of those counts.
ReviewCountbyCountry = winedata["country"].value_counts().to_frame()
ReviewCountbyCountry.describe().transpose()

In [None]:
# Restrict the data to the countries listed below.
# NOTE(review): this list was labelled "Top 12" but names 11 countries -- check
# it against winedata["country"].value_counts() in case one was left out.
country_list = ['US','Italy','France','Spain','Chile','Argentina','Portugal','Australia','New Zealand','Germany','South Africa']
# .copy() makes sub_country an independent frame, so columns added to it later
# do not trigger SettingWithCopyWarning and do not rely on the warning being
# switched off globally.
sub_country = winedata[winedata['country'].isin(country_list)].copy()

# Violin plot of price for each of the selected countries.
plt.figure(figsize=(40,12))
sns.set_context("paper", font_scale=2.5)
sns.violinplot(x="country", y="price", data=sub_country, inner=None)

In [None]:
# Look at the most expensive wine(s). The maximum price is taken from the data
# rather than hard-coded, so the cell stays correct if the data or the country
# selection changes. Every row tied at the maximum is shown.
sub_country[sub_country['price'] == sub_country['price'].max()]

In [None]:
# Select the rows whose variety is 'Bordeaux-style Red Blend' and summarise them.
is_bordeaux_blend = sub_country['variety'].eq('Bordeaux-style Red Blend')
Bordeaux_style_redblend = sub_country.loc[is_bordeaux_blend]
Bordeaux_style_redblend.describe().transpose()

In [None]:
# Show the first five rows of the Bordeaux-style red blend subset.
Bordeaux_style_redblend.iloc[:5]


In [None]:
#Encoding the labels 
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the variety names and map every row's variety to its
# integer code in a single step; label_encoder stays fitted for later use.
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(sub_country['variety'])

# assign() returns a new frame instead of writing a column into sub_country in
# place, so this cell is safe to re-run and cannot raise SettingWithCopyWarning
# even when sub_country was produced by filtering another frame.
sub_country = sub_country.assign(encoded_winevariety=label_encoded_y)
sub_country.head()

In [None]:
# Fit a TF-IDF vocabulary on the review descriptions: English stop words are
# removed, tokens are runs of word characters, and the vocabulary is capped at
# 100 features (max_features) drawn from terms that pass the min_df=5 threshold.
tfidf = TfidfVectorizer(
    min_df=5, max_features=100, strip_accents='unicode', lowercase=True,
    analyzer='word', token_pattern=r'\w+', use_idf=True,
    smooth_idf=True, sublinear_tf=True, stop_words='english').fit(sub_country["description"])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it. tolist() keeps the printed
# output a plain list of Python strings, as before.
if hasattr(tfidf, "get_feature_names_out"):
    features = tfidf.get_feature_names_out().tolist()
else:
    features = tfidf.get_feature_names()
print(features)

In [None]:
# Dense TF-IDF matrix of the descriptions, one row per row of sub_country,
# wrapped in a frame with a fresh 0..n-1 index.
X_tfidf_text = tfidf.transform(sub_country["description"])
subdata_2 = pd.DataFrame(X_tfidf_text.toarray())

# Attach the labels by position. to_numpy() strips sub_country's index, so the
# values line up row-for-row with the TF-IDF matrix without having to reset
# sub_country's index. The previous reset_index() reassignment changed
# sub_country on every run (adding 'index', then 'level_0', then failing), which
# made this cell unsafe to re-run.
subdata_2['encoded_winevariety'] = sub_country['encoded_winevariety'].to_numpy()
# Also adding variety for better readability
subdata_2['variety'] = sub_country['variety'].to_numpy()


# Split configuration: fixed seed for a repeatable split, 20% of rows held out.
seed = 7
test_size = 0.2

# Target is the encoded variety; features are every remaining column once both
# label columns are removed.
label_columns = ['encoded_winevariety', 'variety']
y = subdata_2['encoded_winevariety']
X = subdata_2.drop(columns=label_columns)

# NOTE(review): the split is not stratified, so a rare variety may end up only
# in the test fold; some XGBoost releases may reject training labels that are
# not contiguous -- confirm against the installed version.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)

# Build the model that will be fit on the training data.
import xgboost as xgb
clf = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)

In [None]:
# Train the classifier on the training split.
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every row of the held-out test features; the classifier
# was trained on the encoded variety codes in y_train.
y_pred = clf.predict(X_test)

In [None]:
#Measuring accuracy
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred); pass the arguments in that order. The
# score itself is unchanged because accuracy is symmetric, but the call now
# matches the documented signature and the other sklearn metrics.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))