In [14]:
%config IPCompleter.greedy=True
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder,OneHotEncoder,LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer

# Apply seaborn's global plotting defaults, using the "darkgrid" style.
sns.set(style="darkgrid")

In [15]:
# Load the raw reviews; the first CSV column is used as the row index.
dataset = "winemag-data-130k-v2.csv"

# Columns that are discarded straight after loading.
unused_columns = ['designation', 'winery', 'region_2', 'taster_name',
                  'taster_twitter_handle', 'title']
# A row is kept only if none of these fields is missing.
required_columns = ['country', 'description', 'points', 'price', 'province']

df = (
    pd.read_csv(dataset, sep=',', index_col=0)
    .drop(columns=unused_columns)
    # keep=False removes *every* row whose description occurs more than once,
    # including the first occurrence.
    .drop_duplicates(subset='description', keep=False)
    .dropna(subset=required_columns)
)

# Keep only the wines priced within [min_price, max_price].
min_price = 5
max_price = 120

price_in_range = df['price'].between(min_price, max_price)
df = df.loc[price_in_range]

# Normalise the categorical text columns: strip out spaces, then lower-case,
# so every value becomes a single lower-case token.
for text_column in ('variety', 'country', 'province', 'region_1'):
    df[text_column] = df[text_column].str.replace(" ", "").str.lower()

def _keep_frequent(frame, column, min_count):
    """Return the rows of ``frame`` whose ``column`` value occurs more than ``min_count`` times.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter; it is not modified.
    column : str
        Name of the categorical column whose value frequencies are counted.
    min_count : int
        Exclusive lower bound: a value is kept only if it appears strictly
        more than ``min_count`` times in ``frame``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``frame`` (original index preserved) with frequent values.

    Notes
    -----
    Rows where ``column`` is missing are dropped too: ``value_counts`` does not
    count NaN, so NaN is never among the kept values.
    """
    counts = frame[column].value_counts(sort=False)
    frequent_values = counts[counts > min_count].index
    return frame[frame[column].isin(frequent_values)]


# (column, exclusive minimum number of occurrences). The order matters: each
# filter counts occurrences in the rows that survived the previous filters.
min_occurrences = [
    ('variety', 120),
    ('country', 500),
    ('province', 250),
    ('region_1', 40),
]

for column, min_count in min_occurrences:
    df = _keep_frequent(df, column, min_count)

# Replace the free-text review by one numeric feature: its length in characters.
description_length = df['description'].map(len)
df = df.assign(description_length=description_length).drop(columns='description')

# Categorical columns that are expanded into one indicator column per value.
labelColumns = ['country', 'province', 'variety', 'region_1']

# One call encodes every listed column, names the indicators "<column>_<value>",
# removes the source columns and appends the indicators after the remaining
# columns in the order given above.
# The dtype is pinned so the indicators are always 0/1: without it,
# pandas >= 2.0 produces booleans, which would be exported as True/False.
df = pd.get_dummies(df, columns=labelColumns, dtype=np.uint8)

In [16]:
# Number of rows left after preprocessing.
df.shape[0]

69980

In [17]:
# Preview the first five rows of the encoded table.
df.head(n=5)

Unnamed: 0,points,price,description_length,country_argentina,country_australia,country_france,country_italy,country_spain,country_us,province_alsace,...,region_1_volnay,region_1_vouvray,region_1_wahlukeslope,region_1_wallawallavalley(or),region_1_wallawallavalley(wa),region_1_washington,region_1_willamettevalley,region_1_yakimavalley,region_1_yorkvillehighlands,region_1_yountville
2,87,14.0,186,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,87,65.0,249,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
7,87,24.0,122,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
18,87,28.0,275,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
19,87,32.0,315,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Write the preprocessed table to disk; the row index is not written.
df.to_csv(path_or_buf='preprocessed.csv', index=False)