In [897]:
import pandas as pd
import seaborn as sns
import numpy as np
# Load the labelled training set (used to fit the model) and the test set
# (scored further down to produce the predictions file).
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Stack train on top of test so that text cleaning and categorical encoding
# are applied to both sets in one pass.
# sort=True is passed explicitly: the two files do not have an identical set
# of columns, and relying on the implicit default emits a pandas FutureWarning
# and produces a different column order once that default changes.
# The row index is not reset, so index labels repeat across the two parts;
# the frame is split back by position (iloc) later on.
Combined_df = pd.concat([train_df, test_df], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [898]:
Combined_df.shape

(258210, 15)

In [899]:
train_df.shape

(175000, 14)

In [900]:
Combined_df.info() #check the column data types

<class 'pandas.core.frame.DataFrame'>
Int64Index: 258210 entries, 0 to 83209
Data columns (total 15 columns):
country                  258146 non-null object
description              258210 non-null object
designation              181120 non-null object
id                       258210 non-null int64
index                    83210 non-null float64
points                   258210 non-null float64
price                    175000 non-null float64
province                 258146 non-null object
region_1                 215793 non-null object
region_2                 110996 non-null object
taster_name              96479 non-null object
taster_twitter_handle    91559 non-null object
title                    120975 non-null object
variety                  258209 non-null object
winery                   258210 non-null object
dtypes: float64(3), int64(1), object(11)
memory usage: 31.5+ MB


In [901]:
Combined_df.head()

Unnamed: 0,country,description,designation,id,index,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,32027,,88.870874,20.0,Alentejano,,,,,,PORTUGUESE RED,J. Portugal Ramos
1,France,"A solid, chunky wine, with a structure that is...",,71079,,88.041695,28.0,Bordeaux,Lalande de Pomerol,,,,,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier
2,France,"This is powerful and concentrated, with the hi...",,32440,,94.085021,130.0,Bordeaux,Saint-Émilion,,,,,BORDEAUX-STYLE RED BLEND,Château Figeac
3,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,124405,,89.869797,34.0,California,Santa Barbara County,Central Coast,,,Jaffurs 2010 Thompson Vineyard Petite Sirah (S...,PETITE SIRAH,Jaffurs
4,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,33649,,89.017651,24.0,Washington,Horse Heaven Hills,Columbia Valley,Sean P. Sullivan,@wawinereport,Syncline 2016 McKinley Springs Vineyard Rosé (...,ROSé,Syncline


In [902]:
train_df.nunique()

country                      45
description              123811
designation               37931
points                   175000
price                       387
province                    468
region_1                   1278
region_2                     18
taster_name                  19
taster_twitter_handle        15
title                     77411
variety                     706
winery                    16968
id                       124675
dtype: int64

In [903]:
train_df.describe()

Unnamed: 0,points,price,id
count,175000.0,175000.0,175000.0
mean,88.083987,34.3044,70684.04724
std,3.157001,38.398146,41341.638798
min,79.636128,4.0,1.0
25%,85.971283,16.0,35020.0
50%,87.981631,25.0,70256.5
75%,90.085631,40.0,105550.25
max,100.220603,2500.0,150929.0


## EDA

In [904]:
#sns.pairplot(train_df) #see the correlation of our features.---This guides you whether there exist linear relationships or not

In [905]:
#sns.distplot(train_df['price'])

In [906]:
#sns.distplot(train_df['points'],kde=False)

# Data Cleaning

In [907]:
# Fill missing values in the non-numeric (text) columns only. Filling every
# column with ' ' would also write strings into the numeric 'price' and
# 'index' columns and silently turn them into object dtype.
text_cols = Combined_df.select_dtypes(exclude='number').columns
Combined_df = Combined_df.fillna(value=dict.fromkeys(text_cols, ' '))

# Lower-case the description and derive two simple size features from it.
Combined_df['description'] = Combined_df['description'].str.lower()
Combined_df['text length'] = Combined_df['description'].str.len()  # number of characters
Combined_df["raw_word_len"] = Combined_df["description"].str.split().str.len()  # whitespace-separated tokens

In [908]:
# Label-encode the categorical columns: every distinct value is replaced by
# its integer category code. 'country' and 'taster_twitter_handle' are left
# out on purpose because they are one-hot encoded instead.
label_encoded_cols = [
    "designation",
    "province",
    "region_1",
    "region_2",
    "taster_name",
    "title",
    "variety",
    "winery",
]
for col_name in label_encoded_cols:
    Combined_df[col_name] = Combined_df[col_name].astype('category').cat.codes

In [909]:
# One-hot encode the two low-cardinality categoricals. drop_first=True omits
# the first indicator of each variable, which is implied by the others.
one_hot_cols = ['country', 'taster_twitter_handle']
Combined_df = pd.get_dummies(
    Combined_df,
    columns=one_hot_cols,
    drop_first=True,
)

In [910]:
Combined_df.shape

(258210, 78)

## TF-IDF

In [911]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [912]:
# TF-IDF over the text, followed by a truncated SVD that compresses the
# sparse TF-IDF matrix into a handful of dense columns.
tf = TfidfVectorizer(
    sublinear_tf=True,          # 1 + log(tf) instead of raw term frequency
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',    # tokens of one or more word characters
    ngram_range=(1, 2),         # unigrams and bigrams
    max_features=30000,         # cap on the vocabulary size
    stop_words='english',
)

# TruncatedSVD uses a randomized solver by default, so the seed is fixed to
# make the latent features (and any model trained on them) reproducible.
svd = TruncatedSVD(n_components=5, random_state=42)

def transform_text(data, col):
    """Turn a text column into dense latent (TF-IDF + SVD) features.

    Fits the module-level ``tf`` vectorizer and ``svd`` reducer on
    ``data[col]`` and returns the reduced representation. Both objects are
    re-fitted on every call.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the text column to transform.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data``, carrying the same index, with columns
        named ``latent_<col>_<i>`` for each SVD component ``i``.
    """
    tfidf_matrix = tf.fit_transform(data[col])
    components = svd.fit_transform(tfidf_matrix)

    # Name the columns after the number of components actually produced
    # instead of a hard-coded 5, so changing n_components on ``svd`` cannot
    # cause a column-count mismatch here.
    column_names = ["latent_" + col + '_' + str(i) for i in range(components.shape[1])]
    return pd.DataFrame(components, index=data.index, columns=column_names)

# Latent description features for every row of the combined frame; because the
# combined frame is passed in, TF-IDF and SVD are fitted on train and test text together.
lsa_features = transform_text(Combined_df, 'description') # reduced (latent) features derived from the description column

In [913]:
lsa_features.head()

Unnamed: 0,latent_description_0,latent_description_1,latent_description_2,latent_description_3,latent_description_4
0,0.137522,-0.017933,0.129833,0.058877,0.00571
1,0.12911,-0.040122,0.068277,0.038326,-0.000454
2,0.136792,-0.116025,0.094334,0.056492,0.142298
3,0.122417,-0.081205,0.008858,-0.063829,0.021029
4,0.106948,0.061462,-0.012938,-0.018909,0.019314


In [914]:
# Attach the latent description columns to the engineered frame, side by
# side. lsa_features carries the same index as Combined_df, so rows line up.
frames_to_join = [Combined_df, lsa_features]
Combined_df_new = pd.concat(frames_to_join, axis=1)

In [915]:
Combined_df.head()

Unnamed: 0,description,designation,id,index,points,price,province,region_1,region_2,taster_name,...,taster_twitter_handle_@laurbuzz,taster_twitter_handle_@mattkettmann,taster_twitter_handle_@paulgwine,taster_twitter_handle_@suskostrzewa,taster_twitter_handle_@vboone,taster_twitter_handle_@vossroger,taster_twitter_handle_@wawinereport,taster_twitter_handle_@wineschach,taster_twitter_handle_@winewchristina,taster_twitter_handle_@worldwineguys
0,this is a fine rich balanced wine. it has ripe...,42042,32027,,88.870874,20,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"a solid, chunky wine, with a structure that is...",0,71079,,88.041695,28,39,583,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"this is powerful and concentrated, with the hi...",0,32440,,94.085021,130,39,984,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"rich, ripe and oaky, this petite sirah charms ...",39633,124405,,89.869797,34,54,1015,2,0,...,0,0,0,0,0,0,0,0,0,0
4,this wine is a unique in the state blend and f...,25326,33649,,89.017651,24,469,537,4,17,...,0,0,0,0,0,0,1,0,0,0


In [916]:
# Undo the earlier stacking by position: the first len(train_df) rows are the
# training rows, everything after them belongs to the test set.
n_train_rows = len(train_df)
train_df_new = Combined_df_new.iloc[:n_train_rows]
test_df_new = Combined_df_new.iloc[n_train_rows:]

In [917]:
# Columns that are not used as predictors: the raw text (represented by the
# length and latent features instead), the target itself, and the row
# identifiers. Defined once so the train and test matrices cannot drift apart.
non_feature_cols = ['description', 'price', 'id', 'index']

# Target kept as a single-column frame, cast to float explicitly: 'price' can
# be object dtype at this point if missing values were filled with strings.
y = train_df_new[['price']].astype(float)
X = train_df_new.drop(non_feature_cols, axis=1)
x_test = test_df_new.drop(non_feature_cols, axis=1)  # features of the unlabelled test rows

In [918]:
X.head()

Unnamed: 0,designation,points,province,region_1,region_2,taster_name,title,variety,winery,text length,...,taster_twitter_handle_@vossroger,taster_twitter_handle_@wawinereport,taster_twitter_handle_@wineschach,taster_twitter_handle_@winewchristina,taster_twitter_handle_@worldwineguys,latent_description_0,latent_description_1,latent_description_2,latent_description_3,latent_description_4
0,42042,88.870874,9,0,0,0,0,480,9470,267,...,0,0,0,0,0,0.137522,-0.017933,0.129833,0.058877,0.00571
1,0,88.041695,39,583,0,0,0,63,4345,225,...,0,0,0,0,0,0.12911,-0.040122,0.068277,0.038326,-0.000454
2,0,94.085021,39,984,0,0,0,63,3564,199,...,0,0,0,0,0,0.136792,-0.116025,0.094334,0.056492,0.142298
3,39633,89.869797,54,1015,2,0,53574,444,9532,187,...,0,0,0,0,0,0.122417,-0.081205,0.008858,-0.063829,0.021029
4,25326,89.017651,469,537,4,17,95521,523,15765,290,...,0,1,0,0,0,0.106948,0.061462,-0.012938,-0.018909,0.019314


# Feature Scaling


In [919]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits. The results are re-wrapped as frames
# with default integer column labels.
sc = StandardScaler().fit(X_train)
X_train = pd.DataFrame(sc.transform(X_train))
X_test = pd.DataFrame(sc.transform(X_test))


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


## Model Training 

In [920]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

# Random forest fitted on the scaled training split.
regressor = RandomForestRegressor(n_estimators=60, random_state=101)
# y_train may be a single-column frame; flatten it to the 1-D shape that
# fit() expects so scikit-learn does not emit a DataConversionWarning.
regressor.fit(X_train, np.ravel(y_train))
y_pred = regressor.predict(X_test)

# Errors on the held-out split. The MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

  after removing the cwd from sys.path.


Mean Absolute Error: 8.854643333333335
Mean Squared Error: 468.28930934126987
Root Mean Squared Error: 21.639993284224232


In [921]:
# Rebuild the test-set feature matrix and scale it with the already-fitted
# scaler (transform only; the scaler is not re-fitted on test data).
test_features = test_df_new.drop(['description', 'price', 'id', 'index'], axis=1)
x_test = pd.DataFrame(sc.transform(test_features))

  after removing the cwd from sys.path.


In [922]:
# Score the test features and assemble the two-column output frame.
# NOTE(review): ids are generated as row positions 0..n-1 rather than read
# from the test file's own 'id'/'index' columns; confirm this is the expected format.
n_test_rows = len(test_df)
predictions = regressor.predict(x_test)
predictions_df = pd.DataFrame({'id': range(n_test_rows), 'price': predictions})

In [923]:
np.mean(predictions_df['price'])

34.77902876256859

In [783]:
# Write the predictions frame to disk without the DataFrame index column.
predictions_df.to_csv('predictions.csv',index=False)

# Training the model with K-Fold cross-validation

In [894]:

%%time
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

N_SPLITS = 5
KF = KFold(n_splits = N_SPLITS, shuffle = True, random_state=1)

# The fold models are trained on the unscaled feature matrix X, so the test
# rows must be scored in that same unscaled feature space. x_test may already
# hold StandardScaler output from an earlier cell, so the raw test features
# are rebuilt here instead of reusing it.
x_test_raw = test_df_new.drop(['description', 'price', 'id', 'index'], axis=1)

# Flatten the target to a 1-D float array: y may be a single-column frame,
# which would otherwise trigger a DataConversionWarning on every fit.
y_values = np.asarray(y, dtype=float).ravel()

# Accumulates the average of the per-fold test-set predictions.
preds = np.zeros(len(x_test_raw))

for i, (train_ind, test_ind) in enumerate(KF.split(X)):
    print('========Fold', i)
    Xtrain, XCV = X.iloc[train_ind], X.iloc[test_ind]
    ytrain, yCV = y_values[train_ind], y_values[test_ind]

    regressor = RandomForestRegressor(n_estimators=60, random_state=100)
    regressor.fit(Xtrain, ytrain)
    # Each fold contributes an equal share; tied to N_SPLITS rather than a literal 5.
    preds += regressor.predict(x_test_raw) / N_SPLITS

    # Error on the rows held out of this fold (y_true first, then y_pred).
    pred = regressor.predict(XCV)
    print('RMSE :', np.sqrt(metrics.mean_squared_error(yCV, pred)))



  if sys.path[0] == '':


RMSE : 24.98429001376792


  if sys.path[0] == '':


RMSE : 24.206710516707602


  if sys.path[0] == '':


RMSE : 22.24897534540939


  if sys.path[0] == '':


RMSE : 23.490753745162646


  if sys.path[0] == '':


RMSE : 21.56968493194771
CPU times: user 10min 20s, sys: 156 ms, total: 10min 20s
Wall time: 10min 20s


In [896]:
preds

array([102.38666667,  49.97666667,  40.07666667, ...,  31.72666667,
        23.82333333,  59.24333333])

In [655]:
# Output frame built from the fold-averaged predictions; ids are the row
# positions 0..n-1 of the test set.
kfold_columns = {'id': range(len(test_df)), 'price': preds}
predictions_df = pd.DataFrame(kfold_columns)

In [656]:
np.mean(predictions_df['price'])

34.724481953290876

In [547]:
# Write the current predictions frame to 'predictions.csv' (no index column);
# an existing file at that path is replaced.
predictions_df.to_csv('predictions.csv',index=False)

  after removing the cwd from sys.path.
