# Import dataset

First, I import all the necessary packages and load my dataset into Python.

In [None]:
# importing libraries
import pandas as pd                                  # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn as sns                                # enhanced data visualization
import numpy as np                                   # mathematical essentials
import statsmodels.formula.api as smf                # regression modeling
from sklearn.model_selection import train_test_split # train/test split
from sklearn.linear_model import LinearRegression

# location of the Excel workbook, relative to the notebook
file = './LasVegasTripAdvisorReviews-1.xlsx'

# loading the workbook into a DataFrame
LasVegas = pd.read_excel(io = file)

# previewing the first five rows
LasVegas.head(5)

# Looking for NaN

Even though the dataset source indicates that there are no NaN values, we
quickly check anyway to be sure. As expected, no NaN values were detected in the dataset.

In [None]:
# counting the missing values in every column
LasVegas.isna().sum()

# Checking data type

Now displaying information about columns to check data type



In [None]:
# checking dataset information: per-column summary used to inspect the data types
LasVegas.info()

We have 14 categorical variables, which means we need to encode them as numbers
before we can see their correlation with the Y variable. I did some research on how to
encode a list of categorical variables. For reference, this is the link to
the code I used for this part: https://github.com/akshi8/Trip_Advisor_Reviews_LasVegas/blob/master/code/EDA.ipynb

In [None]:
# names of every column stored with the generic object dtype
categorical = LasVegas.select_dtypes(include=['object']).columns.tolist()
categorical

In [None]:
#importing packages
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# substituting month ranges with seasons
# original values in the column: 'Dec-Feb', 'Mar-May', 'Jun-Aug', 'Sep-Nov'
season_by_period = {'Dec-Feb': 'winter',
                    'Mar-May': 'spring',
                    'Jun-Aug': 'summer',
                    'Sep-Nov': 'autumn'}

# .replace (unlike .map) leaves values that are not keys of the mapping
# untouched, so re-running this cell does not turn the column into NaN
LasVegas['Period of stay'] = LasVegas['Period of stay'].replace(season_by_period)

# displaying the distinct values of every categorical column before encoding
for column in categorical:
    print(LasVegas[column].unique())

In [None]:
# replacing every categorical column with integer codes;
# the shared encoder is refit on each column in turn
for column in categorical:
    LasVegas[column] = le.fit_transform(LasVegas[column])

LasVegas.head()

# Renaming columns

I rename the columns so that each name is a single word (underscores instead of spaces and dots); otherwise the regression formulas below will not run.

In [None]:
# old column label -> formula-friendly label (no spaces or dots)
renamed_columns = {
    "User country"      : "User_country",
    "Nr. reviews"       : "Nr_reviews",
    "Nr. hotel reviews" : "Nr_hotel_reviews",
    "Period of stay"    : "Period_of_stay",
    "Traveler type"     : "Traveler_type",
    "Tennis court"      : "Tennis_court",
    "Free internet"     : "Free_internet",
    "Hotel name"        : "Hotel_name",
    "Hotel stars"       : "Hotel_stars",
    "Nr. rooms"         : "Nr_rooms",
    "User continent"    : "User_continent",
    "Member years"      : "Member_years",
    "Review month"      : "Review_month",
    "Review weekday"    : "Review_weekday",
    "Helpful votes"     : "Helpful_votes",
}

# applying the new labels to the existing DataFrame object
LasVegas.rename(columns = renamed_columns, inplace = True)

# Linear Correlation

I'll use the Pearson correlation method to see how the x variables are related
to my y variable, "Helpful_votes".

In [None]:
# Linear Correlation for LasVegas dataset
# Rounding once, to two decimals: rounding the matrix to one decimal first
# collapses small correlations to 0.0 and makes a later two-decimal rounding
# a no-op.
LasVegas_corr = LasVegas.corr(method = 'pearson').round(decimals = 2)

# correlation of every column with the response, weakest/most negative first
LasVegas_corr.loc[ : , 'Helpful_votes'].sort_values(ascending = True)

In [None]:
# printing every column name followed by " +" so the list can be pasted
# into a regression formula
for column_name in LasVegas.columns:
    print(column_name, "+")

# Running a regression model

I'll now run a regression model with all the x variables 

In [None]:
# first attempt: regress the response on every available feature
# importing an additional package
import statsmodels.formula.api as smf # predictive modeling with nice outputs

# formula listing Helpful_votes as the response and all other columns as predictors
full_model_formula = """Helpful_votes ~
User_country +
Nr_reviews +
Nr_hotel_reviews +
Score +
Period_of_stay +
Traveler_type +
Pool +
Gym +
Tennis_court +
Spa +
Casino +
Free_internet +
Hotel_name +
Hotel_stars +
Nr_rooms +
User_continent +
Member_years +
Review_month +
Review_weekday
"""

# Step 1: INSTANTIATE a model object (ordinary least squares)
linear_regression = smf.ols(formula = full_model_formula, data = LasVegas)

# Step 2: FIT the data into the model object
results = linear_regression.fit()

# Step 3: analyze the SUMMARY output
print(results.summary())

Output: running a regression with all the x variables lets me see the p-value
of every variable. I'll remove the variables whose p-value is bigger than 0.05, as
they are not significant, but I'll keep Hotel_stars: it is a categorical
variable and it has a relatively low p-value of about 0.092. I'll use Nr_reviews,
Nr_hotel_reviews and Hotel_stars for my regression model.

In [None]:
# displaying the current column labels of the dataset
LasVegas.columns

In [None]:
# formula restricted to the three predictors retained after the full model
reduced_model_formula = """Helpful_votes ~
Nr_reviews +
Nr_hotel_reviews +
Hotel_stars

"""

# Step 1: INSTANTIATE the reduced model object
linear_regression_reduced = smf.ols(formula = reduced_model_formula,
                                    data    = LasVegas)

# Step 2: FIT the data into the model object
results_reduced = linear_regression_reduced.fit()

# Step 3: analyze the SUMMARY output
print(results_reduced.summary())

I'll keep these three variables, as Nr_reviews and Nr_hotel_reviews give me a
p-value of 0.000 and Hotel_stars has a p-value of about 0.051.

In [None]:
# small offset added before taking the logarithm so that zeros do not yield -inf
# NOTE(review): with this offset a zero becomes log(1e-6), a large negative value — confirm this is intended
eps = 0.000001

# adding a log_<name> column to the dataset for the response and each retained predictor
for column in ['Helpful_votes', 'Nr_reviews', 'Nr_hotel_reviews', 'Hotel_stars']:
    LasVegas['log_' + column] = np.log(LasVegas[column] + eps)

# Preparing training and testing sets for model development

In [None]:
# preparing explanatory variable data (the three retained predictors)
selected_predictors = ['Nr_reviews', 'Nr_hotel_reviews', 'Hotel_stars']
LasVegas_data = LasVegas[selected_predictors]

# preparing response variable data (raw and log-transformed)
LasVegas_target = LasVegas['Helpful_votes']
log_LasVegas_target = LasVegas['log_Helpful_votes']

# holding out 25% of the rows for testing; fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    LasVegas_data,
    LasVegas_target,
    test_size    = 0.25,
    random_state = 219,
)

# checking the shapes of the datasets
print(f"""
Training Data
-------------
X-side: {x_train.shape}
y-side: {y_train.shape[0]}


Testing Data
------------
X-side: {x_test.shape}
y-side: {y_test.shape[0]}
""")

In [None]:
# names of every explanatory variable in the dataset
x_variables = [
    'User_country', 'Nr_reviews', 'Nr_hotel_reviews', 'Score',
    'Period_of_stay', 'Traveler_type', 'Pool', 'Gym', 'Tennis_court',
    'Spa', 'Casino', 'Free_internet', 'Hotel_name', 'Hotel_stars',
    'Nr_rooms', 'User_continent', 'Member_years', 'Review_month',
    'Review_weekday',
]

# printing each name followed by " +" for pasting into a formula
print("\n".join(f"{val} +" for val in x_variables))

I concatenate x_train and y_train so they can be used with statsmodels. I'll test my
best model to see how it performs.

In [None]:
# statsmodels formulas need predictors and response in a single DataFrame,
# so the training features and training target are joined column-wise
LasVegas_train = pd.concat([x_train, y_train], axis = "columns")

# Step 1: build a model on the training rows only
lm_best = smf.ols(
    formula = """Helpful_votes ~ 
Nr_reviews +
Nr_hotel_reviews +
Hotel_stars
                                         
                                        """,
    data = LasVegas_train,
)

# Step 2: fit the model based on the data
results = lm_best.fit()

# Step 3: analyze the summary output
print(results.summary())

# OLS Regression

In [None]:
# applying model in scikit-learn

# preparing x-variables
# NOTE(review): x_variables lists every original feature, not only the
# p-value-selected ones — confirm this is the intended "OLS" dataset
ols_data = LasVegas.loc[:, x_variables]

# FULL dataset: every column except the response AND any column derived from
# it. Keeping log_Helpful_votes among the features would leak the target into
# the model and inflate its R-square.
response_columns = ["Helpful_votes", "log_Helpful_votes"]
LasVegas_data = LasVegas.drop(columns = response_columns, errors = "ignore")
LasVegas_target = LasVegas.loc[:, "Helpful_votes"]


###############################################
# setting up more than one train-test splits 
###############################################
# FULL X-dataset 
x_train_FULL, x_test_FULL, y_train_FULL, y_test_FULL = train_test_split(
            LasVegas_data,     # x-variables
            LasVegas_target,   # y-variable
            test_size = 0.25,
            random_state = 219)


# OLS p-value x-dataset (same seed, so both splits hold out the same rows)
x_train_OLS, x_test_OLS, y_train_OLS, y_test_OLS = train_test_split(
            ols_data,         # x-variables
            LasVegas_target,   # y-variable
            test_size = 0.25,
            random_state = 219)

In [None]:
# building and fitting a linear regression on the OLS training split
lr = LinearRegression()
lr_fit = lr.fit(x_train_OLS, y_train_OLS)

# predicting on the held-out rows
lr_pred = lr_fit.predict(x_test_OLS)

# R-square on both splits, and the absolute gap between them
lr_train_score = lr.score(x_train_OLS, y_train_OLS).round(4)
lr_test_score = lr.score(x_test_OLS, y_test_OLS).round(4)
lr_test_gap = abs(lr_train_score - lr_test_score).round(4)

# reporting the saved scores
print('OLS Training Score :', lr_train_score)
print('OLS Testing Score  :', lr_test_score)
print('OLS Train-Test Gap :', lr_test_gap)

I've tried to run it using the OLS but I'll also try with the FULL and see if I 
can reduce the gap and obtain a better R squared

In [None]:
# building and fitting a linear regression on the FULL training split
lr1 = LinearRegression()
lr1_fit = lr1.fit(x_train_FULL, y_train_FULL)

# predicting on the held-out rows
lr1_pred = lr1_fit.predict(x_test_FULL)

# R-square on both splits, and the absolute gap between them
lr1_train_score = lr1.score(x_train_FULL, y_train_FULL).round(4)
lr1_test_score = lr1.score(x_test_FULL, y_test_FULL).round(4)
lr1_test_gap = abs(lr1_train_score - lr1_test_score).round(4)

# reporting the saved scores
print('OLS Training Score :', lr1_train_score)
print('OLS Testing Score  :', lr1_test_score)
print('OLS Train-Test Gap :', lr1_test_gap)

I've obtained a better R squared, so I'll keep the FULL one as my OLS model.

In [None]:
# zipping each feature name to its coefficient
# The names must come from the frame the model was fit on (x_train_FULL);
# taking them from LasVegas[x_variables] pairs coefficients with the wrong
# labels whenever the two column sets differ, and zip silently truncates.
lr1_feature_names = x_train_FULL.columns
lr1_coefficients = lr1_fit.coef_.round(decimals = 2)
assert len(lr1_feature_names) == len(lr1_coefficients), \
    "feature names and coefficients are misaligned"

lr1_model_values = list(zip(lr1_feature_names, lr1_coefficients))


# model summary list: intercept first, then one (feature, coefficient) pair each
lr1_model_lst = [('intercept', lr1_fit.intercept_.round(decimals = 2))]
lr1_model_lst.extend(lr1_model_values)


# checking the results
for pair in lr1_model_lst:
    print(pair)

# Lasso Regression

In [None]:
import sklearn
# INSTANTIATING a model object
# The `normalize` keyword was removed from scikit-learn's linear models in
# version 1.2 and raises a TypeError there; omitting it keeps the old default
# (no normalization) on every version.
lasso_model = sklearn.linear_model.Lasso(alpha = 1.0)  # default shrinkage


# FITTING to the training data
lasso_fit = lasso_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
lasso_pred = lasso_fit.predict(x_test_FULL)


# saving scoring data for future use (R-square on each split)
lasso_train_score = lasso_model.score(x_train_FULL, y_train_FULL).round(4)
lasso_test_score  = lasso_model.score(x_test_FULL, y_test_FULL).round(4)

# saving the gap between training and testing
lasso_test_gap = abs(lasso_train_score - lasso_test_score).round(4)


# SCORING the results
print('Lasso Training Score :', lasso_train_score)
print('Lasso Testing Score  :', lasso_test_score)
print('Lasso Train-Test Gap :', lasso_test_gap)

In [None]:
# pairing every feature name with its rounded Lasso coefficient
lasso_model_values = zip(LasVegas_data.columns, lasso_fit.coef_.round(decimals = 2))

# model summary list: intercept first, then the (feature, coefficient) pairs
lasso_model_lst = [('intercept', lasso_fit.intercept_.round(decimals = 2))]
lasso_model_lst.extend(lasso_model_values)

# checking the results, one pair per line
print(*lasso_model_lst, sep = "\n")

In [None]:
# dropping coefficients that are equal to zero
# Rebuilding the list instead of calling .remove() inside a loop over it:
# removing while iterating skips the element after each removal, so runs of
# consecutive zero coefficients were only partially dropped. Slice assignment
# keeps the same list object.
lasso_model_lst[:] = [(feature, coefficient)
                      for feature, coefficient in lasso_model_lst
                      if coefficient != 0]


# checking the results
for pair in lasso_model_lst:
    print(pair)

# ARD Regression

In [None]:
# INSTANTIATING a model object
# The `normalize` keyword was removed from scikit-learn's linear models in
# version 1.2 and raises a TypeError there; omitting it keeps the old default
# (no normalization) on every version.
ard_model = sklearn.linear_model.ARDRegression()


# FITTING the training data
ard_fit = ard_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
ard_pred = ard_fit.predict(x_test_FULL)


# saving scoring data for future use (R-square on each split)
ard_train_score = ard_model.score(x_train_FULL, y_train_FULL).round(4)
ard_test_score  = ard_model.score(x_test_FULL, y_test_FULL).round(4)

# saving the gap between training and testing
ard_test_gap = abs(ard_train_score - ard_test_score).round(4)


# displaying the results
print('Training Score:', ard_train_score)
print('Testing Score :', ard_test_score)
print('ARD Train-Test Gap :', ard_test_gap)

In [None]:
# pairing every feature name with its rounded ARD coefficient
ard_model_values = zip(LasVegas_data.columns, ard_fit.coef_.round(decimals = 5))

# model summary list: intercept first, then the (feature, coefficient) pairs
ard_model_lst = [('intercept', ard_fit.intercept_.round(decimals = 2))]
ard_model_lst.extend(ard_model_values)

# checking the results, one pair per line
print(*ard_model_lst, sep = "\n")

In [None]:
# dropping coefficients that are equal to zero
# Rebuilding the list instead of calling .remove() inside a loop over it:
# removing while iterating skips the element after each removal, so runs of
# consecutive zero coefficients were only partially dropped. Slice assignment
# keeps the same list object.
ard_model_lst[:] = [(feature, coefficient)
                    for feature, coefficient in ard_model_lst
                    if coefficient != 0]


# checking the results
for pair in ard_model_lst:
    print(pair)

In [None]:
# comparing results
# The OLS*** row reports the FULL model (lr1) throughout; the test score
# previously came from the other OLS model (lr_test_score), which made the row
# inconsistent with its own train score, gap, and the table built below.

print(f"""
Model             Train Score         Test Score          Train Test Gap                                         
-----             -----------         ----------          ----------               
OLS***              {lr1_train_score.round(3)}               {lr1_test_score.round(3)}             {lr1_test_gap.round(3)}    
Lasso               {lasso_train_score.round(3)}               {lasso_test_score.round(3)}             {lasso_test_gap.round(3)}
ARD                 {ard_train_score.round(3)}               {ard_test_score.round(3)}             {ard_test_gap.round(3)}
                        
""")


# collecting the same figures, plus model size (number of entries in each
# model's summary list), into a DataFrame
model_performance = pd.DataFrame({

    'Model Type'     : ['OLS***', 'Lasso', 'ARD'],

    'Training'       : [lr1_train_score.round(3),
                        lasso_train_score.round(3),
                        ard_train_score.round(3)],

    'Testing'        : [lr1_test_score.round(3),
                        lasso_test_score.round(3),
                        ard_test_score.round(3)],

    'Train-Test Gap' : [lr1_test_gap.round(3),
                        lasso_test_gap.round(3),
                        ard_test_gap.round(3)],

    'Model Size'     : [len(lr1_model_lst),
                        len(lasso_model_lst),
                        len(ard_model_lst)],
})

model_performance


My best model is OLS, with a higher R squared and a lower train-test gap.