In [79]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [138]:
df_3star = pd.read_csv("dp_textblob_3_star.csv")
df_4star = pd.read_csv("dp_textblob_4_star.csv")
df_5star = pd.read_csv("dp_textblob_5_star.csv")
print(df_3star.shape)
print(df_4star.shape)
print(df_5star.shape)

(12331, 11)
(16349, 12)
(30972, 12)


# 3 Star

In [139]:
df_3star["polarity"] = np.where(df_3star["rating"] >= 4, "Positive", "Negative")
df_3star["polarity"] = np.where(df_3star["rating"] == 3.0, np.NaN, df_3star["polarity"])
df_3star = df_3star.dropna(subset = "polarity")
df_3star.head()

Unnamed: 0,travel_type,rating,label,covid,is_local,aspect_sentiment,topic_2,topic_0,topic_1,topic_3,overall_textblob_polarity,polarity
0,couple,4.0,Positive,PostCovid,0,"[[hotel, clean, 2], [hotel, comfortable, 2], [...",0.311111,0.143333,0.205556,,0.149749,Positive
1,family,5.0,Positive,PostCovid,0,"[[room, great, 0], [room, small, 0], [hotel, g...",0.7,0.275,,,0.455,Positive
2,friends,5.0,Positive,PostCovid,0,"[[food, close, 2], [check, easy, 3]]",0.0,,,0.433333,0.396296,Positive
3,solo,5.0,Positive,PostCovid,0,"[[staff, great, 3]]",,,,0.8,0.312083,Positive
4,business,4.0,Positive,PostCovid,0,"[[room, 'also night', 0], [time, long, 3]]",,0.0,,-0.05,0.372917,Positive


In [140]:
encoder = OneHotEncoder(sparse=False)
cols = ['travel_type', 'covid']
df_encoded = pd.DataFrame(encoder.fit_transform(df_3star[cols]))
df_encoded.columns = encoder.get_feature_names(cols)

df_3star_final = df_3star.drop(cols ,axis=1)
df_3star_final = pd.concat([df_encoded, df_3star_final], axis=1)

df_3star_final['label'] = df_3star_final['label'].apply(lambda x: 1 if x == 'Positive' else 0)
df_3star_final.head()



Unnamed: 0,travel_type_business,travel_type_couple,travel_type_family,travel_type_friends,travel_type_solo,travel_type_nan,covid_Covid,covid_PostCovid,covid_PreCovid,rating,label,is_local,aspect_sentiment,topic_2,topic_0,topic_1,topic_3,overall_textblob_polarity,polarity
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,1,0.0,"[[hotel, clean, 2], [hotel, comfortable, 2], [...",0.311111,0.143333,0.205556,,0.149749,Positive
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,1,0.0,"[[room, great, 0], [room, small, 0], [hotel, g...",0.7,0.275,,,0.455,Positive
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,1,0.0,"[[food, close, 2], [check, easy, 3]]",0.0,,,0.433333,0.396296,Positive
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,5.0,1,0.0,"[[staff, great, 3]]",,,,0.8,0.312083,Positive
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,1,0.0,"[[room, 'also night', 0], [time, long, 3]]",,0.0,,-0.05,0.372917,Positive


## LogReg Baseline

In [141]:
features = ["travel_type_business", "travel_type_couple", "travel_type_family", "travel_type_friends", 
            "travel_type_solo", "is_local", "covid_PreCovid", "overall_textblob_polarity"]
target = "label"

In [142]:
X = df_3star_final[features]
y = df_3star_final[target]

In [143]:
X = X.fillna(0)

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


LogisticRegression()

In [145]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.95


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [146]:
coefficients = logreg.coef_[0]
coefficients

array([-0.27414562, -0.16959379, -0.54194528,  0.09918599, -0.52258609,
        0.64402139, -0.22782123, 17.90344235])

In [147]:
print("3 Star Baseline")
for i in range(len(features)):
    print(f'Coeff of {features[i]}: %.3f' %coefficients[i])

3 Star Baseline
Coeff of travel_type_business: -0.274
Coeff of travel_type_couple: -0.170
Coeff of travel_type_family: -0.542
Coeff of travel_type_friends: 0.099
Coeff of travel_type_solo: -0.523
Coeff of is_local: 0.644
Coeff of covid_PreCovid: -0.228
Coeff of overall_textblob_polarity: 17.903


### LogReg with Aspects

In [148]:
add_features = ["topic_0", "topic_1", "topic_2", "topic_3"]
features += add_features
features.remove("overall_textblob_polarity")

In [149]:
X = df_3star_final[features]
y = df_3star_final[target]

In [150]:
X = X.fillna(0)

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


LogisticRegression()

In [152]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.81


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [153]:
coefficients = logreg.coef_[0]
coefficients

array([-0.2391013 , -0.21263723, -0.42945266, -0.11149445, -0.63975048,
        1.08619464, -0.33920653,  3.57453046,  3.43249177,  4.50062515,
        4.62652208])

In [154]:
print("3 Star Baseline + Aspects")
for i in range(len(features)):
    print(f'Coeff of {features[i]}: %.3f' %coefficients[i])

3 Star Baseline + Aspects
Coeff of travel_type_business: -0.239
Coeff of travel_type_couple: -0.213
Coeff of travel_type_family: -0.429
Coeff of travel_type_friends: -0.111
Coeff of travel_type_solo: -0.640
Coeff of is_local: 1.086
Coeff of covid_PreCovid: -0.339
Coeff of topic_0: 3.575
Coeff of topic_1: 3.432
Coeff of topic_2: 4.501
Coeff of topic_3: 4.627


# 4 Star

In [155]:
df_4star["polarity"] = np.where(df_4star["rating"] >= 4, "Positive", "Negative")
df_4star["polarity"] = np.where(df_4star["rating"] == 3.0, np.NaN, df_4star["polarity"])
df_4star = df_4star.dropna(subset = "polarity")
df_4star.head()

Unnamed: 0,travel_type,rating,label,covid,is_local,aspect_sentiment,topic_0,topic_1,topic_3,topic_4,topic_2,overall_textblob_polarity,polarity
0,,,,PostCovid,1,"[[hotel, affordable, 0], [pool, affordable, 1]]",0.0,0.0,,,,0.6,Negative
1,solo,,,PostCovid,0,"[[room, comfortable, 3]]",,,0.4,,,0.411515,Negative
3,,,,PostCovid,1,"[[room, excellent, 3], [room, polite, 3]]",,,0.5,,,0.507407,Negative
4,,,,PostCovid,0,"[[room, double, 3], [room, twin, 3], [hotel, d...",0.0,,0.0,,,0.12,Negative
5,,,,PostCovid,0,"[[airport, nice, 0], [airport, close, 0], [roo...",0.3,,0.366667,,,0.254762,Negative


In [156]:
encoder = OneHotEncoder(sparse=False)
cols = ['travel_type', 'covid']
df_encoded = pd.DataFrame(encoder.fit_transform(df_4star[cols]))
df_encoded.columns = encoder.get_feature_names(cols)

df_4star_final = df_4star.drop(cols ,axis=1)
df_4star_final = pd.concat([df_encoded, df_4star_final], axis=1)

df_4star_final['label'] = df_4star_final['label'].apply(lambda x: 1 if x == 'Positive' else 0)
df_4star_final.head()



Unnamed: 0,travel_type_business,travel_type_couple,travel_type_family,travel_type_friends,travel_type_solo,travel_type_nan,covid_Covid,covid_PostCovid,covid_PreCovid,rating,label,is_local,aspect_sentiment,topic_0,topic_1,topic_3,topic_4,topic_2,overall_textblob_polarity,polarity
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,,0,1.0,"[[hotel, affordable, 0], [pool, affordable, 1]]",0.0,0.0,,,,0.6,Negative
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,,0,0.0,"[[room, comfortable, 3]]",,,0.4,,,0.411515,Negative
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,,0,,,,,,,,,
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,,0,1.0,"[[room, excellent, 3], [room, polite, 3]]",,,0.5,,,0.507407,Negative
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,,0,0.0,"[[room, double, 3], [room, twin, 3], [hotel, d...",0.0,,0.0,,,0.12,Negative


# LogReg Baseline

In [157]:
features = ["travel_type_business", "travel_type_couple", "travel_type_family", "travel_type_friends", 
            "travel_type_solo", "is_local", "covid_PreCovid", "overall_textblob_polarity"]
target = "label"

In [158]:
X = df_4star_final[features]
y = df_4star_final[target]

In [159]:
X = X.fillna(0)

In [160]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


LogisticRegression()

In [161]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.80


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [162]:
coefficients = logreg.coef_[0]
coefficients

array([-3.23373232, -2.97053882, -2.82619039, -3.34726232, -3.17290161,
       -0.25764641,  0.78219315,  3.61975613])

In [163]:
print("4 Star Baseline")
for i in range(len(features)):
    print(f'Coeff of {features[i]}: %.3f' %coefficients[i])

4 Star Baseline
Coeff of travel_type_business: -3.234
Coeff of travel_type_couple: -2.971
Coeff of travel_type_family: -2.826
Coeff of travel_type_friends: -3.347
Coeff of travel_type_solo: -3.173
Coeff of is_local: -0.258
Coeff of covid_PreCovid: 0.782
Coeff of overall_textblob_polarity: 3.620


### LogReg w Aspects

In [164]:
add_features = ["topic_0", "topic_1", "topic_2", "topic_3", "topic_4"]
features += add_features
features.remove("overall_textblob_polarity")

In [166]:
X = df_4star_final[features]
y = df_4star_final[target]

In [167]:
X = X.fillna(0)

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.79


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [169]:
coefficients = logreg.coef_[0]
print("4 Star Baseline + Aspects")
for i in range(len(features)):
    print(f'Coeff of {features[i]}: %.3f' %coefficients[i])

4 Star Baseline + Aspects
Coeff of travel_type_business: -3.090
Coeff of travel_type_couple: -2.850
Coeff of travel_type_family: -2.714
Coeff of travel_type_friends: -3.204
Coeff of travel_type_solo: -3.046
Coeff of is_local: -0.243
Coeff of covid_PreCovid: 0.770
Coeff of topic_0: 1.247
Coeff of topic_1: 0.835
Coeff of topic_2: 0.663
Coeff of topic_3: 0.963
Coeff of topic_4: 0.614


# 5 Star

In [170]:
df_5star["polarity"] = np.where(df_5star["rating"] >= 4, "Positive", "Negative")
df_5star["polarity"] = np.where(df_5star["rating"] == 3.0, np.NaN, df_5star["polarity"])
df_5star = df_5star.dropna(subset = "polarity")
df_5star.head()

Unnamed: 0,travel_type,rating,label,covid,is_local,aspect_sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,overall_textblob_polarity,polarity
0,couple,5.0,Positive,PostCovid,0,"[[hotel, wonderful, 0]]",1.0,,,,,0.541667,Positive
1,couple,,,PostCovid,0,"[[lunch, delicious, 1]]",,1.0,,,,0.546753,Negative
2,business,5.0,Positive,PostCovid,0,"[[hotel, attentive, 0], [hotel, 'kind visiting...",0.5,,,,,0.51,Positive
3,business,5.0,Positive,PostCovid,0,"[[place, exclusive, 0], [place, comfortable, 0...",0.266667,,,,,0.352381,Positive
4,business,5.0,Positive,PostCovid,0,"[[hotel, warm, 0], [hotel, welcome, 0], [time,...",0.7,,0.0,,,0.234722,Positive


In [171]:
encoder = OneHotEncoder(sparse=False)
cols = ['travel_type', 'covid']
df_encoded = pd.DataFrame(encoder.fit_transform(df_5star[cols]))
df_encoded.columns = encoder.get_feature_names(cols)

df_5star_final = df_5star.drop(cols ,axis=1)
df_5star_final = pd.concat([df_encoded, df_5star_final], axis=1)

df_5star_final['label'] = df_5star_final['label'].apply(lambda x: 1 if x == 'Positive' else 0)
df_5star_final.head()



Unnamed: 0,travel_type_business,travel_type_couple,travel_type_family,travel_type_friends,travel_type_solo,travel_type_nan,covid_Covid,covid_PostCovid,covid_PreCovid,rating,label,is_local,aspect_sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,overall_textblob_polarity,polarity
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,1,0.0,"[[hotel, wonderful, 0]]",1.0,,,,,0.541667,Positive
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,0,0.0,"[[lunch, delicious, 1]]",,1.0,,,,0.546753,Negative
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,1,0.0,"[[hotel, attentive, 0], [hotel, 'kind visiting...",0.5,,,,,0.51,Positive
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,1,0.0,"[[place, exclusive, 0], [place, comfortable, 0...",0.266667,,,,,0.352381,Positive
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,1,0.0,"[[hotel, warm, 0], [hotel, welcome, 0], [time,...",0.7,,0.0,,,0.234722,Positive


## LogReg Baseline

In [172]:
features = ["travel_type_business", "travel_type_couple", "travel_type_family", "travel_type_friends", 
            "travel_type_solo", "is_local", "covid_PreCovid", "overall_textblob_polarity"]
target = "label"

In [173]:
X = df_5star_final[features]
y = df_5star_final[target]

In [174]:
X = X.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.82


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [175]:
coefficients = logreg.coef_[0]
print("5 Star Baseline")
for i in range(len(features)):
    print(f'Coeff of {features[i]}: %.3f' %coefficients[i])

5 Star Baseline
Coeff of travel_type_business: -1.149
Coeff of travel_type_couple: -0.785
Coeff of travel_type_family: -0.835
Coeff of travel_type_friends: -0.646
Coeff of travel_type_solo: -0.891
Coeff of is_local: 0.065
Coeff of covid_PreCovid: 0.384
Coeff of overall_textblob_polarity: 5.213


## LogReg w Aspects

In [176]:
add_features = ["topic_0", "topic_1", "topic_2", "topic_3", "topic_4"]
features += add_features
features.remove("overall_textblob_polarity")

In [177]:
X = df_5star_final[features]
y = df_5star_final[target]

In [178]:
X = X.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.78


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [179]:
coefficients = logreg.coef_[0]
print("5 Star Baseline + Aspects")
for i in range(len(features)):
    print(f'Coeff of {features[i]}: %.3f' %coefficients[i])

5 Star Baseline + Aspects
Coeff of travel_type_business: -1.093
Coeff of travel_type_couple: -0.763
Coeff of travel_type_family: -0.809
Coeff of travel_type_friends: -0.589
Coeff of travel_type_solo: -0.834
Coeff of is_local: 0.115
Coeff of covid_PreCovid: 0.238
Coeff of topic_0: 1.534
Coeff of topic_1: 1.456
Coeff of topic_2: 0.869
Coeff of topic_3: 1.070
Coeff of topic_4: 0.944
