In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
import re


# Path of the raw hotel-reviews CSV that the rest of the notebook works from.
DATA_PATH = '/content/Hotel_Reviews.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Look at the distinct average scores (the value is not displayed because it
# is not the last expression of the cell).
df['Average_Score'].unique()

# Integer version of the hotel's average score, used later to derive labels.
df["Average_Score_Round"] = df["Average_Score"].map(
    lambda score: int(round(score))
)


In [None]:

def label_reviews(row):
  """Map a row's rounded average score onto a 1-5 class label.

  Parameters
  ----------
  row : mapping-like
      Anything indexable by 'Average_Score_Round' (a DataFrame row, a dict).

  Returns
  -------
  int
      1 for scores of 6 or lower, 2 for 7, 3 for 8, 4 for 9 and 5 for
      anything higher.
  """
  review = row['Average_Score_Round']
  # The original test was `review == 6 and review == 5`, which can never be
  # true, so the lowest scores fell through to the final branch and were
  # labelled 5 (best). Everything at or below 6 belongs to the lowest class.
  if review <= 6:
    return 1
  if review == 7:
    return 2
  if review == 8:
    return 3
  if review == 9:
    return 4
  return 5


def createLabelsFromReviewPoints(df):
  """Attach a 'Target' column computed row by row with label_reviews.

  The frame passed in is modified in place and also returned, so the call
  can be used either for its side effect or as an expression.
  """
  targets = df.apply(label_reviews, axis=1)
  df['Target'] = targets
  return df
  
# Add the 'Target' label column; the function mutates df and returns it,
# so the rebinding keeps the same object under the same name.
df = createLabelsFromReviewPoints(df)


In [None]:
# Work on a fixed-size random subset of the reviews. random_state pins the
# draw so that a fresh Restart & Run All selects the same 160000 rows and the
# reported metrics can be reproduced (same seed as the train/test split).
df1 = df.sample(n=160000, random_state=101)

In [None]:
# One document per review: negative part followed by positive part. The
# explicit space stops the last word of the negative text from fusing with
# the first word of the positive text when neither side carries padding.
df1["Review"] = df1["Negative_Review"] + " " + df1["Positive_Review"]

In [None]:
# Take an explicit copy so the assignments below write to an independent
# frame instead of a slice of df1 (avoids SettingWithCopyWarning).
df_final = df1[['Review','Target','Negative_Review','Positive_Review']].copy()

# remove 'No Negative' or 'No Positive' from text
df_final["Negative_Review"] = df_final["Negative_Review"].str.replace("No Negative", "", regex=False)
df_final["Positive_Review"] = df_final["Positive_Review"].str.replace("No Positive", "", regex=False)

# 'Review' was assembled before the placeholders were stripped, so it still
# contained them; rebuild it from the cleaned parts so the text fed to the
# models no longer includes "No Negative" / "No Positive".
df_final["Review"] = df_final["Negative_Review"] + " " + df_final["Positive_Review"]

In [None]:
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

In [None]:
def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The first letter of the tag decides the result: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag is treated as a noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags default to noun.
    return wordnet.NOUN

In [None]:
# Cache for the NLTK objects clean_text needs; filled on first use.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use.

    Built lazily so that defining this cell does not require the NLTK corpora
    to be present yet, and cached so the stop-word list is loaded once rather
    than once per review.
    """
    if not _CLEAN_TEXT_CACHE:
        # A frozenset gives O(1) membership tests instead of scanning a list.
        _CLEAN_TEXT_CACHE["stop"] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_CACHE["lemmatizer"] = WordNetLemmatizer()
    return _CLEAN_TEXT_CACHE["stop"], _CLEAN_TEXT_CACHE["lemmatizer"]


def clean_text(text):
    """Normalise one review into a space-joined string of lemmas.

    Steps: lower-case, split on whitespace, trim punctuation from token
    edges, drop tokens that are empty, contain a digit or are English stop
    words, POS-tag, lemmatize with the tag-derived WordNet POS, drop
    one-character results and join with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    stop, lemmatizer = _clean_text_resources()
    # split() with no argument breaks on any whitespace run (tabs and newlines
    # too), not just the literal space character.
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]
    # The three filters are independent of each other, so they are applied in
    # a single pass.
    tokens = [
        t for t in tokens
        if t and t not in stop and not any(c.isdigit() for c in t)
    ]
    # pos tag text
    tagged = pos_tag(tokens)
    # lemmatize text, reusing one lemmatizer for every token
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
    # remove words with only one letter and join all
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

In [None]:
import nltk
# Fetch the NLTK data packages used by the text-cleaning step: the stop-word
# corpus, the perceptron POS tagger and WordNet.
for resource in ('stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

In [None]:
# Replace each combined review with its cleaned, lemmatized form.
df_final['Review'] = df_final['Review'].map(clean_text)

In [None]:
#Splitting the data to train and test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are the cleaned review text; the label is the 'Target' class.
X, y = df_final["Review"], df_final["Target"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
# NOTE(review): `my_tokenizer` is not defined in the visible notebook, so on
# a fresh kernel this cell raised NameError. If it is defined elsewhere that
# definition is used; otherwise fall back to whitespace splitting, which
# matches the space-joined output of clean_text.
if "my_tokenizer" not in globals():
    my_tokenizer = str.split

# stop_words must be the string 'english' to select scikit-learn's built-in
# list; the previous {'english'} was a one-element stop list that only
# removed the literal token "english".
has_vec = HashingVectorizer(
    tokenizer=my_tokenizer,
    stop_words='english',
    ngram_range=(1, 3),
    alternate_sign=False,
    strip_accents='unicode',
    n_features=8000,
).fit(X_train)

train_hash = has_vec.transform(X_train)
test_hash = has_vec.transform(X_test)

# Keep the inspection frame sparse: densifying a (rows x 8000) matrix with
# .toarray() needs several gigabytes for the full training split.
Hash_words = pd.DataFrame.sparse.from_spmatrix(train_hash)

In [None]:
# fitting a logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
import warnings
# NOTE: this silences every warning for the rest of the session, including
# scikit-learn convergence warnings from the models below.
warnings.filterwarnings("ignore")

# Fitting Logistic regression to the training set
logreg = LogisticRegression()
logreg.fit(train_hash, y_train)

# Predicting the test set results (the stray closing parenthesis that made
# this line a SyntaxError has been removed)
y_pred_logreg = logreg.predict(test_hash)

# Report the share of correctly classified test reviews, as a percentage.
logreg_accuracy = np.mean(y_pred_logreg == np.asarray(y_test))
print(f"Accuracy: {logreg_accuracy * 100:.2f}")

Accuracy: 60.14


In [None]:
from sklearn.svm import LinearSVC
# Train a one-vs-rest linear SVM on the hashed features; fit() returns the
# estimator, so construction and training are chained.
svc = LinearSVC(multi_class='ovr').fit(train_hash, y_train)

# Class predictions for the held-out reviews
y_pred_svc = svc.predict(test_hash)



In [None]:
# Build a Multinomial Naive Bayes model and fit it to the training set
from sklearn.naive_bayes import MultinomialNB
# Fit the Naive Bayes classifier on the hashed training features, then
# predict the class of every held-out review.
classifier1 = MultinomialNB().fit(train_hash, y_train)
pred = classifier1.predict(test_hash)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small (5-tree) random forest with a fixed seed, trained on the hashed
# features.
rf = RandomForestClassifier(
    n_estimators=5,
    random_state=1,
).fit(train_hash, y_train)

# Class predictions for the held-out reviews
y_pred_rf = rf.predict(test_hash)

In [None]:
# Performance comparison using machine learning algorithms
from sklearn import ensemble, linear_model, neighbors, svm, tree, neural_network
from sklearn import svm, model_selection, tree, linear_model, neighbors, naive_bayes, ensemble
from sklearn.metrics import mean_squared_error,confusion_matrix, precision_score, recall_score, auc,roc_curve

# Candidate models for the side-by-side comparison. The estimators that use
# randomness are seeded so the comparison table is the same on every run.
MLA = [
       linear_model.LogisticRegressionCV(),
       # Naive Bayes
       naive_bayes.MultinomialNB(),
       svm.LinearSVC(random_state=1),
       ensemble.RandomForestClassifier(random_state=1),
]

In [None]:
# Training the data into model and calculating performance 
# Train every candidate model and collect one summary record per model.
# Column names are kept exactly as before (including the 'MLA Precission'
# spelling) because they are the keys other cells look up.
MLA_columns = [
    'MLA Name',
    'MLA Train Accuracy',
    'MLA Test Accuracy',
    'MLA Precission',
    'MLA Recall',
]
MLA_records = []

for alg in MLA:
  predicted = alg.fit(train_hash, y_train).predict(test_hash)
  MLA_records.append({
      'MLA Name': alg.__class__.__name__,
      'MLA Train Accuracy': round(alg.score(train_hash, y_train), 4),
      # Accuracy from the predictions already in hand, so the test set is
      # not predicted a second time.
      'MLA Test Accuracy': round(float(np.mean(predicted == np.asarray(y_test))), 4),
      'MLA Precission': precision_score(y_test, predicted, average='weighted'),
      'MLA Recall': recall_score(y_test, predicted, average='weighted'),
  })

# Build the table once from the records (no cell-by-cell growth) and rank
# the models by test accuracy, best first.
MLA_compare = (
    pd.DataFrame(MLA_records, columns=MLA_columns)
    .sort_values(by=['MLA Test Accuracy'], ascending=False)
)
MLA_compare


In [None]:
# Bar chart of training accuracy, one bar per model in MLA_compare.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=MLA_compare,
    x="MLA Name",
    y="MLA Train Accuracy",
    palette='hot',
    edgecolor=sns.color_palette('dark', 7),
    ax=ax,
)
plt.xticks(rotation=50)
plt.title('MLA Train Accuracy Comparison')
plt.show()

In [None]:
# Bar chart of test accuracy, one bar per model in MLA_compare.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=MLA_compare,
    x="MLA Name",
    y="MLA Test Accuracy",
    palette='hot',
    edgecolor=sns.color_palette('dark', 7),
    ax=ax,
)
plt.xticks(rotation=50)
plt.title('MLA Test Accuracy Comparison')
plt.show()