In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
data = pd.read_csv('Hotel_Reviews.csv')

# Combine the negative and positive reviews into a single review text
data['Combined_Review'] = data['Negative_Review'] + ' ' + data['Positive_Review']

# Define the target variable as 'positive' if Reviewer_Score >= 5, otherwise 'negative'
data['satisfaction'] = data['Reviewer_Score'].apply(lambda x: 'positive' if x >= 5 else 'negative')
X = data['Combined_Review']
y = data['satisfaction']

# Check for class imbalance
print("Class distribution:\n", y.value_counts())

# Vectorize the reviews with increased max features and bi-grams
vectorizer = CountVectorizer(max_features=2000, ngram_range=(1, 2))  # Use up to 2000 features and include bi-grams
X = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Train the Logistic Regression model with balanced class weights
model = LogisticRegression(max_iter=200, class_weight='balanced')
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Classification report,accuracy score and confusion matrix
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix for Logistic Regression:")
print(confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Sample reviews for demonstration (adjust or expand this if you want to predict on specific samples)
sample_reviews = [
    "Not worth the money. Very disappointed.",
    "I loved the product, it works great!",
    "The delivery was late and unprofessional.",
    "Amazing quality! Will buy again.",
    "The service was terrible and unhelpful."
]

# Predict and assign star ratings (5 for positive, 1 for negative) for each sample review
results = []
for review in sample_reviews:
    review_vector = vectorizer.transform([review])
    predicted_sentiment = model.predict(review_vector)[0]
    star_rating = 5 if predicted_sentiment == "positive" else 1
    results.append({"review": review, "predicted_sentiment": predicted_sentiment, "star_rating": star_rating})

# Create a results DataFrame
results_df = pd.DataFrame(results)
print("\nResults DataFrame:")
print(results_df)

# Calculate the overall rating as the mean of predicted ratings
overall_rating = results_df["star_rating"].mean()
print(f"\nOverall Rating (Decimal): {overall_rating:.2f}")
print(f"Overall Rating (Rounded): {round(overall_rating)}")


Class distribution:
 satisfaction
positive    493457
negative     22281
Name: count, dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.20      0.85      0.32     11080
    positive       0.99      0.85      0.91    246789

    accuracy                           0.85    257869
   macro avg       0.60      0.85      0.62    257869
weighted avg       0.96      0.85      0.89    257869

Confusion Matrix for Logistic Regression:
[[  9373   1707]
 [ 37911 208878]]
Accuracy: 0.85

Results DataFrame:
                                      review predicted_sentiment  star_rating
0    Not worth the money. Very disappointed.            negative            1
1       I loved the product, it works great!            positive            5
2  The delivery was late and unprofessional.            positive            5
3           Amazing quality! Will buy again.            positive            5
4    The service was terrible and unhelpful.            negative            1

Overall Rating (Decimal): 3.40
Overall Rating (R