In [2]:
import pandas as pd

data_file = "data/data.csv"
df = pd.read_csv(data_file)

In [3]:
text_col = "yelp review"
label_col = "stars"

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[text_col], df[label_col], test_size=0.2, random_state=42)

# Create a pipeline with TfidfVectorizer and LogisticRegression
model = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Train the model
model.fit(X_train, y_train)

In [5]:
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy}")

Model accuracy: 0.8982506247768655


In [6]:
# Function to predict star label for a new review
def predict_star_label(review):
    return model.predict([review])[0]

# Example usage
new_review = "The food was great and the service was excellent!"
predicted_label = predict_star_label(new_review)
print(f"Predicted star label: {predicted_label}")

Predicted star label: 2


In [7]:
# Combine the test reviews and their predicted labels into a DataFrame
results_df = pd.DataFrame({'yelp review': X_test, 'stars': y_test, 'predicted stars': y_pred})

# Display the results
print(results_df.head())

                                            yelp review  stars  \
2900  You wouldn't believe the experience we had her...      1   
3143  WHOA!  So many things to look at and buy!!!  Y...      2   
8231  I have eaten her twice. I have been absolutely...      2   
3855  Water tastes like it's piped in from the Rio S...      1   
8045  I took my car to the location on at Tempe Mark...      1   

      predicted stars  
2900                1  
3143                2  
8231                2  
3855                1  
8045                1  


In [8]:
results_df.to_csv('data/results.csv', index=False)

In [9]:
# Print the pipeline steps
print(model)

# Print the parameters of each step
for step in model.named_steps:
    print(f"Step: {step}")
    print(model.named_steps[step].get_params())

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('logisticregression', LogisticRegression())])
Step: tfidfvectorizer
{'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}
Step: logisticregression
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [10]:
import joblib

# Save the model to a file
model_filename = 'data/model.pkl'
joblib.dump(model, model_filename)

['data/model.pkl']

In [11]:
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract TP, FP, TN, FN values
TP = conf_matrix[1, 1]
FP = conf_matrix[0, 1]
TN = conf_matrix[0, 0]
FN = conf_matrix[1, 0]

print(f"TP: {TP}, FP: {FP}, TN: {TN}, FN: {FN}")

TP: 1256, FP: 133, TN: 1260, FN: 152


In [12]:
# Calculate the predicted positive rate for each actual star rating
predicted_positive_rate = results_df.groupby('stars')['predicted stars'].mean()

# Display the predicted positive rate
print(predicted_positive_rate)
predicted_positive_rate_df = predicted_positive_rate.reset_index()
print(predicted_positive_rate_df)

# Check for bias
if predicted_positive_rate.std() > 0.1:  # Threshold can be adjusted
    print("The model shows bias across different star ratings.")
else:
    print("The model does not show significant bias across different star ratings.")

print(predicted_positive_rate.std())

stars
1    1.095477
2    1.892045
Name: predicted stars, dtype: float64
   stars  predicted stars
0      1         1.095477
1      2         1.892045
The model shows bias across different star ratings.
0.5632586822842476
