In [1]:
The Effects of Feature Scaling: From Bag-of-Words to Tf-Idf

In [None]:
# Tf-Idf -> Term Frequency Inverse Document Frequency -> A Better Approach

# 4-1
# Loading and cleaning the Yelp reviews dataset in Python
import json
import pandas as pd

# Load data (business data). Each line of the file is parsed as one JSON object.
# The context manager closes the handle even if a line fails to parse, and
# iterating over the handle avoids building the full list of lines first, as
# readlines() does. The encoding is pinned because JSON text is UTF-8, whereas
# open() otherwise falls back to the platform's locale encoding.
with open('yelp_academic_dataset_business.json', encoding='utf-8') as biz_f:
    biz_df = pd.DataFrame([json.loads(x) for x in biz_f])

In [None]:
# Load Yelp reviews data. Each line of the file is parsed as one JSON object.
# The context manager closes the handle even if a line fails to parse, and
# iterating over the handle avoids holding every raw line in memory alongside
# the parsed records, as readlines() does. The encoding is pinned because JSON
# text is UTF-8, whereas open() otherwise uses the platform's locale encoding.
with open('yelp_academic_dataset_review.json', encoding='utf-8') as review_file:
    review_df = pd.DataFrame([json.loads(x) for x in review_file])

In [None]:
def _has_category(categories, name):
    """Return True when `name` occurs in `categories`.

    `categories` is tested with the `in` operator. A value that does not
    support `in` (for example None or NaN on a business with no categories
    recorded) counts as "not in that category" instead of raising TypeError
    and aborting the whole filter.
    """
    try:
        return name in categories
    except TypeError:
        return False


# Pull out only Nightlife and Restaurants businesses
two_biz = biz_df[biz_df['categories'].apply(
    lambda cats: _has_category(cats, 'Nightlife') or _has_category(cats, 'Restaurants'))]

# Join with the reviews to get all reviews on the two types of business
twobiz_reviews = two_biz.merge(review_df, on='business_id', how='inner')

# Trim away the features we won't use. 'stars_y' is the merge-suffixed 'stars'
# column coming from the right-hand frame (review_df). The explicit copy makes
# the result an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
twobiz_reviews = twobiz_reviews[['business_id', 'name', 'stars_y', 'text', 'categories']].copy()

# Create the target column--True for Nightlife businesses, and False otherwise.
# The column must be set on twobiz_reviews: `two_biz.reviews` is an attribute
# lookup on the business frame and raises AttributeError.
twobiz_reviews['target'] = twobiz_reviews['categories'].apply(
    lambda cats: _has_category(cats, 'Nightlife'))

In [None]:
# Creating a balanced classification dataset
# When the classes in a dataset are unbalanced, subsample them so that each class contributes a roughly equal number of examples.

from sklearn.model_selection import train_test_split
# Create a class-balanced subsample.
# Build one boolean mask per category; a review whose business lists both
# categories is selected by both masks.
nightlife_mask = twobiz_reviews.apply(lambda row: 'Nightlife' in row['categories'], axis=1)
restaurants_mask = twobiz_reviews.apply(lambda row: 'Restaurants' in row['categories'], axis=1)
nightlife = twobiz_reviews.loc[nightlife_mask]
restaurants = twobiz_reviews.loc[restaurants_mask]

# Seeded draws keep the subsample reproducible.
# NOTE(review): the two fractions were presumably picked so both subsets end
# up about the same size -- confirm against the actual row counts.
nightlife_subset = nightlife.sample(random_state=123, frac=0.1)
restaurant_subset = restaurants.sample(random_state=123, frac=0.021)
combined = pd.concat([nightlife_subset, restaurant_subset])

# Split into training (70%) and test datasets, then report both shapes
training_data, test_data = train_test_split(combined, random_state=123, train_size=0.7)
for split in (training_data, test_data):
    print(split.shape)

In [None]:
# Transform Features

# Represent the review text as a bag-of-words
from sklearn.feature_extraction import text

train_text, test_text = training_data['text'], test_data['text']

# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same fitted vectorizer.
bow_transform = text.CountVectorizer()
X_tr_bow = bow_transform.fit_transform(train_text)
X_te_bow = bow_transform.transform(test_text)

# Number of distinct terms in the learned vocabulary
vocabulary_size = len(bow_transform.vocabulary_)
print(vocabulary_size)

# Classification labels for each split
y_tr, y_te = training_data['target'], test_data['target']

In [None]:
# `preproc` is used below but is not imported in any cell shown, so the
# normalize() calls raised NameError. Import it next to its first use, like
# the other scikit-learn imports in this notebook.
import sklearn.preprocessing as preproc

# Create the tf-idf representation using the bag-of-words matrix.
# norm=None turns off the transformer's row normalization. The transformer is
# fitted on the training matrix only and then reused on the test matrix.
tfidf_trfm = text.TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_trfm.transform(X_te_bow)

# L2-normalize the bag-of-words representation.
# axis=0 rescales each feature (column) rather than each document (row).
# NOTE(review): the train and test matrices are each scaled by their own
# column norms, so the same word count maps to different feature values in
# the two splits -- confirm this is intended.
X_tr_l2 = preproc.normalize(X_tr_bow, axis=0)
X_te_l2 = preproc.normalize(X_te_bow, axis=0)

In [None]:
# Training Logistic regression classifiers with default params
from sklearn.linear_model import LogisticRegression
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """Fit a logistic regression classifier and print its score on the test split.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels, passed to ``fit``.
    X_test, y_test
        Held-out features and labels, passed to ``score``.
    description
        Label for the feature set; used only in the printed message.
    _C
        Forwarded as the ``C`` argument of ``LogisticRegression``. The default
        of 1.0 matches scikit-learn's own default, so calls that omit it
        behave exactly as before. The notebook's final-training cell passes
        the grid-search winner through this parameter.

    Returns
    -------
    The fitted model.
    """
    m = LogisticRegression(C=_C).fit(X_tr, y_tr)
    s = m.score(X_test, y_test)
    print('Test score with ', description, ' features: ', s)
    return m

In [None]:
# Fit one default-parameter classifier per feature representation, in the
# order bag-of-words, L2-normalized, tf-idf.
m1, m2, m3 = [
    simple_logistic_classify(train_X, y_tr, test_X, y_te, label)
    for train_X, test_X, label in (
        (X_tr_bow, X_te_bow, 'bow'),
        (X_tr_l2, X_te_l2, 'l2-normalized'),
        (X_tr_tfidf, X_te_tfidf, 'tf-idf'),
    )
]

In [None]:
# On these untuned runs bag-of-words may appear to be the best representation, but that conclusion is not reliable:
# comparing classifiers that have not been tuned is a common pitfall when deciding which one to use.

In [None]:
# To tune a model we combine resampling (k-fold cross-validation) with regularization:
# the regularization strength is a hyperparameter, and cross-validation scores each candidate value on held-out folds.
# Grid search is generally used for hyperparameter tuning.

# Tuning logistic regression hyperparameters with grid search
import sklearn.model_selection as modsel
# Candidate values for the C parameter; every feature set is searched over
# the same grid with 5-fold cross-validation.
param_grid_ = {'C': [1e-5, 1e-3, 1e-1, 1e0, 1e1, 1e2]}


def _new_logistic_search():
    """Return an unfitted 5-fold grid search over `param_grid_` for logistic regression."""
    return modsel.GridSearchCV(LogisticRegression(), param_grid=param_grid_, cv=5)


# Bag-of-words representation
bow_search = _new_logistic_search()
bow_search.fit(X_tr_bow, y_tr)

# L2-normalized word vectors
l2_search = _new_logistic_search()
l2_search.fit(X_tr_l2, y_tr)

# Tf-idf representation
tfidf_search = _new_logistic_search()
tfidf_search.fit(X_tr_tfidf, y_tr)

In [None]:
# cv_results_ is a dict of column-like arrays; printing it dumps an unreadable
# wall of text. Wrapping it in a DataFrame and leaving it as the cell's last
# expression renders it as a table.
pd.DataFrame(bow_search.cv_results_)

In [None]:
# Plotting the cross validation results in a box-and-whiskers plot to visualize and compare classifier performance
import matplotlib.pyplot as plt
import seaborn as sns

# One column per feature representation, holding the mean cross-validated
# test score of every candidate in the grid.
fitted_searches = (('bow', bow_search), ('tfidf', tfidf_search), ('l2', l2_search))
search_results = pd.DataFrame({
    label: search.cv_results_['mean_test_score'] for label, search in fitted_searches
})

sns.set_style("whitegrid")

# One box per representation
ax = sns.boxplot(width=0.4, data=search_results)
ax.set_ylabel('Accuracy', size=14)
ax.tick_params(labelsize=14)

In [None]:
# Final training and testing step to compare the different feature sets
# Train a final model on the entire training set, using the best hyperparameter
# settings found prev. Measure accuracy on the test set...
# Refit one classifier per feature representation, each with the C value its
# own grid search selected; order is bag-of-words, L2-normalized, tf-idf.
m1, m2, m3 = [
    simple_logistic_classify(train_X, y_tr, test_X, y_te, label, _C=search.best_params_['C'])
    for train_X, test_X, label, search in (
        (X_tr_bow, X_te_bow, 'bow', bow_search),
        (X_tr_l2, X_te_l2, 'l2-normalized', l2_search),
        (X_tr_tfidf, X_te_tfidf, 'tf-idf', tfidf_search),
    )
]

# See the test accuracies