In [1]:
# 2-2
# Visualizing Business Review Counts
# Yelp Dataset
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the Yelp business records. Each line of the file is parsed as one JSON
# document and becomes one row of the DataFrame.
# The context manager closes the handle even if a line fails to parse, and
# iterating the handle directly avoids building an intermediate list of every
# line via readlines(). The encoding is pinned so the result does not depend on
# the platform's default codec.
with open('yelp_academic_dataset_business.json', encoding='utf-8') as biz_file:
    biz_df = pd.DataFrame([json.loads(line) for line in biz_file])

In [None]:
# Histogram of per-business review counts, 100 bins, counts shown on a log y axis
sns.set_style('whitegrid')
fig, ax = plt.subplots()
biz_df['review_count'].hist(bins=100, ax=ax)
ax.set_yscale('log')
# Axis titles and tick labels all use the same 14pt size
ax.set_xlabel('Review Count', fontsize=14)
ax.tick_params(labelsize=14)
ax.set_ylabel('Occurrence', fontsize=14)

In [None]:
# 2-3
# Quantizing Counts with fixed-width bins
import numpy as np
# Generate 20 random integers uniformly between 0 and 99
small_counts = np.random.randint(0, 100, 20)
print(small_counts)
# Map to evenly spaced bins 0-9 by division.
# Printed explicitly: only the last expression of a cell is displayed automatically.
print(np.floor_divide(small_counts, 10))
# An array of counts that span several magnitudes.
# (Was an empty list, which left nothing to bin here or in the quantile cell.)
large_counts = [296, 8286, 64011, 80, 3, 725, 867, 2215, 7689, 11495,
                91897, 44, 28, 7971, 926, 122, 22222]
# Map to exponential-width bins via the log function: floor(log10(x)) is the
# order of magnitude of x, so 3 -> 0, 296 -> 2, 91897 -> 4.
np.floor(np.log10(large_counts))

In [None]:
# 2-4
# Deciles of the Yelp review counts, printed and then overlaid on the histogram
deciles = biz_df['review_count'].quantile(
    [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
)
print(deciles)

# Histogram with both axes on a log scale
sns.set_style('whitegrid')
fig, ax = plt.subplots()
biz_df['review_count'].hist(bins=100, ax=ax)
# One red vertical line per decile; the handle of the last line drawn is the
# one used for the single legend entry below.
for pos in deciles:
    handle = plt.axvline(x=pos, color='r')
ax.legend(handles=[handle], labels=['deciles'], fontsize=14)
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('Review Count', fontsize=14)
ax.tick_params(labelsize=14)
ax.set_ylabel('Occurrence', fontsize=14)

In [None]:
# 2-5
# Binning counts by quantiles
# Map the counts to quartiles: with labels=False, qcut returns the integer bin
# index (0-3) for each count rather than interval labels.
# Printed explicitly, because only the last expression of a cell is displayed
# and this result was previously discarded.
quartile_bins = pd.qcut(large_counts, 4, labels=False)
print(quartile_bins)
# Compute the quantiles themselves (the three cut points between the quartiles)
large_counts_series = pd.Series(large_counts)
large_counts_series.quantile([0.25, 0.5, 0.75])

In [None]:
# 2-6
# Log Transformation
# Visualizing the distribution of review counts before and after log transform

# Create the transformed column here so this cell does not depend on a later
# cell having been run first. Adding 1 keeps a count of 0 finite (log10(1) = 0).
biz_df['log_review_count'] = np.log10(biz_df['review_count'] + 1)

fig, (ax1, ax2) = plt.subplots(2,1)
# Top panel: raw review counts
biz_df['review_count'].hist(ax=ax1, bins=100)
ax1.tick_params(labelsize=14)
ax1.set_xlabel('review_count', fontsize=14)
ax1.set_ylabel('Occurrence', fontsize=14)

# Bottom panel: log10-transformed review counts
biz_df['log_review_count'].hist(ax=ax2, bins=100)
ax2.tick_params(labelsize=14)
ax2.set_xlabel('log10(review_count)', fontsize=14)
ax2.set_ylabel('Occurrence', fontsize=14)

In [None]:
# 2-8
# Using log transformed Yelp review counts to predict average business rating
import pandas as pd
import numpy as np
import json
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

# Add 1 to the raw count so that a count of 0 maps to log10(1) = 0 instead of -infinity
biz_df['log_review_count'] = np.log10(biz_df['review_count'] + 1)

# Train linear regression models to predict the avg star rating of a business,
# using the review_count feature with and without log transformation.
# Compare the 10-fold cross validation score of the 2 models.
#
# cross_val_score takes (estimator, X, y, ...). The stray `...` placeholder that
# used to sit in the y position has been removed; it pushed the real target into
# the next positional slot. X must be 2-D, hence the double brackets, which
# select a one-column DataFrame instead of a Series.
m_orig = linear_model.LinearRegression()
scores_orig = cross_val_score(m_orig, biz_df[['review_count']], biz_df['stars'], cv=10)

m_log = linear_model.LinearRegression()
scores_log = cross_val_score(m_log, biz_df[['log_review_count']], biz_df['stars'], cv=10)

# Mean fold score with a +/- two-standard-deviation spread. Both lines use the
# same %0.5f format (the second one previously had a stray space flag, "% 0.5f").
print("R-squared score without log transform: %0.5f (+/- %0.5f)" % (scores_orig.mean(), scores_orig.std() * 2))
print("R-squared score with log transform: %0.5f (+/- %0.5f)" % (scores_log.mean(), scores_log.std() * 2))

In [None]:
# 2-9
# Using log transformed word counts in the Online News popularity dataset to predict article popularity
# The original call split on the two-character delimiter ', ', which pandas treats
# as a regex and can only handle with the slower python engine (ParserWarning).
# Splitting on ',' and skipping the blank after each comma parses the same fields
# with the default C engine.
df = pd.read_csv('OnlineNewsPopularity.csv', sep=',', skipinitialspace=True)
# n_tokens_content represents the number of words in the news article
df['log_n_tokens_content'] = np.log10(df['n_tokens_content'] + 1)

# Train two linear regression models to predict the number of shares of an article:
# one using the original feature and the other its log transformed version.
# cross_val_score takes (estimator, X, y, ...); the stray `...` placeholder that
# used to occupy the y position has been removed so df['shares'] is the target.
m_orig = linear_model.LinearRegression()
scores_orig = cross_val_score(m_orig, df[['n_tokens_content']], df['shares'], cv=10)

m_log = linear_model.LinearRegression()
scores_log = cross_val_score(m_log, df[['log_n_tokens_content']], df['shares'], cv=10)

# Mean fold score with a +/- two-standard-deviation spread. Both lines use the
# same %0.5f format (the second one previously had a stray space flag, "% 0.5f").
print("R-squared score without log transform: %0.5f (+/- %0.5f)" % (scores_orig.mean(), scores_orig.std() * 2))
print("R-squared score with log transform: %0.5f (+/- %0.5f)" % (scores_log.mean(), scores_log.std() * 2))

In [None]:
# 2-10
# News popularity: shares plotted against word count, raw (top) and log-transformed (bottom)
fig2, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)

# Top panel: raw word count on the x axis
ax1.scatter(x=df['n_tokens_content'], y=df['shares'])
ax1.set_xlabel('Number of Words in Article', fontsize=14)
ax1.tick_params(labelsize=14)
ax1.set_ylabel('Number of shares', fontsize=14)

# Bottom panel: same target against the log-transformed word count
ax2.scatter(x=df['log_n_tokens_content'], y=df['shares'])
ax2.set_xlabel('Log of Number of Words in Article', fontsize=14)
ax2.tick_params(labelsize=14)
ax2.set_ylabel('Number of shares', fontsize=14)

In [None]:
# 2-11
# Visualizing the correlation between input and output in Yelp business review prediction
# These two visualizations show where the log transformation is more useful; it all
# depends on the type of data and how it is scattered.
#
# Every series here comes from biz_df (the Yelp frame). The cell previously read
# 'stars' and 'log_review_count' from df, which is the news dataset in this notebook.
fig, (ax1, ax2) = plt.subplots(2, 1)
# Top panel: raw review count vs. star rating
ax1.scatter(biz_df['review_count'], biz_df['stars'])
ax1.tick_params(labelsize=14)
ax1.set_xlabel('Review Count', fontsize=14)
ax1.set_ylabel('Avg Star Rating', fontsize=14)

# Bottom panel: log-transformed review count vs. star rating
ax2.scatter(biz_df['log_review_count'], biz_df['stars'])
ax2.tick_params(labelsize=14)
ax2.set_xlabel('Log of Review Count', fontsize=14)
ax2.set_ylabel('Avg Star Rating', fontsize=14)

In [None]:
# 2-12
# Power Transform
# BOX-COX TRANSFORMATION
# Box-Cox Transformation of Yelp Review Counts
from scipy import stats
# This transformation assumes that input_data is positive.
# Check the min to make sure
print(biz_df['review_count'].min())

# Setting the input parameter lmbda to 0 gives us the log transform (without constant offset)
rc_log = stats.boxcox(biz_df['review_count'], lmbda=0)
# By default, the scipy implementation of Box-Cox transform finds the lambda
# parameter that will make the output the closest to a normal distribution
rc_bc, bc_params = stats.boxcox(biz_df['review_count'])
print(bc_params)

# Store both transformed arrays as columns: the histograms below read them from
# biz_df, and these columns were previously never created.
biz_df['rc_log'] = rc_log
biz_df['rc_bc'] = rc_bc

# Visualizing the histogram of original, log transformed, and Box-Cox transformed counts
fig, (ax1, ax2, ax3) = plt.subplots(3, 1)
# Original review count histogram
biz_df['review_count'].hist(ax=ax1, bins=100)
ax1.set_yscale('log')
ax1.tick_params(labelsize=14)
ax1.set_title('Review Counts Histogram', fontsize = 14)
ax1.set_xlabel('')
ax1.set_ylabel('Occurrence', fontsize=14)

# Review count after log transform
biz_df['rc_log'].hist(ax=ax2, bins=100)
ax2.set_yscale('log')
ax2.tick_params(labelsize=14)
ax2.set_title('Log Transformed Counts Histogram', fontsize = 14)
ax2.set_xlabel('')
ax2.set_ylabel('Occurrence', fontsize=14)

# Review Counts after optimal Box-Cox transform
biz_df['rc_bc'].hist(ax=ax3, bins=100)
ax3.set_yscale('log')
ax3.tick_params(labelsize=14)
ax3.set_title('Box-Cox Transformed Counts Histogram', fontsize = 14)
ax3.set_xlabel('')
ax3.set_ylabel('Occurrence', fontsize=14)

In [None]:
# 2-14
# Probability Plots of original and transformed counts against the normal distribution
# The transformed data is taken from the rc_log / rc_bc arrays returned by
# stats.boxcox in the Box-Cox cell, rather than from biz_df columns of the same
# name, so this cell only depends on values that cell actually defines.
fig2, (ax1, ax2, ax3) = plt.subplots(3, 1)
# Raw review counts
prob1 = stats.probplot(biz_df['review_count'], dist=stats.norm, plot=ax1)
ax1.set_xlabel('')
ax1.set_title('ProbPlot against normal distribution')

# Log transform (Box-Cox with lmbda=0)
prob2 = stats.probplot(rc_log, dist=stats.norm, plot=ax2)
ax2.set_xlabel('')
ax2.set_title('ProbPlot after log transform')

# Box-Cox with the fitted lambda
prob3 = stats.probplot(rc_bc, dist=stats.norm, plot=ax3)
ax3.set_xlabel('Theoretical Quantiles')
ax3.set_title('ProbPlot after Box-Cox transform')

In [None]:
# 2-15
# Feature Scaling and Normalization
# Feature Scaling Implementation
import pandas as pd
import sklearn.preprocessing as preproc
# Load data. The same file is read elsewhere in this notebook with ', ' as the
# delimiter, which suggests a blank follows each comma; skipinitialspace=True
# drops it so column names such as 'n_tokens_content' carry no leading space.
df = pd.read_csv("OnlineNewsPopularity.csv", delimiter = ',', skipinitialspace=True)
# Look at the original data- the number of words in an article
# (Series.as_matrix() has been removed from pandas; to_numpy() is its replacement.)
print(df['n_tokens_content'].to_numpy())

# scikit-learn's StandardScaler and normalize reject 1-D input; they expect
# (n_samples, n_features). Select the column as a one-column DataFrame and
# flatten each result back to 1-D before storing it as a column.
word_counts = df[['n_tokens_content']]

# Min-Max Scaling
df['minmax'] = preproc.minmax_scale(word_counts).ravel()
print(df['minmax'].to_numpy())

# Standardization- some outputs can be -ve
df['standardized'] = preproc.StandardScaler().fit_transform(word_counts).ravel()
print(df['standardized'].to_numpy())

# L-2 Normalization (axis=0 normalizes the feature column, not each row)
df['l2_normalized'] = preproc.normalize(word_counts, axis = 0).ravel()
print(df['l2_normalized'].to_numpy())

# Plotting Histograms of original and scaled data
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1)

# The column is 'n_tokens_content' (the cell previously asked for 'n_tokens_count')
df['n_tokens_content'].hist(ax=ax1, bins=100)
ax1.tick_params(labelsize=14)
ax1.set_xlabel('Article Word Count', fontsize=14)
ax1.set_ylabel('Number of Articles', fontsize=14)

df['minmax'].hist(ax=ax2, bins=100)
ax2.tick_params(labelsize=14)
ax2.set_xlabel('Min-Max Scaled Word Count', fontsize=14)
ax2.set_ylabel('Number of Articles', fontsize=14)

df['standardized'].hist(ax=ax3, bins=100)
ax3.tick_params(labelsize=14)
ax3.set_xlabel('Standardized Word Count', fontsize=14)
ax3.set_ylabel('Number of Articles', fontsize=14)

df['l2_normalized'].hist(ax=ax4, bins=100)
ax4.tick_params(labelsize=14)
ax4.set_xlabel('L-2 Normalized Word Count', fontsize=14)
ax4.set_ylabel('Number of Articles', fontsize=14)

# Lay the panels out only after labels exist, so their sizes are accounted for
fig.tight_layout()


In [None]:
# 2-17
# Interaction Features Prediction
# Better method than taking just one feature into consideration for prediction...
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preproc

# Assume df is a dataframe containing UCI Online News Popularity dataset
print(df.columns)
# Select the content-based features as singleton features in the model, skipping over the derived features.
# (The list was previously empty, which selects zero columns and leaves nothing to fit.)
# NOTE(review): names follow the UCI attribute list and assume the header was
# loaded without leading spaces -- confirm against the df.columns output above.
features = ['n_tokens_title', 'n_tokens_content',
            'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
            'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
            'average_token_length', 'num_keywords',
            'data_channel_is_lifestyle', 'data_channel_is_entertainment',
            'data_channel_is_bus', 'data_channel_is_socmed',
            'data_channel_is_tech', 'data_channel_is_world']
X = df[features]
y = df['shares']

# Create pairwise interaction features, skipping the constant bias term.
# With the default settings (degree 2, interaction_only=False) the output also
# contains each feature squared, not only the cross products.
X2 = preproc.PolynomialFeatures(include_bias=False).fit_transform(X)
print(X2.shape)

# Create train/test sets for both feature sets. Splitting X, X2 and y in a single
# call keeps the same rows in train/test across both feature sets.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(X, X2, y, test_size=0.3, random_state=123)

def evaluate_feature(X_train, X_test, y_train, y_test):
    """Fit a linear regression on the training split and score it on the test split.

    Returns a ``(model, score)`` tuple: the fitted ``LinearRegression`` and the
    value of its ``score`` method evaluated on ``(X_test, y_test)``.
    """
    regressor = linear_model.LinearRegression()
    regressor.fit(X_train, y_train)
    test_score = regressor.score(X_test, y_test)
    return regressor, test_score

# Fit and score one model per feature set: singleton features first, then the
# expanded pairwise feature matrix
m1, r1 = evaluate_feature(X1_train, X1_test, y_train, y_test)
m2, r2 = evaluate_feature(X2_train, X2_test, y_train, y_test)

# Report the held-out score of each model
print("R-Squared Score with singleton features: %0.5f" % r1)
print("R-Squared Score with pairwise features: %0.10f" % r2)