# Predicting the Weekly Gross for Broadway shows
#### Sidorenko Elena

## Getting Started

In [None]:
import pandas as pd    
import seaborn as sns   
import matplotlib.pyplot as plt 
import numpy as np   
import datetime as dt
import matplotlib.image as mpimg
import statsmodels.api as sm 

In [None]:
# Load the raw Broadway data set from a CSV file in the working directory
df = pd.read_csv('broadway.csv')

## Data inspection

In [None]:
# Inspecting the data
# Only the last expression of a cell is rendered automatically, so the preview
# is passed to display() explicitly; otherwise its output is silently discarded.
display(df.head(3))
df.info()
# can already see that there are missing observations in 'potential_gross' and 'top_ticket_price'. This can later be cleaned
# week_ending should be set as date type
# potentially setting 'show' as a category to later group top shows
# potentially setting 'theatre' as a category to later group top theatres

In [None]:
# Summary statistics of the raw data
df.describe()

In [None]:
# Number of distinct shows in the raw data
df['show'].nunique()

In [None]:
# Smallest 'week_ending' value (the column has not been converted to datetime yet at this point)
df['week_ending'].min()

In [None]:
# Largest 'week_ending' value (the column has not been converted to datetime yet at this point)
df['week_ending'].max()

In [None]:
# Highest 'week_number' in the data and the first show (in row order) that reaches it
max_weeks = df['week_number'].max()
max_show  = df.at[df['week_number'].idxmax(), 'show']

print(f"The show with the maximum number of weeks is '{max_show}' with {max_weeks} weeks.")

In [None]:
# data dictionary
# Read the data-dictionary screenshot from the working directory and render it inline
img         = mpimg.imread('DataDictionary.PNG') 
imgplot     = plt.imshow(img)
plt.show()

## Preparing the data set

In [None]:
# Step 1. Number of observations (rows) before any cleaning
len(df)

In [None]:
# Step 2. Changing variable names
# Mapping of old column name -> new column name
column_renames = {'pct_capacity': 'percent_capacity'}
df = df.rename(columns=column_renames)

In [None]:
# Step 3. Changing variable type

# 'show' and 'theatre' become categoricals in one astype call
df = df.astype({'show': 'category', 'theatre': 'category'})

# 'week_ending' is parsed into datetime values
df = df.assign(week_ending=pd.to_datetime(df['week_ending']))

### Dealing with categorical variables

In [None]:
# Step 4. Categorical variables 1: Changing value of observation 
# no issues found to perform this step

In [None]:
# Step 5. Count the distinct shows and theatres

distinct_counts    = df[['show', 'theatre']].nunique()
unique_shows       = distinct_counts['show']
unique_theatres    = distinct_counts['theatre']
print('unique shows: ', unique_shows,' unique theatres: ', unique_theatres)
# Both columns have many levels, so they are either grouped into "top 10 vs. others" or restricted to the top 10.

In [None]:
# Step 6. The 10 shows and the 10 theatres with the most rows in the data

show_frequencies    = df['show'].value_counts()      # sorted, most frequent first
theatre_frequencies = df['theatre'].value_counts()

top10_shows         = show_frequencies.iloc[:10]
top10_theatres      = theatre_frequencies.iloc[:10]

top10_shows_list    = top10_shows.index.tolist()
top10_theatres_list = top10_theatres.index.tolist()

In [None]:
# Step 7. Categorical variables 2: Binarizing categorical variables (where this makes sense!)

# Exact membership tests are used instead of a regular expression. The previous
# version joined the already-joined pattern string a second time, which put a
# '|' between every single character and therefore flagged (almost) every row.
# isin() also avoids treating characters such as '(' or '.' in a name as regex syntax.

# Top 10 most frequent theatres are flagged 1, all others 0
df['top_10_theatres'] = np.where(df['theatre'].isin(top10_theatres_list), 1, 0)

# Top 10 most frequent shows are flagged 1, all others 0
df['top_10_shows']    = np.where(df['show'].isin(top10_shows_list), 1, 0)

In [None]:
# Row counts of the 100 most frequent shows, printed as plain text
print(df['show'].value_counts().head(100))

In [None]:
# Step 8. Categorical variables 3: removing observations where values are not relevant for case at hand
# Shows with only a few rows are removed so the analysis concentrates on
# shows that ran long enough to attract public attention.
MIN_ROWS_PER_SHOW = 78


def _has_enough_rows(show_rows):
    """Return True when the group of rows for one show has at least MIN_ROWS_PER_SHOW rows."""
    return len(show_rows) >= MIN_ROWS_PER_SHOW


df = df.groupby('show').filter(_has_enough_rows)

# NOTE(review): earlier notes for this step mentioned "top 300 shows" and a
# cutoff of 95 observations, while the code keeps shows with >= 78 rows.
# Confirm which threshold is intended.

In [None]:
# Number of distinct shows left after the filter above
print(df['show'].nunique())

### Checking whether to remove or replace missing values

In [None]:
# Step 9. Categorical variables 4: Generating empty values where value does not make sense
# NOTE: this is an alias, not a copy. Every change made through df_backup is
# also made to df (Step 10 drops the helper column created here from df).
df_backup = df

# Candidate formula for potential_gross: seats_in_theatre * avg_ticket_price
estimated_gross = df_backup['seats_in_theatre'] * df_backup['avg_ticket_price']

# Filling Null potential_gross values with the estimate. The result is assigned
# back to the column: fillna(..., inplace=True) on a column selection is a
# chained assignment, which newer pandas versions warn about or ignore, and the
# vectorised product replaces a slow row-by-row apply.
df_backup['potential_gross'] = df_backup['potential_gross'].fillna(estimated_gross)

# adding new column that shows whether the potential gross equals to seats * average ticket price
df_backup['is_potential_gross_correct'] = np.where(df_backup['potential_gross'] == estimated_gross, 1, 0)

In [None]:
# True only if every 'potential_gross' equals 'seats_in_theatre' * 'avg_ticket_price'
(df_backup['potential_gross'] == df_backup['seats_in_theatre'] * df_backup['avg_ticket_price']).all()

In [None]:
# How many rows match (1) / do not match (0) seats_in_theatre * avg_ticket_price
print(df_backup['is_potential_gross_correct'].value_counts())

So the potential gross values are not simply given by that equation. Therefore we can remove the missing values, or try to predict them, but this is not important. Thank goodness that was done in a backup dataset. We can create a new dataset and remove all null values from the original dataset.

In [None]:
# Step 10. Dropping unnecessary columns (all in a single call)
columns_to_drop = [
    'performances',
    'percent_capacity',
    'weekly_gross_overall',
    'is_potential_gross_correct',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Step 11. Dropping missing values
# Only rows without a 'top_ticket_price' are removed; other columns are not checked here
df = df.dropna(subset=['top_ticket_price'])

In [None]:
# Show every column (no truncation) and preview the first two rows
pd.set_option('display.max_columns', None)
df.head(2)

In [None]:
# Column dtypes and non-null counts after cleaning
df.info()

In [None]:
# Step 12. Number of observations (rows) left after cleaning
len(df)

In [None]:
# Step 13. Save cleaned data set
# index=False keeps the row index out of the file
df.to_csv('Cleaned_Broadway.csv', index=False)

# Data is cleaned, Next step

In [None]:
import pandas as pd     
import numpy as np    
import statsmodels.api as sm
import seaborn as sns
 
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np
 
import matplotlib.pyplot as plt
# Reload the cleaned data written by the cleaning section and list its columns.
# Column types set before saving (category, datetime) are not stored in a CSV.
df = pd.read_csv('Cleaned_Broadway.csv')
df.info()

## Summary Statistics

In [None]:
# Read the cleaned data and turn 'week_ending' back into datetime values
df = (
    pd.read_csv('Cleaned_Broadway.csv')
      .assign(week_ending=lambda frame: pd.to_datetime(frame['week_ending']))
)

In [None]:
# Summary statistics of the cleaned data
df.describe()

In [None]:
# Histogram (50 bins, with KDE curve) of the 'week_number' column
sns.set(font_scale = 1, style = 'whitegrid')
week_number_ax = sns.histplot(data = df, x = 'week_number', color = 'orange', kde = True, bins = 50)
week_number_ax.set_title('Distribution of Week Numbers')

In [None]:
# Histogram (40 bins) of the target variable 'weekly_gross'
sns.set(font_scale=1, style='whitegrid')
_, weekly_gross_ax = plt.subplots()
weekly_gross_ax.set_title('Distribution of Weekly Gross')
sns.histplot(data = df, x = 'weekly_gross', color = 'orange', bins = 40, ax = weekly_gross_ax)

## Getting Tweets

In [None]:
pip install tweepy

In [None]:
import pandas as pd

import string
import nltk
from nltk.corpus import stopwords
import tweepy

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 

from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Accessing Twitter
# The credentials are read from environment variables so that no secret is
# stored in the notebook. A missing variable raises KeyError immediately.
# Any key that has ever been saved in plain text should be revoked and
# regenerated in the Twitter developer portal.
import os

consumer_key    = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_key      = os.environ['TWITTER_ACCESS_KEY']
access_secret   = os.environ['TWITTER_ACCESS_SECRET']

In [None]:
# Twitter authentication
# Build an OAuth handler from the consumer key/secret, then attach the
# access token and its secret to it
auth            = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)

In [None]:
# Creating an API object
# One search request is sent per show further down, so the client is told to
# wait when the rate limit is hit instead of raising in the middle of the loop.
api             = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
#### Fetch recent tweets about one show
def extract_tweets (api, show):
    """Search Twitter for tweets about one show and return them as a list of dicts.

    Parameters
    ----------
    api : object exposing ``search_tweets(query, **kwargs)``
        In this notebook, the tweepy API object.
    show : str
        Name of the show; it is placed in front of the search terms.

    Returns
    -------
    list of dict
        One dict per returned tweet with the keys 'show', 'text',
        'Favorite count', 'Retweet count' and 'Created at'.
    """
    # The space after the show name matters: without it the name is glued to
    # "Broadway" (e.g. "HamiltonBroadway") and the search no longer targets the show.
    query        = show + ' Broadway OR theatre'
    show_tweets  = api.search_tweets(query, lang = 'en', tweet_mode = 'extended', count = 30)
    return [
        {
            'show'            : show,   # key is the column name, value is the show searched for
            'text'            : tweet.full_text,
            'Favorite count'  : tweet.favorite_count,
            'Retweet count'   : tweet.retweet_count,
            'Created at'      : tweet.created_at
        }
        for tweet in show_tweets
    ]

#### Collect the tweets of every show into one dataframe
shows_list       = df['show'].unique().tolist()

# One search per show; the per-show lists are flattened into a single list of rows
tweets           = pd.DataFrame([
    tweet_row
    for show_name in shows_list
    for tweet_row in extract_tweets(api, show_name)
])
tweets

### Normalization of Tweets

In [None]:
import re
import string 
from   nltk.corpus import stopwords

In [None]:
def preprocess_text(text):
    """Normalise one tweet for the bag-of-words and sentiment steps.

    The cleaning removes URLs and @mentions, strips punctuation, replaces
    standalone runs of digits with the word "number" and drops English stop
    words.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    tuple of (str, str)
        The unchanged input text and its cleaned version.
    """
    # Remove URLs and mentions (@elonmusk). This has to happen before the
    # punctuation is stripped, otherwise a link collapses into one pseudo-word.
    text_clean = re.sub(r"http\S+", "", text)
    text_clean = re.sub(r"@\S+", "", text_clean)

    # Remove punctuation
    text_clean = text_clean.translate(str.maketrans("", "", string.punctuation))

    # Replace digits with the word number
    text_clean = re.sub(r'\b\d+\b', 'number', text_clean)

    # Stop words 1: Tokenize the text into words
    words      = text_clean.split()

    # Stop words 2: Remove the stop words. On a fresh environment the corpus
    # is not installed yet and the lookup raises LookupError; fetch it once.
    try:
        stop_words = set(stopwords.words("english"))
    except LookupError:
        nltk.download("stopwords", quiet=True)
        stop_words = set(stopwords.words("english"))
    words      = [word for word in words if word.lower() not in stop_words]

    # Stop words 3: Join the words back into a single string
    text_clean = " ".join(words)

    return text, text_clean

# preprocess_text returns (original, cleaned); the original text is already in
# 'text', so only the cleaned half is stored as a new column
tweets['text_clean'] = tweets['text'].map(lambda raw_text: preprocess_text(raw_text)[1])

#### saving everything to a csv
tweets.to_csv('tweets.csv', index = False)

### Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
tweets             = pd.read_csv('tweets.csv')
# A tweet whose cleaned text is empty is written as an empty CSV field and is
# read back as NaN; restore the empty string so the text steps below always
# receive a str.
tweets['text_clean'] = tweets['text_clean'].fillna('')
tweets.head()

In [None]:
def tokenize_tweet(document):
    """Return the list of word tokens of one tweet after running preprocess_text on it."""
    # preprocess_text returns (original text, cleaned text); only the cleaned
    # half is split into words.
    return preprocess_text(document)[1].split()


# A callable analyzer must return the sequence of features of a document.
# Passing preprocess_text directly made every document contribute exactly two
# "tokens" - the whole original and the whole cleaned string - instead of words.
bow_transformer    = CountVectorizer(analyzer = tokenize_tweet).fit(tweets['text_clean'])

In [None]:
# Number of distinct features (vocabulary entries) learned by the vectorizer
print(len(bow_transformer.vocabulary_))

In [None]:
# Transformation on entire dataframe: one row of feature counts per tweet
tweets_bow         = bow_transformer.transform(tweets['text_clean'])  

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the inverse-document-frequency weights from the bag-of-words counts
tf_idf_transformer = TfidfTransformer()
tf_idf_transformer = tf_idf_transformer.fit(tweets_bow)

In [None]:
# Apply transformation to the entire bag of words (counts -> TF-IDF weights)
tweets_tfidf       = tf_idf_transformer.transform(tweets_bow)

In [None]:
# (number of tweets, number of vocabulary entries)
print(tweets_tfidf.shape)

### Natural Language Processing

In [None]:
pip install spacy

In [None]:
import spacy
nlp = spacy.load('en_core_web_md')

In [None]:
!python -m spacy download en_core_web_md

In [None]:
import nltk  

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer() raises LookupError when the VADER lexicon is not
# installed, so it is fetched first (nothing is downloaded when it is up to date).
nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()

### Applying Vader to Tweets

In [None]:
# VADER returns one dict of scores ('neg', 'neu', 'pos', 'compound') per cleaned tweet
tweets['scores']     = tweets['text_clean'].map(sid.polarity_scores)
tweets

In [None]:
# New column name -> key inside the VADER score dict
score_columns = {
    'negative': 'neg',
    'neutral':  'neu',
    'positive': 'pos',
    'compound': 'compound',
}
for column_name, score_key in score_columns.items():
    tweets[column_name] = tweets['scores'].map(lambda scores, key=score_key: scores[key])


def _polarity_label(compound_score):
    """Label a compound score: 'pos' when it is >= 0, otherwise 'neg'."""
    return 'pos' if compound_score >= 0 else 'neg'


tweets['comp_score'] = tweets['compound'].map(_polarity_label)
tweets

In [None]:
# Binarizing sentiment scores: one 0/1 indicator column per polarity label
for polarity in ('pos', 'neg'):
    tweets[polarity] = (tweets['comp_score'] == polarity).astype('int64')

tweets.head()

### Visualizing Tweets

In [None]:
# Number of tweets per polarity label
figure, polarity_ax = plt.subplots(figsize=(5,5))
sns.countplot(x='comp_score', data = tweets, ax = polarity_ax)
polarity_ax.set_title('Tweets Polarity for the Top 100 Shows')

In [None]:
pip install wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
# Split the tweets by polarity label; each part is ordered from the highest
# to the lowest compound score
is_positive = tweets['comp_score'] == 'pos'
pos_tweets  = tweets.loc[is_positive].sort_values(by = 'compound', ascending = False)

is_negative = tweets['comp_score'] == 'neg'
neg_tweets  = tweets.loc[is_negative].sort_values(by = 'compound', ascending = False)

In [None]:
# Word cloud built from the cleaned text of all positive tweets
text      = ' '.join(pos_tweets['text_clean'])
plt.figure(figsize = (20,15), facecolor = 'None')
wordcloud = WordCloud(max_words = 500, width = 1600, height = 800).generate(text)
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
# Title spelling corrected ("posititve" -> "positive")
plt.title('Most frequent words in positive tweets', fontsize = 19)
plt.show()

In [None]:
# Word cloud built from the cleaned text of all negative tweets
text      = ' '.join(neg_tweets['text_clean'])
wordcloud = WordCloud(max_words = 500, width = 1600, height = 800).generate(text)

_, cloud_ax = plt.subplots(figsize = (20,15), facecolor = 'None')
cloud_ax.imshow(wordcloud, interpolation = 'bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Most frequent words in negative tweets', fontsize = 19)
plt.show()

### Calculating Score Per Show

In [None]:
# Mean and standard deviation per show of the sentiment and engagement columns.
# Named aggregation produces the flat '<column>_<statistic>' names directly.
tweet_metrics          = ['compound', 'Favorite count', 'Retweet count']
named_aggregations     = {
    f'{metric}_{statistic}': (metric, statistic)
    for metric in tweet_metrics
    for statistic in ('mean', 'std')
}
grouped_tweets         = tweets.groupby('show').agg(**named_aggregations)

# The show name is also kept as a regular column (it is the index as well)
grouped_tweets['show'] = grouped_tweets.index

grouped_tweets

In [None]:
# Bar plot of the mean compound score per show: green when >= 0, red otherwise
bar_colours = ['green' if score >= 0 else 'red' for score in grouped_tweets['compound_mean']]

_, sentiment_ax = plt.subplots(figsize=(10, 5))
sentiment_ax.bar(grouped_tweets['show'], grouped_tweets['compound_mean'], color=bar_colours, width=0.5)
sentiment_ax.set_title('Twitter Sentiment Analysis for 100 Unique Shows', fontsize = 10)
sentiment_ax.set_xlabel('Show', fontsize = 10)
sentiment_ax.set_ylabel('Sentiment Score')
sentiment_ax.tick_params(axis='x', labelsize=5)
plt.show()

In [None]:
# Per-show mean and standard deviation of sentiment and engagement.
# This rebuilds grouped_tweets from scratch, so the cell can be re-run safely.
summary_statistics     = ['mean', 'std']
grouped_tweets         = tweets.groupby('show').agg({
    'compound':       summary_statistics,
    'Favorite count': summary_statistics,
    'Retweet count':  summary_statistics,
})

# Flatten the two-level column names into '<column>_<statistic>'
grouped_tweets.columns = grouped_tweets.columns.map('_'.join)

# Keep the show name as a regular column as well as the index
grouped_tweets         = grouped_tweets.assign(show=grouped_tweets.index)

grouped_tweets

In [None]:
# grouped_tweets arrives with 'show' both as its index and as a regular column.
# Keep a single 'show' column: drop the duplicate column and promote the index.
# reset_index() names the new column after the index ('show'), so no 'index'
# column is ever created; the former drop('index') step raised KeyError.
# The guard makes the cell safe to run more than once.
if grouped_tweets.index.name == 'show':
    grouped_tweets = grouped_tweets.drop(columns='show', errors='ignore').reset_index()

In [None]:
# Attach the per-show tweet statistics to every weekly row of that show.
# Inner join: shows without tweet statistics are dropped.
df             = df.merge(grouped_tweets, how='inner', on='show')

df.head()

### Train and Test 

In [None]:
# 'week_ending' is (re)converted to datetime and its calendar year stored in 'year'
df = df.assign(week_ending=pd.to_datetime(df['week_ending']))
df = df.assign(year=df['week_ending'].dt.year)

In [None]:
# check the data before running the model (dtypes and non-null counts)
df.info()

In [None]:
# Drop every row that lacks any of the three standard-deviation columns
std_columns = ['compound_std', 'Favorite count_std', 'Retweet count_std']
df = df.dropna(subset=std_columns)

In [None]:
# The median 'week_ending' date is the cut point of the split
median_date            = df['week_ending'].median()

# Rows before the median date form the first part, rows on or after it the second
is_before_median       = df['week_ending'] < median_date
is_from_median_on      = df['week_ending'] >= median_date

df_less_than_median    = df.loc[is_before_median]
df_greater_than_median = df.loc[is_from_median_on]

#### Regression Model

In [None]:
# Ordinary least squares on the earlier half of the data.
# No constant column is added, so the model is fitted without an intercept.
ols_features = [
    'week_number', 'seats_sold', 'avg_ticket_price', 'top_ticket_price',
    'seats_in_theatre', 'previews',
    'compound_mean', 'compound_std',
    'Favorite count_mean', 'Favorite count_std',
    'Retweet count_mean', 'Retweet count_std',
]
reg_model  = sm.OLS(
    endog=df_less_than_median['weekly_gross'],
    exog=df_less_than_median[ols_features],
)

# Fit the model and print the regression table
reg_model_result = reg_model.fit()
print(reg_model_result.summary())

In [None]:
# Target column and the list of feature columns shared by the models
dependent_variable     = 'weekly_gross'

# Features taken from the box-office data
box_office_predictors  = ['week_number', 'seats_sold', 'avg_ticket_price',
                          'top_ticket_price', 'seats_in_theatre', 'previews']

# Mean and standard deviation of every tweet metric
tweet_predictors       = [f'{metric}_{statistic}'
                          for metric in ('compound', 'Favorite count', 'Retweet count')
                          for statistic in ('mean', 'std')]

predictors             = box_office_predictors + tweet_predictors

#### Random Forest and Lasso Models

In [None]:
# Initiate the random forest and lasso models
# A fixed random_state makes the random forest, and every metric and plot
# derived from it, reproducible between runs.
rf    = RandomForestRegressor(random_state=42)
lasso = Lasso()

In [None]:
# Train both models on the earlier half of the data
X_train = df_less_than_median[predictors]
y_train = df_less_than_median[dependent_variable]

rf.fit(X_train, y_train)
lasso.fit(X_train, y_train)

In [None]:
# Predict the later half of the data with each of the three fitted models
X_test                = df_greater_than_median[predictors]

rf_predictions        = rf.predict(X_test)
lasso_predictions     = lasso.predict(X_test)
reg_model_predictions = reg_model_result.predict(X_test)

In [None]:
# RMSE per model on the later half: mean squared error first, then its square root
y_test_actual = df_greater_than_median[dependent_variable]

rf_mse, lasso_mse, reg_model_mse = (
    mean_squared_error(y_test_actual, predicted)
    for predicted in (rf_predictions, lasso_predictions, reg_model_predictions)
)

rf_rmse        = np.sqrt(rf_mse)
lasso_rmse     = np.sqrt(lasso_mse)
reg_model_rmse = np.sqrt(reg_model_mse)

In [None]:
# Standard deviation of each model's predicted values
rf_std, lasso_std, reg_model_std = (
    np.std(predicted)
    for predicted in (rf_predictions, lasso_predictions, reg_model_predictions)
)

In [None]:
# Output: RMSE and standard deviation of the predictions for each model
print("RMSE for Random Forest:", rf_rmse)
# This line previously carried the label "Regression Model", so the Lasso
# result could not be told apart from the OLS result below.
print("RMSE for Lasso:", lasso_rmse)
print("RMSE for Regression Model:", reg_model_rmse)
print("Standard deviation for Random Forest:", rf_std)
print("Standard deviation for Lasso:", lasso_std)
print("Standard deviation for Regression Model:", reg_model_std)

### Visualising the models

In [None]:
# Model name -> (RMSE, standard deviation of its predictions)
model_scores = {
    'Random Forest':    (rf_rmse, rf_std),
    'Lasso':            (lasso_rmse, lasso_std),
    'Regression Model': (reg_model_rmse, reg_model_std),
}
models  = list(model_scores)
rmse    = [scores[0] for scores in model_scores.values()]
std     = [scores[1] for scores in model_scores.values()]

# Bar height is the RMSE; the error bar is the standard deviation of the predictions
fig, ax = plt.subplots()
ax.bar(x=models, height=rmse, yerr=std, align='center', alpha=0.5, color = 'pink', ecolor='orange', capsize=10)
ax.set_ylabel('RMSE and Standard deviation')
ax.set_title('Comparing RMSE of 3 Models')
plt.show()

In [None]:
# Residuals (actual minus predicted) of each model on the later half of the data
observed_gross      = df_greater_than_median[dependent_variable]
lasso_residuals     = observed_gross - lasso_predictions
rf_residuals        = observed_gross - rf_predictions
reg_model_residuals = observed_gross - reg_model_predictions

# One panel per model: (title, predictions, residuals, marker colour)
residual_panels = [
    ('Residual Plot for Lasso Model',         lasso_predictions,     lasso_residuals,     'pink'),
    ('Residual Plot for Random Forest Model', rf_predictions,        rf_residuals,        'green'),
    ('Residual Plot for Regression Model',    reg_model_predictions, reg_model_residuals, 'purple'),
]

fig, axs = plt.subplots(1, 3, figsize=(12,4))

for panel, (panel_title, predicted, residuals, colour) in zip(axs, residual_panels):
    panel.scatter(predicted, residuals, color = colour)
    panel.axhline(y=0, color='r', linestyle='-')   # zero-residual reference line
    panel.set_xlabel('Predicted values')
    panel.set_ylabel('Residuals')
    panel.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
# Actual vs predicted weekly gross for the Lasso model
actual = df_greater_than_median[dependent_variable]

_, lasso_ax = plt.subplots()
lasso_ax.scatter(actual, lasso_predictions, color = 'pink')

lasso_ax.set_xlabel('Actual Weekly Gross')
lasso_ax.set_ylabel('Predicted Weekly Gross')
lasso_ax.set_title('Actual vs Predicted Weekly Gross, Lasso Model')

# Dashed diagonal: points on it are perfect predictions
gross_range = [actual.min(), actual.max()]
lasso_ax.plot(gross_range, gross_range, 'k--', lw=3)

plt.show()

In [None]:
# Actual vs predicted weekly gross for the random forest model
_, rf_ax = plt.subplots()
rf_ax.scatter(actual, rf_predictions, color = 'green')

rf_ax.set_xlabel('Actual Weekly Gross')
rf_ax.set_ylabel('Predicted Weekly Gross')
rf_ax.set_title('Actual vs Predicted Weekly Gross, Random Forest Model')

# Dashed diagonal: points on it are perfect predictions
gross_range = [actual.min(), actual.max()]
rf_ax.plot(gross_range, gross_range, 'k--', lw=3)

plt.show()

In [None]:
# Actual vs predicted weekly gross for the OLS regression model
_, ols_ax = plt.subplots()
ols_ax.scatter(actual, reg_model_predictions, color = 'purple')

ols_ax.set_xlabel('Actual Weekly Gross')
ols_ax.set_ylabel('Predicted Weekly Gross')
ols_ax.set_title('Actual vs Predicted Weekly Gross, Regression Model')

# Dashed diagonal: points on it are perfect predictions
gross_range = [actual.min(), actual.max()]
ols_ax.plot(gross_range, gross_range, 'k--', lw=3)

plt.show()

#### Identifying variables that increase RMSE

In [None]:
# Correlation heatmap of the numeric columns.
# df still holds text and datetime columns (e.g. 'show', 'theatre',
# 'week_ending'); pandas >= 2.0 no longer drops them silently in corr() and
# raises instead, so the numeric/boolean columns are selected explicitly.
plt.figure(figsize=(10, 10))
sns.set(font_scale=0.5)
numeric_columns = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), annot=True)
plt.show()

In [None]:
# Distribution of 'weekly_gross' for every value of 'week_number'
sns.boxplot(x = 'week_number', y = 'weekly_gross', data = df)

In [None]:
# Table of 'weekly_gross' aggregated by pivot_table's default (the mean):
# one row per week_number, one column per year, drawn as a heatmap
test2         = df.pivot_table(values = 'weekly_gross', index = 'week_number', columns = 'year')
sns.heatmap(test2)

In [None]:
# Scatter of 'weekly_gross' against 'year' with a fitted regression line
sns.lmplot( x = 'year', y = 'weekly_gross', data = df)

In [None]:
# Histogram of the mean compound sentiment score attached to each row of df.
# The title previously referred to climate change, a leftover from another
# analysis; the tweets in this notebook are about Broadway shows.
plt.hist(df['compound_mean'], bins=10, color = 'pink')
plt.title('Sentiment Analysis of Tweets about Broadway Shows')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()

### Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [None]:
# Expand the training predictors into all polynomial terms up to degree 3
# (no constant column)
poly_features    = PolynomialFeatures(degree=3, include_bias=False)
poly_train_input = df_less_than_median[predictors]
x_poly           = poly_features.fit_transform(poly_train_input)

In [None]:
# Polynomial regression = linear regression fitted on the expanded features
poly             = LinearRegression()
poly_train_target = df_less_than_median[dependent_variable]
poly.fit(x_poly, poly_train_target)

In [None]:
# Apply the already-fitted feature expansion to the later half and predict it
poly_test_input  = df_greater_than_median[predictors]
x_new_poly       = poly_features.transform(poly_test_input)
poly_predictions = poly.predict(x_new_poly)

# RMSE against the observed values and spread of the predictions
poly_test_target = df_greater_than_median[dependent_variable]
poly_mse         = mean_squared_error(poly_test_target, poly_predictions)
poly_rmse        = np.sqrt(poly_mse)
poly_std         = np.std(poly_predictions)

In [None]:
# RMSE and standard deviation of the predictions for the polynomial model
print("RMSE for Polynomial Model:", poly_rmse)
print("STD for Polynomial Model:", poly_std)

In [None]:
# Plot the polynomial model's predictions against each predictor separately.
# A single plt.plot() call with the whole predictor frame on the x-axis joins
# unsorted points with lines (one tangled line per column) and labels the axis
# with the text of a Python list, so one scatter panel per predictor is drawn.
n_panel_cols = 4
n_panel_rows = -(-len(predictors) // n_panel_cols)   # ceiling division
fig, axs = plt.subplots(n_panel_rows, n_panel_cols,
                        figsize=(16, 4 * n_panel_rows), sharey=True, squeeze=False)

for panel, predictor in zip(axs.ravel(), predictors):
    panel.scatter(df_greater_than_median[predictor], poly_predictions, color='red', s=5)
    panel.set_xlabel(predictor)

# The y-axis is shared, so only the first panel of every row is labelled
for panel in axs[:, 0]:
    panel.set_ylabel('Predicted ' + dependent_variable)

# Hide panels left over when the grid has more cells than predictors
for panel in axs.ravel()[len(predictors):]:
    panel.set_visible(False)

fig.suptitle('Polynomial Regression')
fig.tight_layout()
plt.show()

In [None]:
# Model name -> (RMSE, standard deviation of its predictions)
model_scores = {
    'Random Forest':    (rf_rmse, rf_std),
    'Lasso':            (lasso_rmse, lasso_std),
    'Regression Model': (reg_model_rmse, reg_model_std),
    'Polynomial Model': (poly_rmse, poly_std),
}
models  = list(model_scores)
rmse    = [scores[0] for scores in model_scores.values()]
std     = [scores[1] for scores in model_scores.values()]

# Bar height is the RMSE; the error bar is the standard deviation of the predictions
fig, ax = plt.subplots()
ax.bar(x=models, height=rmse, yerr=std, align='center', alpha=0.5, color = 'black', ecolor='orange', capsize=10)
ax.set_ylabel('RMSE and Standard deviation')
ax.set_title('Comparing RMSE of 4 Models, Polynomial Degree 3')
plt.show()

In [None]:
# Actual vs predicted weekly gross for the polynomial model
_, poly_ax = plt.subplots()
poly_ax.scatter(actual, poly_predictions, color = 'black')

poly_ax.set_xlabel('Actual Weekly Gross')
poly_ax.set_ylabel('Predicted Weekly Gross')
poly_ax.set_title('Actual vs Predicted Weekly Gross, Polynomial Model')

# Dashed diagonal: points on it are perfect predictions
gross_range = [actual.min(), actual.max()]
poly_ax.plot(gross_range, gross_range, 'k--', lw=3)

plt.show()

In [None]:
# Residuals (actual minus predicted) of the polynomial model
poly_residuals = df_greater_than_median[dependent_variable] - poly_predictions

# One panel per model: (title, predictions, residuals, marker colour)
residual_panels = [
    ('Residual Plot for Lasso Model',         lasso_predictions,     lasso_residuals,     'pink'),
    ('Residual Plot for Random Forest Model', rf_predictions,        rf_residuals,        'green'),
    ('Residual Plot for Regression Model',    reg_model_predictions, reg_model_residuals, 'purple'),
    ('Residual Plot for Polynomial Model',    poly_predictions,      poly_residuals,      'black'),
]

# A single row with four panels
fig, axs       = plt.subplots(1, 4, figsize=(16,4))

for panel, (panel_title, predicted, residuals, colour) in zip(axs, residual_panels):
    panel.scatter(predicted, residuals, color = colour)
    panel.axhline(y=0, color='r', linestyle='-')   # zero-residual reference line
    panel.set_xlabel('Predicted values')
    panel.set_ylabel('Residuals')
    panel.set_title(panel_title)

fig.tight_layout()
plt.show()