In [1]:
import pandas as pd

# Read CSV file
df = pd.read_csv("D:/Work/Fiver/Ldgs/07/ldgs333-attachments/dataset_9.csv")

In [2]:
# Filter the dataset for the "action" or "comedy" genre
filtered_df = df[df['genre'].isin(['Action', 'Comedy'])]

# Group the dataset by genre and aggregate the reviews
grouped_df = filtered_df.groupby('genre')['review'].apply(list).reset_index()

# Access the reviews for action genre (if it exists)
if 'Action' in grouped_df['genre'].values:
    action_reviews = grouped_df[grouped_df['genre'] == 'Action']['review'].iloc[0]
else:
    action_reviews = []

# Access the reviews for comedy genre (if it exists)
if 'Comedy' in grouped_df['genre'].values:
    comedy_reviews = grouped_df[grouped_df['genre'] == 'Comedy']['review'].iloc[0]
else:
    comedy_reviews = []

In [3]:
print(action_reviews)



In [4]:
print(comedy_reviews)



In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [6]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Furqan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Furqan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Furqan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Furqan\AppData\Roaming\nltk_data...
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Furqan\AppData\Roaming\nltk_data...


True

In [7]:
def preprocess_review(review):
    # Convert to lowercase
    review = review.lower()
    
    # Tokenization
    tokens = word_tokenize(review)
    
    # Stopword removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Removing punctuation and special characters
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Removing numbers and short words
    tokens = [token for token in tokens if not token.isnumeric() and len(token) > 2]
    
    # Join tokens back into a single string
    processed_review = ' '.join(tokens)
    
    return processed_review

In [8]:
# Apply preprocessing to action reviews
preprocessed_action_reviews = [preprocess_review(review) for review in action_reviews]

# Apply preprocessing to comedy reviews
preprocessed_comedy_reviews = [preprocess_review(review) for review in comedy_reviews]

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 

stop_words = text.ENGLISH_STOP_WORDS
# Initialize the CountVectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words=list(stop_words),lowercase=True, # Updated from tutorial code
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

# Fit and transform the preprocessed action reviews
action_vectors = vectorizer.fit_transform(preprocessed_action_reviews)
action_feathures=vectorizer.get_feature_names_out()

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

# Initialize the LDA model
lda_model = LatentDirichletAllocation(n_components=5, max_iter=10, random_state=258, learning_method='online')

# Fit the LDA model to the action vectors
action_topics = lda_model.fit_transform(action_vectors)

In [11]:
def print_topics(model, vectorizer, top_n=5):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx+1))
        print([(vectorizer.get_feature_names_out()[i], topic[i]) # Updated from tutorial code
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("Topics for action:")
print_topics(lda_model, vectorizer)

Topics for action:
Topic 1:
[('movie', 2149.56915198962), ('film', 1592.0474199171545), ('bond', 1033.0212378922809), ('action', 793.9166932718836), ('like', 693.2024402689934)]
Topic 2:
[('movie', 952.8743100779357), ('film', 444.5367493476542), ('bad', 290.57547683744707), ('action', 284.1553824492118), ('good', 255.4551554599044)]
Topic 3:
[('bond', 283.9780670598673), ('angel', 141.39099356458283), ('chan', 112.2706053922607), ('charlie', 112.14023726289163), ('jackie', 92.09229461936474)]
Topic 4:
[('ryan', 122.83377108123577), ('jack', 91.64931159145063), ('affleck', 82.0294129328534), ('ben', 60.91342594686537), ('fear', 57.505342476346144)]
Topic 5:
[('car', 301.88773737640696), ('movie', 215.09139654451735), ('fast', 200.94577563529208), ('furious', 138.09836769072572), ('diesel', 116.17720313926314)]


In [12]:


vectorizer1 = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words=list(stop_words),lowercase=True, # Updated from tutorial code
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
# Transform the preprocessed comedy reviews
comedy_vectors = vectorizer1.fit_transform(preprocessed_comedy_reviews)

comedy_feathures=vectorizer1.get_feature_names_out()

lda_model1 = LatentDirichletAllocation(n_components=5, max_iter=10, random_state=258, learning_method='online')


# Fit the LDA model to the comedy vectors
comedy_topics = lda_model1.fit_transform(comedy_vectors)



def print_topics(model, vectorizer, top_n=5):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx+1))
        print([(vectorizer.get_feature_names_out()[i], topic[i]) # Updated from tutorial code
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("Topics for comedy:")
print_topics(lda_model1, vectorizer1)


Topics for comedy:
Topic 1:
[('perry', 54.87208791117396), ('willis', 48.058285988672246), ('matthew', 40.32411021353698), ('movie', 33.82325373660288), ('bruce', 33.33600617932683)]
Topic 2:
[('movie', 823.5855978222761), ('funny', 315.43394570590885), ('film', 267.41236050715133), ('good', 257.01197454100213), ('really', 226.9849106455348)]
Topic 3:
[('bridget', 167.97030276106892), ('film', 131.4387426341969), ('movie', 96.34445891028086), ('jones', 76.74216942082954), ('grant', 75.35329683938919)]
Topic 4:
[('greg', 189.60923043289918), ('stiller', 160.98415191170267), ('film', 122.07880959913945), ('jack', 110.10831137631777), ('parent', 98.27302002595607)]
Topic 5:
[('movie', 398.90500835139505), ('parody', 129.3281186364968), ('film', 125.12921538712614), ('scary', 115.23370812066639), ('scream', 113.57311447958703)]


In [13]:
# # Assuming you want to retrieve the top 10 significant words for each topic
# num_top_words = 10
# feature_names = vectorizer.get_feature_names_out()

# for topic_idx, topic in enumerate(action_topics.components_):
#     print(f"Topic #{topic_idx + 1}:")
#     top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
#     print(top_words)


In [14]:
print("Action topics")
print(action_topics)

Action topics
[[9.70898937e-01 7.33466972e-03 7.22222309e-03 7.15411349e-03
  7.39005714e-03]
 [9.09049220e-01 8.73080778e-02 1.21500669e-03 1.21102670e-03
  1.21666842e-03]
 [1.27868884e-02 9.49189611e-01 1.25731383e-02 1.28251182e-02
  1.26252438e-02]
 ...
 [3.42779514e-01 3.02005330e-03 2.97485382e-03 6.48214963e-01
  3.01061625e-03]
 [4.73374356e-01 8.85256779e-04 8.73327071e-04 8.69661574e-04
  5.23997398e-01]
 [2.03757342e-01 3.31788688e-03 7.86398775e-01 3.23925727e-03
  3.28673867e-03]]


In [15]:
print("Comedy topics")
print(comedy_topics)

Comedy topics
[[0.00695279 0.97282064 0.00676386 0.00674796 0.00671475]
 [0.00451034 0.67886027 0.00450812 0.30742506 0.0046962 ]
 [0.0010465  0.9957595  0.00106252 0.00106668 0.00106481]
 ...
 [0.00531954 0.00539196 0.97865819 0.00530583 0.00532448]
 [0.00459874 0.00460583 0.98162527 0.00458304 0.00458711]
 [0.98415905 0.00397368 0.00395533 0.00395126 0.00396068]]


In [16]:
# Compare the topics for action and comedy
action_topic_distribution = action_topics.mean(axis=0)
comedy_topic_distribution = comedy_topics.mean(axis=0)

# Print the topic distributions
print("Action Topic Distribution:")
print(action_topic_distribution)
print("\nComedy Topic Distribution:")
print(comedy_topic_distribution)

Action Topic Distribution:
[0.57197291 0.24794602 0.08106762 0.03143042 0.06758303]

Comedy Topic Distribution:
[0.05360933 0.54647797 0.12563609 0.11399364 0.16028296]


In [17]:

import numpy as np

In [18]:
# Calculate the average strength of each topic across all movies
average_strengths = np.mean(action_topics, axis=0)

# Sort the average strengths in descending order and get the indices of the top 2 topics
top_topic_indices = np.argsort(average_strengths)[::-1][:2]
# Select the top 2 topics
top_topics_action = action_topics[:, top_topic_indices]

# Print the top 2 topics
print("Top 2 Topics for action movies:")
print(len(top_topics_action))

Top 2 Topics for action movies:
1269


In [19]:
# Calculate the average strength of each topic across all movies
average_strengths = np.mean(comedy_topics, axis=0)

# Sort the average strengths in descending order and get the indices of the top 2 topics
top_topic_indices = np.argsort(average_strengths)[::-1][:2]
# Select the top 2 topics
top_topics_comedy = comedy_topics[:, top_topic_indices]

# Print the top 2 topics
print("Top 2 Topics for comedy movies:")
print(top_topics_comedy)

Top 2 Topics for comedy movies:
[[0.97282064 0.00671475]
 [0.67886027 0.0046962 ]
 [0.9957595  0.00106481]
 ...
 [0.00539196 0.00532448]
 [0.00460583 0.00458711]
 [0.00397368 0.00396068]]


In [20]:
def extract_topics(lda_model, feature_names, num_words,vector):
    # Get the topic-word distributions from the LDA model
    topic_word_distributions = lda_model.components_
    
    # Get the document-topic distributions from the LDA model
    document_topic_distributions = lda_model.transform(vector)
    
    # Aggregate the topic prevalence across the corpus
    topic_prevalence = np.sum(document_topic_distributions, axis=0)
    
    # Sort the topics based on prevalence
    sorted_topics = np.argsort(topic_prevalence)[::-1]
    
    # Initialize an empty list to store the top words for each topic
    top_words_list = []
    
    # Iterate over each topic
    for rank, topic_idx in enumerate(sorted_topics[:2]):
        # Get the top N words for the topic
        topic_words = topic_word_distributions[topic_idx]
        top_words_idx = topic_words.argsort()[:-num_words - 1:-1]
        top_words = [feature_names[idx] for idx in top_words_idx]
        
        # Add the top words to the list
        top_words_list.append(top_words)
    
    # Return the list of top words for the top 2 topics
    return top_words_list


In [21]:
action_topicwords=extract_topics(lda_model,action_feathures,5,action_vectors)



In [22]:
print(action_topicwords)

[['movie', 'film', 'bond', 'action', 'like'], ['movie', 'film', 'bad', 'action', 'good']]


In [23]:
comedy_topicwords=extract_topics(lda_model1,comedy_feathures,5,comedy_vectors)

In [24]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
def create_topic_map(documents):
    # Create a dictionary from the documents
    id2word = Dictionary(documents)

    # Create a corpus using the documents
    corpus = [id2word.doc2bow(doc) for doc in documents]

    # Train the LDA model
    lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=2)

    # Create the topic map using pyLDAvis
    topic_map = gensimvis.prepare(lda_model, corpus, id2word)

    # Return the topic map
    return topic_map


ModuleNotFoundError: No module named 'gensim'

In [None]:
print("ACTION MOVIES TOPIC MAP FOR TOP 2 TOPICS:")
topic_map=create_topic_map(action_topicwords)
pyLDAvis.display(topic_map)

In [None]:
print(comedy_topicwords)

In [None]:
print("COMEDY MOVIES TOPIC MAP FOR TOP 2 TOPICS:")
topic_map1=create_topic_map(comedy_topicwords)
pyLDAvis.display(topic_map1)

In [None]:
# Calculate the average strength of each topic across all action movies
average_strengths = np.mean(action_topics, axis=0)

# Select the indices of the top two topics for action movies
top_topic_indices_action = np.argsort(average_strengths)[::-1][:2]

# Filter the preprocessed action reviews based on the top two topics
selected_action_reviews = [preprocessed_action_reviews[i] for i, topic in enumerate(top_topics_action) if np.argmax(topic) in top_topic_indices_action]

# Print the selected action reviews
print("Selected Action Reviews:")
print(len(selected_action_reviews))

# Calculate the average strength of each topic across all comedy movies
average_strengths = np.mean(comedy_topics, axis=0)

# Select the indices of the top two topics for comedy movies
top_topic_indices_comedy = np.argsort(average_strengths)[::-1][:2]

# Filter the preprocessed comedy reviews based on the top two topics
selected_comedy_reviews = [preprocessed_comedy_reviews[i] for i, topic in enumerate(top_topics_comedy) if np.argmax(topic) in top_topic_indices_comedy]
# Print the selected comedy reviews
print("Selected Comedy Reviews:")
print(len(selected_comedy_reviews))


In [None]:
# vectorizer = CountVectorizer(min_df=5, max_df=0.9,
#                              stop_words=list(stop_words),lowercase=True, # Updated from tutorial code
#                              token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

# # Fit and transform the preprocessed action reviews
# action_vectors = vectorizer.fit_transform(selected_action_reviews)

# # Initialize the LDA model
# lda_model = LatentDirichletAllocation(n_components=2, max_iter=10, random_state=258, learning_method='online')

# # Fit the LDA model to the action vectors
# action_topics = lda_model.fit_transform(action_vectors)
# def print_topics(model, vectorizer, top_n=5):
#     for idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (idx+1))
#         print([(vectorizer.get_feature_names_out()[i], topic[i]) # Updated from tutorial code
#                         for i in topic.argsort()[:-top_n - 1:-1]])
 
# print("Topics for action:")
# print_topics(lda_model, vectorizer)


In [None]:

# vectorizer1 = CountVectorizer(min_df=5, max_df=0.9,
#                              stop_words=list(stop_words),lowercase=True, # Updated from tutorial code
#                              token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
# # Transform the preprocessed comedy reviews
# comedy_vectors = vectorizer1.fit_transform(selected_comedy_reviews)

# lda_model1 = LatentDirichletAllocation(n_components=2, max_iter=10, random_state=258, learning_method='online')


# # Fit the LDA model to the comedy vectors
# comedy_topics = lda_model1.fit_transform(comedy_vectors)



# def print_topics(model, vectorizer, top_n=5):
#     for idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (idx+1))
#         print([(vectorizer.get_feature_names_out()[i], topic[i]) # Updated from tutorial code
#                         for i in topic.argsort()[:-top_n - 1:-1]])
 
# print("Topics for comedy:")
# print_topics(lda_model1, vectorizer1)


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
sid = SentimentIntensityAnalyzer()


# Calculate sentiment scores for action reviews
action_sentiment_scores = [sid.polarity_scores(review)["compound"] for review in selected_action_reviews]

# Calculate sentiment scores for comedy reviews
comedy_sentiment_scores = [sid.polarity_scores(review)["compound"] for review in selected_comedy_reviews]


In [None]:
print("Action Sentiment Scores: ")
print(action_sentiment_scores)

# interpretation
Based on the sentiment analysis conducted on the top 2 topics for comedy genre movies, here are some observations:<br>
The majority of the sentiment scores fall in the positive range, with values above 0.5. This indicates that the sentiment towards most action movies was positive.

There are a few negative sentiment scores below 0, suggesting a negative sentiment associated with some action movies. These movies might have received negative feedback or were perceived unfavorably by the audience.

Some sentiment scores are close to 0, indicating a neutral sentiment or an ambiguous interpretation of the movies. These scores suggest that the sentiment towards these movies may not be strongly positive or negative, and the audience's perception might be more mixed or uncertain.

Overall, the sentiment scores suggest that the majority of the action movies analyzed received positive sentiment. This indicates that the audience generally had a favorable perception of these movies.

In [None]:
print("Comedy sentiment score")
print(comedy_sentiment_scores)

# Interpretation 
To summarize the sentiment analysis conducted on the top 2 topics for comedy genre movies, here are some observations:

The majority of the sentiment scores are positive, with values above 0.5. This indicates a generally positive sentiment towards comedy-related items or instances.

There are a few negative sentiment scores below 0, suggesting a negative sentiment associated with some comedy movies. These movies might not have been perceived as humorous or might have received negative feedback from the audience.

Some sentiment scores are close to 0, indicating a neutral sentiment or an ambiguous interpretation. This suggests that the sentiment towards these movies may not be strongly positive or negative, and the audience's perception might be mixed or uncertain.

Overall, the sentiment scores suggest that the majority of the comedy movies were perceived positively. This indicates that the audience generally had a favorable perception of these comedic elements.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into independent variables (sentiment score) and dependent variable (box_office_revenue)
X = comedy_sentiment_scores


# Group the dataset by genre and aggregate the box_office_revenue
grouped_df = filtered_df.groupby('genre')['box_office_revenue'].apply(list).reset_index()

# Access the box_office_revenue for comedy genre (if it exists)
if 'Comedy' in grouped_df['genre'].values:
    comedy_box = grouped_df[grouped_df['genre'] == 'Comedy']['box_office_revenue'].iloc[0]
else:
    comedy_box = []


# Calculate the average strength of each topic across all comedy movies
average_strengths = np.mean(comedy_topics, axis=0)

# Select the indices of the top two topics for comedy movies
top_topic_indices_comedy = np.argsort(average_strengths)[::-1][:2]

# select box office revenu  based on the top two topics of comedy
selected_comedy_reviews = [comedy_box[i] for i, topic in enumerate(top_topics_comedy) if np.argmax(topic) in top_topic_indices_comedy]

    
    
y = selected_comedy_reviews


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the lists to NumPy arrays
X_train = np.array(X_train).reshape(-1, 1)
y_train = np.array(y_train).reshape(-1, 1)
X_test = np.array(X_test).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

# Create a linear regression model and fit it to the training data
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Comedy regression analysis")
print("Mean Squared Error: ", mse)
print("R-squared: ", r2)


# Interpreatation
In the given comedy regression analysis, the Mean Squared Error (MSE) value of 829,301,651,351,364.4 is quite high, indicating a large amount of error between the predicted comedy ratings and the actual ratings. This suggests that the model's predictions are not very accurate or precise.

The R-squared (R^2) value of 0.02774524950135082 suggests that only approximately 2.77% of the variance in comedy ratings can be explained by the independent variables used in the analysis. This means that the included independent variables, such as sentiment score, have limited explanatory power in predicting comedy ratings accurately.

Considering these results, it seems that the regression model for comedy is not performing well. The high MSE and low R-squared indicate that the model's predictions are not reliable and that the included independent variables do not strongly influence comedy ratings. It might be necessary to consider alternative or additional variables to improve the model's predictive ability for comedy ratings.


In [None]:
# Split the data into independent variables (sentiment score) and dependent variable (revenue)
X = action_sentiment_scores

# Group the dataset by genre and aggregate the reviews
grouped_df = filtered_df.groupby('genre')['box_office_revenue'].apply(list).reset_index()

# Access the reviews for Action genre (if it exists)
if 'Action' in grouped_df['genre'].values:
    Action_box = grouped_df[grouped_df['genre'] == 'Action']['box_office_revenue'].iloc[0]
else:
     Action_box= []

# Calculate the average strength of each topic across all movies
average_strengths = np.mean(action_topics, axis=0)

# Sort the average strengths in descending order and get the indices of the top 2 topics
top_topic_indices = np.argsort(average_strengths)[::-1][:2]
# select box ofice revenu reviews based on the top two topics of action movie
selected_box_revenu = [Action_box[i] for i, topic in enumerate(top_topics_action) if np.argmax(topic) in top_topic_indices_action]


y=selected_box_revenu


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the lists to NumPy arrays
X_train = np.array(X_train).reshape(-1, 1)
y_train = np.array(y_train).reshape(-1, 1)
X_test = np.array(X_test).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

# Create a linear regression model and fit it to the training data
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Action regression analysis")
print("Mean Squared Error: ", mse)
print("R-squared: ", r2)


# interpretation
In the action regression analysis, the Mean Squared Error (MSE) value of 1,222,277,061,325,680.8 indicates a relatively high level of error between the predicted and actual box office revenue for action movies. This suggests that the model's predictions are not very accurate or precise.

The R-squared (R^2) value of 0.042426693486870226 suggests that approximately 4.24% of the variance in box office revenue for action movies can be explained by the independent variable of sentiment scores. This indicates that sentiment scores have a limited ability to explain the variation in box office revenue for action movies accurately.

Based on these results, it can be concluded that the regression model for action movies may not be performing well. The high MSE and relatively low R-squared value indicate that the model's predictions are not closely aligned with the actual box office revenue, and the included independent variable (sentiment scores) has limited explanatory power.

To improve the predictive ability of the model for action movie box office revenue, it might be necessary to consider additional independent variables or explore other factors that could better explain the variability in box office performance.


# Comparision

Comparing the results of the regression analysis for action and comedy movies, we can observe the following:

Mean Squared Error (MSE):

Action: The MSE for action movies is 1,222,277,061,325,680.8.
Comedy: The MSE for comedy movies is 829,301,651,351,364.4.
The MSE values indicate the average squared difference between the predicted and actual box office revenue or ratings for each genre. In both cases, the MSE values are relatively high, suggesting a significant amount of error or deviation in the predictions. However, the MSE for action movies is slightly higher, indicating a potentially larger error compared to comedy movies.

R-squared (R^2) Coefficient:

Action: The R-squared value for action movies is 0.042426693486870226.
Comedy: The R-squared value for comedy movies is 0.02774524950135082.
The R-squared values represent the proportion of variance in the box office revenue or ratings that can be explained by the independent variables (sentiment scores). In both cases, the R-squared values are relatively low, indicating that the sentiment scores have limited explanatory power for both genres. However, the R-squared value for action movies is slightly higher, suggesting a marginally better ability to explain the variation in box office revenue compared to comedy movies.

Overall, based on these results, it seems that the regression model for action movies performs slightly worse than the one for comedy movies. Both models have relatively high MSE values and low R-squared values, indicating limitations in accurately predicting box office revenue or ratings based on sentiment scores alone. Additional factors or independent variables might be needed to improve the models' predictive abilities for both genres.

# Managerial implications:

For comedy movies: Given the higher R-squared value, sentiment scores seem to have a more significant impact on the box office revenue of comedy movies. Filmmakers and studios can pay closer attention to audience sentiment and use it as a potential indicator of the movie's success. They may consider investing in marketing strategies that target the sentiments associated with comedy genres to attract more viewers.

For action movies: The lower R-squared value suggests that sentiment scores have a relatively weaker influence on the box office revenue of action movies. Filmmakers and studios should focus on other factors such as star power, action sequences, and storyline to attract audiences and drive box office success. While sentiment scores may still provide some insights, they may not be as reliable in predicting revenue for action movies.

Overall, understanding the differential impact of sentiment scores on different genres can help movie industry professionals make more informed decisions regarding production, marketing, and audience targeting strategies for comedy and action movies.