In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# load the creepypasta spreadsheet and preview the first rows
data_path = './data/creepypastas.xlsx'
df = pd.read_excel(data_path)
df.head(n=5)

In [None]:
# clean and prepare the data

# drop invalid creepypasta row (advertisement)
# errors='ignore' keeps this cell safe to re-run: without it a second run
# raises KeyError because label 2511 has already been removed from df
df = df.drop(index=2511, errors='ignore')

In [None]:
# turn estimated_reading_time into a number
# first, look at the distinct raw values to see which formats need handling
raw_reading_times = df['estimated_reading_time'].unique()
print(raw_reading_times)


In [None]:
# estimated_reading_time is either "X minutes", "X minut", "< 1 minute" or nan
# strip the unit text; the replacements are applied in this exact order
reading_time_text = df['estimated_reading_time']
for unit_text, replacement in ((' minutes', ''), ('< 1 minute', '1'), (' minut', '')):
    reading_time_text = reading_time_text.str.replace(unit_text, replacement)
# convert to numeric
df['estimated_reading_time'] = pd.to_numeric(reading_time_text)
df.head(n=1)


In [None]:
# convert the publish_date column to pandas datetimes
publish_dates = pd.to_datetime(df['publish_date'])
df['publish_date'] = publish_dates
df.head(n=1)

In [None]:
# histogram of the average ratings (25 bins)
rating_values = df['average_rating']
plot = rating_values.plot.hist(bins=25)
plot.set(xlabel='Average Rating', title='Distribution of Average Ratings')
# ratings are skewed left, most ratings are between 6.5 and 8.5
rating_values.describe()

In [None]:
# histogram of the estimated reading times (50 bins)
reading_time_values = df['estimated_reading_time']
plot = reading_time_values.plot.hist(bins=50)
plot.set(xlabel='Estimated Reading Time (minutes)', title='Distribution of Estimated Reading Times')
# most stories are relatively short
reading_time_summary = reading_time_values.describe()
print(reading_time_summary)

In [None]:
# let's see the popularity of creepypastas over time
# count how many creepypastas were published in each year
publish_years = df['publish_date'].dt.year
stories_per_year = publish_years.value_counts()
# value_counts orders by count, so sort by year before drawing the line
yearly_counts = stories_per_year.sort_index()
plot = plt.plot(yearly_counts)
plt.title('Number of Creepypastas Published per Year')
plt.xlabel('Year')
plt.ylabel('Number of Creepypastas')
# huge spike in 2012 that seems to stay for a couple of years, then a resurgence in 2019

In [None]:
# is there a correlation between number of stories published and the time of year?
# let's plot month by month how many creepypastas were published
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct','Nov', 'Dec']
stories_per_month = df['publish_date'].dt.month.value_counts()
# reindex onto all 12 calendar months (in order) so a month without any story
# shows up as 0 instead of leaving fewer tick positions than tick labels
monthly_counts = stories_per_month.reindex(range(1, len(months) + 1), fill_value=0)
plot = plt.plot(monthly_counts)
plt.title('Number of Creepypastas Published per Month')
plt.xlabel('Month')
plt.ylabel('Number of Creepypastas')
ticks = plt.xticks(monthly_counts.index, months)
# pretty large spike in October, potentially due to Halloween

In [None]:
# interesting basic insights so far.
# let's dig deeper into the data
# do shorter stories get higher ratings?
x_ratings = df['average_rating']
y_reading_times = df['estimated_reading_time']
# the 'o' format draws one marker per story without a connecting line
plot = plt.plot(x_ratings, y_reading_times, 'o')
plt.title('Average Rating vs. Estimated Reading Time')
plt.xlabel('Average Rating')
plt.ylabel('Estimated Reading Time (minutes)')

In [None]:
# let's dig into the tags and categories
# what are the most popular tags?

# tag name -> number of stories carrying that tag
tag_dict = {}
for entry in df['tags']:
    # rows without tags are not strings (nan); skip them
    if not isinstance(entry, str):
        continue
    # an entry holds one or more tags separated by ', '; splitting a single tag
    # yields a one-element list, so every entry goes through the same cleaning
    for raw_tag in entry.split(', '):
        tag = raw_tag.replace('\n', '').strip()
        tag_dict[tag] = tag_dict.get(tag, 0) + 1
# unique tag names (the dict keys are already de-duplicated)
tags = list(tag_dict)
# sort dictionary by value
tag_dict = dict(sorted(tag_dict.items(), key=lambda item: item[1], reverse=True))
print("The 5 most popular creepypasta tags are:")
# slicing (instead of indexing 0..4) also works when there are fewer than 5 tags
for tag, count in list(tag_dict.items())[:5]:
    print(tag, ":", count)

In [None]:
# let's do the same for categories
# category name -> number of stories in that category
category_dict = {}
for entry in df['categories']:
    # rows without categories are not strings (nan); skip them
    if not isinstance(entry, str):
        continue
    # an entry holds one or more categories separated by ', '
    for raw_category in entry.split(', '):
        # some entries have "Please wait..." as a category on accident, let's remove those
        category = raw_category.replace('\n', '').replace('Please wait...', '').strip()
        # removing the placeholder can leave nothing behind; an empty name is not a
        # category (and '' would match every row in a later substring search)
        if not category:
            continue
        category_dict[category] = category_dict.get(category, 0) + 1
# unique category names (the dict keys are already de-duplicated)
categories = list(category_dict)
# sort dictionary by value
category_dict = dict(sorted(category_dict.items(), key=lambda item: item[1], reverse=True))
print("The 10 most popular creepypasta categories are:")
# slicing (instead of indexing 0..9) also works when there are fewer than 10 categories
for category, count in list(category_dict.items())[:10]:
    print(category, ":", count)



In [None]:
# hmm, lots of beings and entities! Is there a correlation between the category and the average rating?
# let's plot the average rating for each category
category_average_ratings = {}
for category in categories:
    # get avg rating from stories that have this category (stories may have multiple)
    # regex=False: match the name literally, so punctuation in a name is never read as a regex
    # na=False: rows without categories count as "no match" instead of producing NaN in the mask
    # NOTE(review): this is a substring match, so a name contained in a longer name also matches it
    has_category = df['categories'].str.contains(category, regex=False, na=False)
    category_average_ratings[category] = df.loc[has_category, 'average_rating'].mean()
# keep the plain list of means for the y-axis limits below
all_ratings = list(category_average_ratings.values())

# sort dict by avg rating
category_average_ratings = dict(sorted(category_average_ratings.items(), key=lambda item: item[1], reverse=True))

figure = plt.figure(figsize=(15, 10))

plot = plt.bar(category_average_ratings.keys(), category_average_ratings.values())

ticks = plt.xticks(rotation='vertical')

# zoom the y axis to the range of the means (with a 0.5 margin) so differences are visible
ax = plt.gca()
ymin = min(all_ratings) - 0.5
ymax = max(all_ratings) + 0.5
ax.set_ylim([ymin, ymax])

plt.title('Average Rating by Category')

In [None]:
# it appears that the highest rated creepypastas, on average, are those with happy endings!
# how many creepypastas have happy endings?
has_happy_ending = df['categories'].str.contains('Happy Endings')
happy_endings = df.loc[has_happy_ending, 'categories'].count()
print("Number of creepypastas with happy endings:", happy_endings)

In [None]:
# pretty small sample size! Which makes sense, creepypastas are supposed to be scary and unsettling
# let's look at the next highest rated category, True Scary Stories
is_true_scary_story = df['categories'].str.contains('True Scary Stories')
true_scary_stories = df[is_true_scary_story]
# count() only counts rows whose categories value is non-null
true_scary_stories_count = true_scary_stories['categories'].count()
print("Number of creepypastas with true scary stories:", true_scary_stories_count)

In [None]:
# Only a single creepypasta with a true scary story!
# This also makes sense, creepypastas are supposed to be fictional
# Let's look at this one true story.
first_row = 0
true_story = true_scary_stories.iloc[first_row]
true_story_name = true_story['story_name']
print("The true story: ", true_story_name)

In [None]:
# after doing some research, it turns out that this story is totally fake.
# https://en.wikipedia.org/wiki/Dybbuk_box

# let's look at the next highest rated category, Natural Disasters and Storms
is_natural_disaster = df['categories'].str.contains('Natural Disasters and Storms')
natural_disasters = df[is_natural_disaster]
# count() only counts rows whose categories value is non-null
natural_disasters_count = natural_disasters['categories'].count()
print("Number of creepypastas with natural disasters:", natural_disasters_count)

In [None]:
# Turns out the highest rated categories all have a super small sample size. Let's see how many stories of each category there are

figure = plt.figure(figsize=(15, 10))
# category_dict is sorted by count, so the bars run from most to least common
bar_labels = category_dict.keys()
bar_heights = category_dict.values()
plot = plt.bar(bar_labels, bar_heights)
ticks = plt.xticks(rotation='vertical')
plt.title('Number of Creepypastas per Category')

In [None]:
# let's do some analysis on the text itself
import nltk
from nltk.corpus import stopwords
# download the NLTK resources this notebook relies on
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

# join every story body into one big string, one story per line break
all_text_content = '\n'.join(df['body'])
print("Total characters in all creepypastas:", len(all_text_content))

# keep purely alphabetic tokens only, lower-cased
tokens = []
for token in nltk.word_tokenize(all_text_content):
    if token.isalpha():
        tokens.append(token.lower())

In [None]:
# keep only the tokens that are not English stopwords
words = [token for token in tokens if token not in stop_words]
word_total = len(words)
print("Total words in all creepypastas (excluding stopwords):", word_total)

In [None]:
# lexical diversity = distinct words / total words
# https://en.wikipedia.org/wiki/Lexical_diversity
distinct_words = set(words)
distinct_count, total_count = len(distinct_words), len(words)
lexical_diversity = distinct_count / total_count

print("Total distinct words:", distinct_count)
print("Lexical diversity:", lexical_diversity)

In [None]:
# a fun thing to look for: collocations
# this is a pair of words that occur together often in the text
# textual analysis inspired by https://medium.com/@finalfire/visualizing-data-from-norwegian-wood-by-haruki-murakami-502e117fdcc6

# note: built from all tokens (stopwords included), not from the filtered `words`
collocation_window = 4
ntext = nltk.Text(tokens)
ntext.collocations(window_size=collocation_window)

In [None]:
# fun to see, though with such a large corpus, the collocations are not very interesting
# let's look at the most common words
fdist = nltk.FreqDist(words)
most_common = fdist.most_common(20)
# most_common is a list of (word, count) pairs; split it into bar labels and heights
top_words = [word for word, count in most_common]
top_counts = [count for word, count in most_common]

figure = plt.figure(figsize=(15, 10))
plot = plt.bar(top_words, top_counts)
ticks = plt.xticks(rotation=45)
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.title('Most Common Words in Creepypastas')

In [None]:
# once again, not very exciting, but interestingly, one of the most common words is 'eyes'
# eyes can indeed be very creepy.
# additionally, 'night' is a common word, which makes sense, as many scary stories take place at night.

In [None]:
# let's look at the most common words in the titles now
all_titles = '\n'.join(df['story_name'])
print("Total characters in all creepypasta titles:", len(all_titles))
# same cleaning as for the bodies: alphabetic tokens, lower-cased, stopwords removed
title_tokens = []
for token in nltk.word_tokenize(all_titles):
    if token.isalpha():
        title_tokens.append(token.lower())
title_words = [token for token in title_tokens if token not in stop_words]
title_fdist = nltk.FreqDist(title_words)
title_most_common = title_fdist.most_common(20)
# split the (word, count) pairs into bar labels and heights
title_top_words = [word for word, count in title_most_common]
title_top_counts = [count for word, count in title_most_common]
plt.figure(figsize=(15, 10))
plt.bar(title_top_words, title_top_counts)
ticks = plt.xticks(rotation=45)
plt.xlabel('Word')
plt.ylabel('Frequency')
title = plt.title('Most Common Words in Creepypasta Titles')

In [None]:
# ah, much more interesting! (Likely because titles typically have much less generic text)
# turns out men are really scary, along with dark houses at night!

In [None]:
# what about words that rarely occur?
# https://en.wikipedia.org/wiki/Hapax_legomenon
import random

rare_words = fdist.hapaxes()
print("Total words that occur only once (hapax legomena):", len(rare_words))
# let's pick some at random
# a dedicated, seeded generator keeps the sample the same on every run, so any
# commentary written about the sampled words stays true after Restart & Run All
sample_size = min(10, len(rare_words))  # sample() raises if asked for more items than exist
random_words = random.Random(42).sample(rare_words, sample_size)
rand_words = ', '.join(random_words)
string = "Random sample of hapax legomena: " + rand_words
print(string)

In [None]:
# just from a few samples, it seems like a lot of these words are typos or made up words. others are totally unexpected
# that makes total sense, but is kinda disappointing

In [None]:
# now, let's take the highest rated story and perform textual analysis
# if several stories share the top rating, the first one in df is used
top_rating = df['average_rating'].max()
highest_rated = df[df['average_rating'] == top_rating].iloc[0]
print("Highest rated creepypasta:", highest_rated['story_name'])
highest_rated_text = highest_rated['body']
print(f"Total characters in '{highest_rated['story_name']}':", len(highest_rated_text))
# same cleaning as for the whole corpus: alphabetic tokens, lower-cased, stopwords removed
highest_rated_tokens = []
for token in nltk.word_tokenize(highest_rated_text):
    if token.isalpha():
        highest_rated_tokens.append(token.lower())
highest_rated_words = [token for token in highest_rated_tokens if token not in stop_words]
highest_rated_fdist = nltk.FreqDist(highest_rated_words)
highest_rated_most_common = highest_rated_fdist.most_common(20)
# split the (word, count) pairs into bar labels and heights
story_top_words = [word for word, count in highest_rated_most_common]
story_top_counts = [count for word, count in highest_rated_most_common]
plt.figure(figsize=(15, 10))
plt.bar(story_top_words, story_top_counts)
ticks = plt.xticks(rotation=45)
plt.xlabel('Word')
plt.ylabel('Frequency')
title = plt.title(f"Most Common Words in {highest_rated['story_name']}")

In [None]:
# the most common words alone give a pretty good idea of what the story is about (it is freaky, be warned!)
# let's look at the collocations
# note: built from all of the story's tokens (stopwords included)
story_collocation_window = 4
highest_rated_ntext = nltk.Text(highest_rated_tokens)
highest_rated_ntext.collocations(window_size=story_collocation_window)

In [None]:
# let's try to find the shape of the story, inspired by Kurt Vonnegut
# https://www.youtube.com/watch?v=oP3c1h8v2ZQ

from nltk.sentiment import SentimentIntensityAnalyzer
# the analyzer needs the VADER lexicon resource
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# split the highest rated story into sentences
story_text = highest_rated['body']
story_sentences = list(nltk.sent_tokenize(story_text))
print(f"Total sentences in '{highest_rated['story_name']}':", len(story_sentences))

In [None]:
# score every sentence with the analyzer's 'compound' value
sentiment_scores = []
for sentence in story_sentences:
    sentence_scores = sia.polarity_scores(sentence)
    sentiment_scores.append(sentence_scores['compound'])
plot = plt.plot(sentiment_scores)
plt.xlabel('Sentence')
plt.ylabel('Sentiment Score')
plt.title(f"Sentiment Score of Sentences in '{highest_rated['story_name']}'")
ax = plt.gca()

# fix the y axis to [-1, 1]
ax.set_ylim([-1, 1])

In [None]:
# now let's try to get a running average over a sentence window to get a better idea of the shape of the story
window_size = 15

sentiment_series = pd.Series(sentiment_scores)
# the first window_size - 1 entries are NaN because their window is not full yet
rolling_windows = sentiment_series.rolling(window_size)
sentiment_mean = rolling_windows.mean().values
x_axis = sentiment_series.index

plot = plt.plot(x_axis, sentiment_mean)
# dashed black line at 0 as a reference
zero_line = [0] * len(x_axis)
plt.plot(x_axis, zero_line, 'k--')

plt.xlabel('Sentence')
plt.ylabel('Compound Sentiment Score')
title = plt.title(f"Compound Sentiment Score of Sentences in '{highest_rated['story_name']}'")

In [None]:
# now we can see much more of the story's shape
# The story is overall negative, with a few positive spikes
# let's view the lowest point and highest point of the story
# this is the point in which the average sentiment score within the window is the lowest/highest
import numpy as np

# nanmin / nanmax ignore the NaN entries at the start of the rolling mean
lowest_point = np.nanmin(sentiment_mean)
# first position where the rolling mean equals that minimum
lowest_point_index = np.flatnonzero(sentiment_mean == lowest_point)[0]
lowest_sentence = story_sentences[lowest_point_index]
print("Lowest point of the story:", lowest_sentence)
print("Lowest point sentiment score:", lowest_point)
print()
highest_point = np.nanmax(sentiment_mean)
# first position where the rolling mean equals that maximum
highest_point_index = np.flatnonzero(sentiment_mean == highest_point)[0]
highest_sentence = story_sentences[highest_point_index]
print("Highest point of the story:", highest_sentence)
print("Highest point sentiment score:", highest_point)

In [None]:
# during the low point, the narrator is describing pain from an injury. Makes sense!
# during the high point, the narrator is using language such as 'I commanded', which could potentially be seen as positive / powerful
# but overall, the high point is still only very slightly positive, landing it in neutral territory.
# looking at the sentence per sentence scores, it seems there is one very positive sentence that is bringing the average up.
# what is this sentence?
# index of the first sentence with the maximum per-sentence score
highest_sentence_index = np.argmax(sentiment_scores)
top_sentence = story_sentences[highest_sentence_index]
top_sentence_score = sentiment_scores[highest_sentence_index]
print("Highest sentiment sentence:", top_sentence)
print("Highest sentiment score:", top_sentence_score)


In [None]:
# ah! the narrator is finally successful in his task (which in this case, is stabilization of his body) (long story)

In [None]:
# reminder of what the cleaned frame looks like before the modelling section
df.head(n=5)

In [None]:
# now let's do some machine learning!

# let's try to predict the average rating of a creepypasta based on the categories and estimated reading time.

# I think we need to encode the categories somehow
# we will use sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error



In [None]:
# work on an independent copy so the modelling steps leave df untouched
ml_df = df.copy(deep=True)

In [None]:
def _split_categories(raw_categories):
    """Turn one raw categories string into a list of cleaned category names.

    Removes line breaks and the stray 'Please wait...' text, splits on ', '
    and strips surrounding whitespace from every name.
    """
    cleaned = raw_categories.replace('\n', '').replace('Please wait...', '')
    return [name.strip() for name in cleaned.split(', ')]


# read the raw strings from df (ml_df is a copy of it) instead of from ml_df itself:
# that way re-running this cell starts from strings again rather than from the
# already-split lists, which have no .replace and would make the cell fail
ml_df['categories'] = df['categories'].apply(_split_categories)


In [None]:
# check that categories now holds lists of names
ml_df.head(n=5)

In [None]:
# using multilabelbinarizer to encode categories since a story can have multiple categories
mlb = MultiLabelBinarizer()
# one 0/1 column per category, aligned to ml_df's index
category_matrix = mlb.fit_transform(ml_df['categories'])
categories_encoded = pd.DataFrame(category_matrix, columns=mlb.classes_, index=ml_df.index)
# drop the target and the columns that are not used as features
non_feature_columns = ['categories', 'average_rating', 'tags', 'body', 'story_name', 'publish_date']
remaining_features = ml_df.drop(columns=non_feature_columns)
ml_df_encoded = pd.concat([remaining_features, categories_encoded], axis=1)
ml_df_encoded.head()

In [None]:
# ml_df itself is unchanged by the encoding step (the encoded features live in ml_df_encoded)
ml_df.head(n=5)

In [None]:
# train test split!
# 80% train / 20% test, with a fixed seed so the split is reproducible
features = ml_df_encoded
target = ml_df['average_rating']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)


In [None]:
# Create a model (linear regression isn't great, let's try random forest)
model = RandomForestRegressor(random_state=42)

# Fit on the training data, then predict the held-out test set
# (fit returns the fitted estimator, so the two calls can be chained)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

In [None]:
# empty table that collects one row per generated story and its predicted rating
predicted_rating_columns = ['story_name', 'predicted_rating', 'categories', 'estimated_reading_time']
predicted_ratings = pd.DataFrame(columns=predicted_rating_columns)

In [None]:
# now that we have fit a model, let's get predictions for randomly generated creepypastas

# let's use openai to generate random titles, their categories, and their estimated reading times
# the title is just for fun, we will use the categories and estimated reading time for our model
import os

import openai
from dotenv import load_dotenv

load_dotenv()
# the key comes from the environment, so it is never written into the notebook
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# instruction text followed by one valid category name per line
system_prompt = "You generate creepypasta titles, categories, and estimated reading times. Valid categories are:"
# NOTE(review): the offset 8 presumably skips the non-category feature columns of
# ml_df_encoded — confirm it still matches the frame's layout
category_columns = ml_df_encoded.columns[8:]
system_prompt += "".join("\n" + category for category in category_columns)

print(system_prompt)


In [None]:

# conversation sent to the model: the instructions, one worked example of the
# expected "Title / Categories / Estimated Reading Time" answer, then the real request
few_shot_messages = [
    {"role": "user", "content": system_prompt},
    {"role": "assistant", "content": "I can do that!"},
    {"role": "user", "content": "Generate a creepypasta title, category, and estimated reading time."},
    {"role": "assistant", "content": "Title: The Creepy Pasta\nCategories: Drugs and Addictions, Magic\nEstimated Reading Time: 5"},
    {"role": "user", "content": "Generate a unique creepypasta title, categories, and estimated reading time."},
]
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=few_shot_messages,
    temperature=1.2,
)

In [None]:
# take the text of the first choice and split it into lines
reply_text = response['choices'][0]['message']['content']
content = reply_text.split('\n')
print(content)

In [None]:
import re


def _field_value(line):
    """Return the text after the first ':' of a 'Label: value' line, stripped."""
    # maxsplit=1 keeps values that contain ':' themselves (e.g. a title with a subtitle) intact
    return line.split(':', 1)[1].strip()


# the reply may contain blank lines between the fields; keep only 'Label: value' lines
fields = [line for line in content if ':' in line]

title = _field_value(fields[0])
# note: this rebinds `categories` (previously the list of all category names) to the
# generated story's categories, which is what the following cells read
categories = [name.strip() for name in _field_value(fields[1]).split(',') if name.strip()]
# the reply may say "5", "5 minutes", "5 min", ...; the model needs a number in every case
reading_time_match = re.search(r'\d+', _field_value(fields[2]))
if reading_time_match is None:
    raise ValueError("Could not find a reading time in: " + fields[2])
reading_time = int(reading_time_match.group())

print("Title:", title)
print("Categories:", categories)
print("Reading Time:", reading_time)

In [None]:
# get predicted rating

# one all-zero feature row with the same columns the model was trained on
generated_df = pd.DataFrame(columns=ml_df_encoded.columns)
generated_df.loc[0] = 0
generated_df['estimated_reading_time'] = reading_time
# switch on the column of every generated category the model knows about
for category in categories:
    if category not in generated_df.columns:
        print("Unkown category:", category)
        continue
    print("Found category:", category)
    generated_df[category] = 1

generated_df.head()


In [None]:
# predict returns one value per row; generated_df has a single row
predicted_rating = model.predict(generated_df)
print(f"Predicted rating for '{title}':", predicted_rating[0])

In [None]:
# append the generated story and its prediction as a new row
new_prediction = pd.DataFrame([{
    'story_name': title,
    'predicted_rating': predicted_rating[0],
    'categories': categories,
    'estimated_reading_time': reading_time,
}])
predicted_ratings = pd.concat([predicted_ratings, new_prediction], ignore_index=True)

In [None]:
# now we have a dataframe of predicted ratings (based on category and reading time) for randomly generated creepypastas!
predicted_ratings.head(n=5)

In [None]:
# according to the analysis above, people seem to dislike spiders and insects.
# additionally longer creepypastas have higher ratings
# with this knowledge, let's try to create a poorly rated creepypasta

# start from one all-zero feature row, then set the features of the "bad" story
bad_creepypasta = pd.DataFrame(columns=ml_df_encoded.columns)
bad_creepypasta.loc[0] = 0
for column, value in (('estimated_reading_time', 1), ('Spiders', 1), ('Insects', 1)):
    bad_creepypasta[column] = value

predicted_rating = model.predict(bad_creepypasta)
print("Predicted rating for the bad creepypasta:", predicted_rating[0])

In [None]:
# let's see how well this creepypasta does with the same content but alternate reading times

# reading times 1..99 minutes, one row per value
sweep_reading_times = list(range(1, 100))
# build the whole all-zero frame in one call instead of adding it row by row
bad_creepypasta_reading_times = pd.DataFrame(
    0, index=range(len(sweep_reading_times)), columns=ml_df_encoded.columns
)
bad_creepypasta_reading_times['Spiders'] = 1
bad_creepypasta_reading_times['Insects'] = 1
bad_creepypasta_reading_times['estimated_reading_time'] = sweep_reading_times

plt.plot(bad_creepypasta_reading_times['estimated_reading_time'], model.predict(bad_creepypasta_reading_times))
plt.xlabel('Estimated Reading Time')
plt.ylabel('Predicted Rating')
title = plt.title('Predicted Rating of Bad Creepypasta by Estimated Reading Time')

In [None]:
# now let's keep the same reading time but change the content

# `categories` was rebound to the generated story's categories when the model reply
# was parsed, so it no longer lists every category. Take the names from category_dict
# instead, and use this one list for both the rows and the x axis so that every
# prediction lines up with its label.
sweep_categories = [name for name in category_dict if name in ml_df_encoded.columns]

# one row per category: reading time fixed at 60, only that category switched on
bad_creepypasta_content = pd.DataFrame(
    0, index=range(len(sweep_categories)), columns=ml_df_encoded.columns
)
bad_creepypasta_content['estimated_reading_time'] = 60
for row, name in enumerate(sweep_categories):
    bad_creepypasta_content.loc[row, name] = 1

figure = plt.figure(figsize=(15, 10))

plt.plot(sweep_categories, model.predict(bad_creepypasta_content))
ticks = plt.xticks(rotation='vertical')
plt.xlabel('Category')
plt.ylabel('Predicted Rating')
# the rows use a 60 minute reading time, so the title says "Long" rather than "Short"
title = plt.title('Predicted Rating of Long Creepypasta by Category')

In [None]:
# and just for fun to test our model, let's try to create a long creepypasta with a happy ending
# start from one all-zero feature row, then set the two features of interest
happy_ending_creepypasta = pd.DataFrame(columns=ml_df_encoded.columns)
happy_ending_creepypasta.loc[0] = 0
for column, value in (('estimated_reading_time', 60), ('Feelspastas and Happy Endings', 1)):
    happy_ending_creepypasta[column] = value

predicted_rating = model.predict(happy_ending_creepypasta)
print("Predicted rating for the happy ending creepypasta:", predicted_rating[0])

In [None]:
# fun project! Now we know how to create the best creepypastas, and can predict their ratings!