Haydon's EDA notebook. Viewer discretion advised.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-level switch, off by default.
# NOTE(review): not read anywhere in the visible cells -- presumably it gates
# decision-tree output elsewhere in the notebook; confirm before removing.
SHOW_DECISION_TREES = False


In [None]:
# Load the raw Kickstarter export. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, so each column gets a single
# inferred dtype.
df = pd.read_csv(
    "data/kickstarter_data_full.csv",
    low_memory=False,
)
df.head()

In [None]:
# First pruning pass: remove columns that are not wanted as features.
# The grouping comments record why each batch is dropped.
useless_cols = [
    # truly worthless columns
    "Unnamed: 0", "id", "photo", "slug",
    "currency_symbol", "currency_trailing_code",
    # would need parsing to yield anything useful; a hindrance for now
    "state_changed_at", "creator", "location",
    "profile", "urls", "source_url",
    # meaning unknown, and reported to be mostly NaN
    "friends", "is_starred", "is_backing", "permissions",
    # durations; per the original author the data already carries
    # int-days versions of these
    "create_to_launch", "launch_to_deadline", "launch_to_state_change",
    # weekday columns: could be encoded later, likely uninformative
    "deadline_weekday", "state_changed_at_weekday",
    "created_at_weekday", "launched_at_weekday",
    # pre-split date parts of `deadline`
    "deadline_month", "deadline_day", "deadline_yr", "deadline_hr",
    # pre-split date parts of `state_changed_at`
    "state_changed_at_month", "state_changed_at_day",
    "state_changed_at_yr", "state_changed_at_hr",
    # pre-split date parts of `created_at`
    "created_at_month", "created_at_day", "created_at_yr", "created_at_hr",
    # pre-split date parts of `launched_at`
    "launched_at_month", "launched_at_day",
    "launched_at_yr", "launched_at_hr",
]
# In-place drop: raises KeyError if any listed column is missing, which also
# means this cell cannot be run twice against the same frame.
df.drop(columns=useless_cols, inplace=True)
df.head()

In [None]:
# Express every project's goal in USD.
# "Before" look at the non-USD rows. Only the last expression of a notebook
# cell is rendered, so this first preview produces no visible output.
df.loc[df["currency"].ne("USD")].head()
# goal * static_usd_rate -> usd_goal
# (presumably goal is in the project's native currency and static_usd_rate is
# the native-to-USD rate -- confirm against the dataset documentation)
df["usd_goal"] = df["goal"].mul(df["static_usd_rate"])
# "After" look: the same rows, now carrying the new usd_goal column.
df.loc[df["currency"].ne("USD")].head()

In [None]:
# Second pruning pass, run after usd_goal has been derived.
useless_cols = [
    # no longer needed now that the goal is standardized to USD
    "goal",
    "pledged",
    "static_usd_rate",
    "currency",
    # not our target -- the target is 'SuccessfulBool'
    "state",
]
df.drop(columns=useless_cols, inplace=True)
df.head()

In [None]:
# Peek at the raw category labels (missing values included) before cleanup.
print(df["category"].unique())

# Projects with no category are bucketed under "Misc".
df["category"] = df["category"].fillna("Misc")

# One-hot encode "category": one indicator column per label, named cat_<label>.
category_dummies = pd.get_dummies(df["category"], prefix="cat")

# Replace the original text column with its indicator columns, which are
# appended after the remaining columns.
df = pd.concat([df.drop(columns=["category"]), category_dummies], axis=1)

In [None]:
# Log-transform the USD goal. `col` names the column the following steps
# operate on.
col = 'usd_goal'

# Median of that column; safe_log substitutes it for zero-valued goals so the
# log stays finite (log(0) would be -inf).
# NOTE(review): despite the name, this is the median *goal*, not an amount
# raised. The name is kept because safe_log reads it as a global.
median_raised = df[col].median()
def safe_log(x: float) -> float:
    """Return the natural log of *x*, imputing the median when it is undefined.

    Zero, negative and missing (NaN) values have no finite real logarithm.
    Such inputs are replaced by the module-level ``median_raised`` before the
    log is taken, so the result is finite whenever that median is positive.
    Positive inputs are passed straight to ``np.log``.

    Args:
        x: A single value from the column being transformed.

    Returns:
        ``np.log(x)`` for positive ``x``, otherwise ``np.log(median_raised)``.
    """
    # Written as `not x > 0` rather than `x <= 0` on purpose: every comparison
    # against NaN is False, so this form also catches missing values.
    if not x > 0:
        x = median_raised
    return np.log(x)

# Store the log-scaled values in a new column next to the raw one.
log_col = col + ' (log)'
df[log_col] = df[col].apply(safe_log)

# Histogram (20 bins) with a KDE overlay to inspect the shape of the
# transformed distribution.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df[log_col], bins=20, kde=True, ax=ax)
ax.set_title(f"Distribution of {log_col}")
ax.set_xlabel(log_col)
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

In [None]:
# Persist the cleaned frame for the next stage.
print(df.columns)

# Shuffle the rows, then renumber them 0..n-1 so the old order is not kept
# in the index.
# NOTE(review): no random_state is passed, so the row order differs on every run.
df = df.sample(frac=1)
df = df.reset_index(drop=True)

# NOTE(review): to_csv writes the index by default, as an unnamed first column.
# Reading the file back with a plain read_csv yields an "Unnamed: 0" column;
# consider index=False once downstream readers are checked.
df.to_csv("data/post_eda.csv")