Haydon's EDA notebook. Viewer discretion advised.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Notebook-wide flag; presumably gates rendering of decision-tree plots.
# NOTE(review): not referenced by any cell visible here — confirm where it is consumed.
SHOW_DECISION_TREES = False


In [None]:
# Load the full Kickstarter dataset into `df`.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, which avoids per-chunk (mixed) dtype inference on wide CSVs.
df = pd.read_csv("data/kickstarter_data_full.csv", low_memory=False)
# Last expression of the cell: previews the first rows.
df.head()

In [None]:
# Prune columns that carry no usable signal for this analysis.
# Each group keeps the original author's reason for discarding it.
useless_cols = [
    # truly worthless columns
    "Unnamed: 0", "id", "photo", "slug", "currency_symbol", "currency_trailing_code",
    # may need processing to extract information, currently a hindrance
    "state_changed_at", "creator", "location", "profile", "urls", "source_url",
    # absolutely no idea what these are, but lots of NaN
    "friends", "is_starred", "is_backing", "permissions",
    # these are already converted to int_days for our convenience
    "create_to_launch", "launch_to_deadline", "launch_to_state_change",
    # weekday names: might encode later, likely no information
    "deadline_weekday", "state_changed_at_weekday", "created_at_weekday", "launched_at_weekday",
    # month/day/year/hour breakdowns of each timestamp (encodings that nobody asked for)
    "deadline_month", "deadline_day", "deadline_yr", "deadline_hr",
    "state_changed_at_month", "state_changed_at_day", "state_changed_at_yr", "state_changed_at_hr",
    "created_at_month", "created_at_day", "created_at_yr", "created_at_hr",
    "launched_at_month", "launched_at_day", "launched_at_yr", "launched_at_hr",
]
# Raises KeyError if any listed column is absent from the frame.
df.drop(columns=useless_cols, inplace=True)
df.head()

In [None]:
# Express every campaign goal in USD: goal (local currency) times the row's
# static_usd_rate is stored in a new `usd_goal` column.
is_foreign_currency = df['currency'] != "USD"
# Peek at non-USD rows before the conversion (not the last expression of the
# cell, so a notebook will not render it).
df[is_foreign_currency].head()
df['usd_goal'] = df['goal'] * df['static_usd_rate']
# Same rows after the conversion; this is the cell's displayed output.
df[is_foreign_currency].head()

In [None]:
# Second pruning pass, after `usd_goal` has been derived.
useless_cols = [
    # useless now that we've standardized USD
    'goal', 'pledged', 'static_usd_rate', 'currency',
    # not our target! it is 'SuccessfulBool'
    'state',
]
df.drop(columns=useless_cols, inplace=True)
df.head()

In [None]:
# Curious what categories we have (I just KNOW that music will be the least successful cat)
print(df["category"].unique())

# Rows without a category get an explicit "Misc" bucket instead of NaN,
# so they receive their own dummy column below.
df["category"] = df["category"].fillna(value="Misc")

# One-hot encode "category" into cat_<name> columns.
# dtype=int is explicit on purpose: pandas >= 2.0 emits bool dummies by
# default, and bool columns are NOT matched by the later
# select_dtypes(include='number') call, so every cat_* feature would be
# silently dropped from the modelling data. Integer 0/1 columns behave the
# same on every pandas version.
category_dummies = pd.get_dummies(df['category'], prefix='cat', dtype=int)

# Append the encoded columns and remove the original text column.
df = pd.concat([df, category_dummies], axis=1)
df.drop(columns=['category'], inplace=True)

In [None]:
# Keep only numeric columns, then discard every row that still has a missing
# value in any of them.
numeric_only = df.select_dtypes(include='number')
numerical_data = numeric_only.dropna()
print(numerical_data.columns)

In [None]:
# Separate the prediction target from the feature matrix.
label_col = 'SuccessfulBool'
y = numerical_data[label_col]
X = numerical_data.drop(label_col, axis=1)

In [None]:
# Same feature importance method as before, but now we determine the ideal
# number of features for linear regression by cross-validating Recursive
# Feature Elimination (RFE) at every possible feature count.

# Base estimator RFE uses to rank features
model = LinearRegression()

# Candidate feature counts: 1 .. number of columns in X
num_features = list(range(1, len(X.columns) + 1))
cv_scores = []

for n in num_features:
    rfe = RFE(model, n_features_to_select=n)

    # Selection and regression are cross-validated together as ONE pipeline,
    # so RFE is re-fitted on the training part of each fold only. Fitting RFE
    # on all of X first and cross-validating the transformed data afterwards
    # lets the held-out rows influence which features are kept, which biases
    # the reported error downwards (data leakage).
    # cross_val_score clones the pipeline per fold, so `rfe` stays unfitted.
    score = np.mean(cross_val_score(make_pipeline(rfe, LinearRegression()), X, y,
                                    cv=5, scoring="neg_mean_squared_error"))
    cv_scores.append(-score)  # scorer returns negative MSE; flip to positive MSE

# Feature count with the lowest cross-validated MSE (argmin keeps the
# smallest count when several tie)
optimal_num_features = num_features[np.argmin(cv_scores)]

# Final selection: refit RFE on the full data with the chosen count
rfe_optimal = RFE(model, n_features_to_select=optimal_num_features)
rfe_optimal.fit(X, y)

# Extract selected features
optimal_features = list(X.columns[rfe_optimal.support_])
print(f"Optimal number of features: {optimal_num_features}")
print(f"Selected optimal features: {optimal_features}")

# Plot Number of Features vs. Validation Error
plt.figure(figsize=(8, 6))
plt.plot(num_features, cv_scores, marker="o", linestyle="-", color="b")
plt.axvline(optimal_num_features, color='r', linestyle="--", label=f"Optimal Features: {optimal_num_features}")
plt.xlabel("Number of Features")
plt.ylabel("Cross-Validated MSE")
plt.title("Determining the Optimal Number of Features for MLR")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Hold out 15% of the rows (fixed seed) and fit the regression on the rest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Ridge regression = linear regression with an L2 penalty; alpha sets the
# penalty strength (hand-picked here, no hyperparameter search).
mlr = Ridge(alpha=1.0).fit(X_train, y_train)

# One coefficient per feature, labelled with the feature's column name.
mlr_coefficients = pd.Series(mlr.coef_, index=X_train.columns)
print("Model Coefficients:", mlr_coefficients, sep="\n")

In [None]:
def _report_accuracy(clf, description, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the train split and print its accuracy on the test split."""
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{label_col}: {description} accuracy: {accuracy_score(y_test, y_pred)}")


def evaluate_classifiers(X, y, random_state=42):
    """Train several tree-based classifiers on one split and print their accuracies.

    The data is split 80/20, then each model is fitted on the train part and
    scored on the test part: a bagging ensemble of decision trees, a single
    decision tree, a random forest, AdaBoost on decision stumps, a sweep of
    random-forest depths (2-7) and a sweep of AdaBoost learning rates.

    Args:
        X: Feature matrix.
        y: Class labels aligned with ``X``.
        random_state: Seed for the train/test split. It is fixed by default so
            that repeated calls (e.g. with different feature subsets) are
            scored on the same rows and their accuracies are comparable.
            The models themselves always use seed 42.

    Returns:
        None. Results are printed, prefixed with the module-level ``label_col``.
    """
    # Model testing from Ensemble assignment
    split = train_test_split(X, y, test_size=0.20, random_state=random_state)

    # Bagging ensemble of decision trees.
    # NOTE(review): an integer max_samples is an absolute row count, so each of
    # the 500 trees is trained on only 10 rows — confirm this is intended
    # (a float such as 0.1 would mean a fraction of the training set).
    bag_clf = BaggingClassifier(
        DecisionTreeClassifier(random_state=42), n_estimators=500,
        max_samples=10, bootstrap=True, n_jobs=-1, random_state=42)
    _report_accuracy(bag_clf, "Bagging DTR", *split)

    # Single unrestricted decision tree, as a comparison for the ensemble
    _report_accuracy(DecisionTreeClassifier(random_state=42), "Standard DTR", *split)

    # Random forest limited by leaf count
    _report_accuracy(
        RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42),
        "RFC", *split)

    # AdaBoost over depth-1 trees (decision stumps)
    _report_accuracy(
        AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1), n_estimators=200, learning_rate=0.5, random_state=42),
        "Adaboost", *split)

    # Random forest: sweep the maximum tree depth
    for d in range(2, 8):
        _report_accuracy(
            RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42, max_depth=d),
            f"New RFC, depth {d}", *split)

    # AdaBoost: sweep the learning rate
    ada_lrs = [0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
    for lr in ada_lrs:
        _report_accuracy(
            AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=1), n_estimators=200, learning_rate=lr, random_state=42),
            f"New Adaboost, LR {lr}", *split)


In [None]:
# Baseline: evaluate the classifiers with NO restrictions on the data,
# i.e. backer counts and pledge amounts are available as features.
# NOTE(review): both presumably become known only once a campaign is running or
# finished, so this score is likely optimistic — confirm.
evaluate_classifiers(X, y)

In [None]:
# Ablation: same evaluation with the backer-count feature removed.
X_without_backers = X.drop(columns=['backers_count'])
evaluate_classifiers(X_without_backers, y)

In [None]:
# Ablation: same evaluation with the USD pledge amount removed.
X_without_pledged = X.drop(columns=['usd_pledged'])
evaluate_classifiers(X_without_pledged, y)

In [None]:
# Ablation: same evaluation with both backer counts and pledge amounts removed.
X_without_backers_or_pledged = X.drop(columns=['backers_count', 'usd_pledged'])
evaluate_classifiers(X_without_backers_or_pledged, y)