In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

from aem.kae import pRest


# Notebook-level switch, presumably meant to control whether decision-tree plots are drawn.
# NOTE(review): no cell visible here reads this flag -- confirm where it should be honoured.
SHOW_DECISION_TREES = False

In [None]:
# Read the post-EDA CSV and discard its "Unnamed: 0" column in a single expression
df = pd.read_csv("data/post_eda.csv").drop(columns=["Unnamed: 0"])
print(df.columns)

In [None]:
label_col = 'SuccessfulBool'

# Restrict to numeric/boolean columns and discard every row that has a missing value
numerical_data = df.select_dtypes(include=['number', 'bool']).dropna()

# Target column on one side, all remaining columns as predictors on the other
prey = numerical_data[label_col]
preX = numerical_data.drop(columns=[label_col])

In [None]:
# Determine the ideal number of features for linear regression by cross-validating
# Recursive Feature Elimination (RFE) at every possible subset size.
#
# The RFE step is placed inside the cross-validated pipeline so that each fold ranks
# features using only its own training rows. Fitting RFE on the full dataset first and
# cross-validating afterwards lets the held-out rows influence which features are kept,
# which makes the reported validation error optimistic.
# NOTE(review): the target is the label column, so this scores a linear model of a
# success flag by MSE -- confirm that is the intended selection criterion.

# Base estimator used both for ranking features and for the final fit
model = LinearRegression()

# Candidate subset sizes: 1 .. number of available predictor columns
num_features = list(range(1, len(preX.columns) + 1))
cv_scores = []

for n in num_features:
    # Feature selection + regression are refit together inside every CV fold
    select_then_fit = make_pipeline(RFE(model, n_features_to_select=n), model)
    fold_scores = cross_val_score(select_then_fit, preX, prey, cv=5, scoring="neg_mean_squared_error")
    # scikit-learn reports negated MSE (higher is better); flip the sign back
    cv_scores.append(-np.mean(fold_scores))

# Subset size with the lowest cross-validated MSE (first one wins on ties)
optimal_num_features = num_features[np.argmin(cv_scores)]

# Refit RFE on all rows at the chosen size to obtain the final feature list
rfe_optimal = RFE(model, n_features_to_select=optimal_num_features)
rfe_optimal.fit(preX, prey)

# Extract selected features
optimal_features = list(preX.columns[rfe_optimal.support_])
print(f"Optimal number of features: {optimal_num_features}")
print(f"Selected optimal features: {optimal_features}")

# Plot Number of Features vs. Validation Error
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(num_features, cv_scores, marker="o", linestyle="-", color="b")
ax.axvline(optimal_num_features, color='r', linestyle="--", label=f"Optimal Features: {optimal_num_features}")
ax.set_xlabel("Number of Features")
ax.set_ylabel("Cross-Validated MSE")
ax.set_title("Determining the Optimal Number of Features for MLR")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Hold out 15% of the rows (shuffled, fixed seed); the remainder is the training set
X_train, X_test, y_train, y_test = train_test_split(
    preX, prey, test_size=0.15, shuffle=True, random_state=42
)

# Ridge regression (L2 regularization); alpha sets the penalty strength and is
# hand-picked here rather than tuned
mlr = Ridge(alpha=1.0).fit(X_train, y_train)

# Report the fitted coefficient of each predictor column
mlr_coefficients = pd.Series(mlr.coef_, index=preX.columns)
print("Model Coefficients:", mlr_coefficients, sep="\n")

In [None]:
def _candidate_classifiers(random_state):
    """Yield ``(description, unfitted estimator)`` pairs in the order they are evaluated.

    A generator is used so that each fitted ensemble can be released once it has been
    scored, instead of keeping every candidate model alive at the same time.
    """
    # 1) Single decision tree
    yield "Standard DTR", DecisionTreeClassifier(random_state=random_state)

    # 2) Bagged decision trees
    # NOTE(review): an integer max_samples is an absolute row count, so each bagged tree
    # is trained on only 10 sampled rows -- confirm this is intended.
    yield "Bagging DTR", BaggingClassifier(
        DecisionTreeClassifier(random_state=random_state),
        n_estimators=500,
        max_samples=10,
        bootstrap=True,
        n_jobs=-1,
        random_state=random_state,
    )

    # 3) Random forest limited by leaf count
    yield "RFC", RandomForestClassifier(
        n_estimators=500,
        max_leaf_nodes=16,
        n_jobs=-1,
        random_state=random_state,
    )

    # 4) AdaBoost over depth-1 trees (stumps)
    yield "Adaboost", AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1),
        n_estimators=200,
        learning_rate=0.5,
        random_state=random_state,
    )

    # 5) Random forests with varying max_depth
    for depth in range(2, 8):
        yield f"New RFC, depth {depth}", RandomForestClassifier(
            n_estimators=500,
            max_depth=depth,
            n_jobs=-1,
            random_state=random_state,
        )

    # 6) AdaBoost with varying learning rates
    for lr in (0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5):
        yield f"New Adaboost, LR {lr}", AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1),
            n_estimators=200,
            learning_rate=lr,
            random_state=random_state,
        )


def evaluate_classifiers(X, y, test_size=0.20, random_state=42):
    """Fit a fixed set of tree-based classifiers and return the most accurate one.

    The data is split once into train/test partitions; every candidate is fitted on the
    train partition and scored by accuracy on the test partition. Each score is printed,
    prefixed with the notebook-level ``label_col`` name.

    Args:
        X: DataFrame of predictor columns.
        y: Target labels aligned with the rows of ``X``.
        test_size: Fraction of rows held out for scoring.
        random_state: Seed for the train/test split and for the seeded estimators, so
            repeated calls on the same data give the same numbers.

    Returns:
        ``(features, best_clf)``: the list of column names of ``X`` and the fitted
        candidate with the highest test accuracy (the earliest candidate wins ties).

    Note:
        The winner is chosen on the same held-out rows that produce the printed
        accuracy, so the reported best accuracy is an optimistic estimate.
    """
    features = list(X.columns)
    print(f"Features: {features}")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=random_state
    )

    best_acc = 0.0
    best_clf = None

    for description, clf in _candidate_classifiers(random_state):
        clf.fit(X_train, y_train)
        acc = accuracy_score(y_test, clf.predict(X_test))
        print(f"{label_col}: {description} accuracy: {acc}")
        # The `is None` guard guarantees a model is returned even if every score is 0.0
        if best_clf is None or acc > best_acc:
            best_acc, best_clf = acc, clf

    print(f"\nBest model: {best_clf.__class__.__name__} with accuracy {best_acc:.4f}")
    return features, best_clf

In [None]:
# Baseline run: the classifiers see every predictor column, including backer counts
# and pledge amounts
features, best_model = evaluate_classifiers(X=preX, y=prey)

In [None]:
# Second run: withhold the backer-count and pledged-amount columns from the classifiers
inaccesible_cols = ['backers_count', 'usd_pledged']
features, best_model = evaluate_classifiers(
    X=preX.drop(inaccesible_cols, axis="columns"),
    y=prey,
)

In [None]:
# Third run: additionally withhold the kickstarter moderator columns
# ('spotlight', 'staff_pick') on top of the backer-count and pledge columns
inaccesible_cols = ['backers_count', 'usd_pledged', 'spotlight', 'staff_pick']
features, best_model = evaluate_classifiers(
    X=preX.drop(inaccesible_cols, axis="columns"),
    y=prey,
)

In [None]:
# Show the most recently selected model, then the feature names it was trained on
print(best_model, features, sep="\n")

In [None]:
# Accuracy of the selected model over every row of the dataset. Most of these rows were
# part of the split the model was trained on, so this is an optimistic, largely
# in-sample figure rather than an estimate of performance on unseen data.
y_pred = best_model.predict(preX[features])
acc = accuracy_score(prey, y_pred)
print(f"Accuracy on full dataset (includes training rows): {acc}")

# Out-of-sample estimate: a fresh copy of the model is refit on each of 5 folds and
# scored on the rows held out of that fold. cross_val_score clones the estimator, so
# the already-fitted best_model is left untouched.
cv_acc = cross_val_score(best_model, preX[features], prey, cv=5, scoring="accuracy")
print(f"5-fold cross-validated accuracy: {cv_acc.mean():.4f} +/- {cv_acc.std():.4f}")

In [None]:
# Draw the selected model only when it is a single decision tree AND plotting has been
# enabled through SHOW_DECISION_TREES in the configuration cell. The canvas is
# 200x100 inches, so rendering is expensive and is skipped unless explicitly requested.
if SHOW_DECISION_TREES and isinstance(best_model, DecisionTreeClassifier):
    fig, ax = plt.subplots(figsize=(200, 100))
    tree.plot_tree(best_model, feature_names=features, filled=True, ax=ax)
    plt.show()