In [None]:
# Cell 0
# All the needed imports

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
from IPython.display import display


In [None]:
# Cell 1
# Read the raw projects CSV and discard every row that has a missing value
raw_projects_path = 'ks-projects-201801.csv'
original_df = pd.read_csv(raw_projects_path).dropna()

# Preview the first rows of the cleaned frame
display(original_df.head())

In [None]:
# Cell 2
# Reorganize the data

# Columns removed here: pledged, usd pledged, usd_pledged_real, goal, backers.
# "goal" has to go before the rename below, which reuses that name.
dropped_columns = ["pledged", "usd pledged", "usd_pledged_real", "goal", "backers"]
df = original_df.drop(columns=dropped_columns)

# Rename the columns: "ID" and "usd_goal_real" get new names, the remaining
# entries map a column onto its own name
kept_as_is = [
    "name",
    "category",
    "main_category",
    "deadline",
    "launched",
    "state",
    "currency",
    "country",
]
column_names = {"ID": "id"}
column_names.update({column: column for column in kept_as_is})
column_names["usd_goal_real"] = "goal"
df = df.rename(columns=column_names)

# Parse both timestamps, then truncate them to year-month-day
launched_ts = pd.to_datetime(df['launched'], format="%Y-%m-%d %H:%M:%S")
deadline_ts = pd.to_datetime(df['deadline'], format="%Y-%m-%d")
df['launched'] = pd.to_datetime(launched_ts.dt.date)
df['deadline'] = pd.to_datetime(deadline_ts.dt.date)

# Number of days between launch and deadline
df['duration'] = (df['deadline'] - df['launched']).dt.days

# Month names first, years second, so the new columns keep the order
# start_month, end_month, start_year, end_year
date_columns = {'start': 'launched', 'end': 'deadline'}
for prefix, source in date_columns.items():
    df[f'{prefix}_month'] = df[source].dt.month_name()
for prefix, source in date_columns.items():
    df[f'{prefix}_year'] = df[source].dt.year

# Day-of-week features are currently disabled:
# df['start_day_name'] = df['launched'].dt.day_name()
# df['end_day_name'] = df['deadline'].dt.day_name()
display(df.head())

In [None]:
# Cell 3
# Shape before filtering
print(df.shape)

# Keep only the rows whose state is 'failed' or 'successful'
finished = df['state'].isin(['failed', 'successful'])
df = df[finished]

# Drop the rows whose country is the value 'N,0"'
df = df[df['country'] != 'N,0"']

# Renumber the remaining rows from zero
df = df.reset_index(drop=True)

# Shape after filtering
print(df.shape)

# Final data before one hot encoding everything
display(df.head())

In [None]:
# Cell 4
def sentiment_analysis(df: pd.DataFrame, name):
    """Append sentiment scores for the text column ``name`` to ``df``.

    Every value of ``df[name]`` is scored with NLTK's
    ``SentimentIntensityAnalyzer`` and its ``neg``, ``neu``, ``pos`` and
    ``compound`` scores are stored in four new columns named
    ``{name}_negative``, ``{name}_neutral``, ``{name}_positive`` and
    ``{name}_compound``.

    Args:
        df: Frame holding the text column.
        name: Name of the column to score.

    Returns:
        A new frame with a fresh 0..n-1 index: the columns of ``df`` followed
        by the four score columns. ``df`` itself is not modified.
    """
    sia = SentimentIntensityAnalyzer()

    # Output column suffix -> key in the dict returned by polarity_scores()
    score_keys = {
        'negative': 'neg',
        'neutral': 'neu',
        'positive': 'pos',
        'compound': 'compound',
    }
    column_names = [f"{name}_{suffix}" for suffix in score_keys]

    # Walk the text column directly; iterrows() would build a full Series for
    # every row just to read a single cell from it.
    rows = []
    for text in tqdm(df[name], total=len(df)):
        scores = sia.polarity_scores(text)
        rows.append([scores[key] for key in score_keys.values()])

    # Explicit columns and dtype keep the four score columns even when there
    # are no rows (np.array([]) is 1-D and made the constructor fail).
    data_df = pd.DataFrame(rows, columns=column_names, dtype=np.float64)

    # Align both frames on a fresh positional index before gluing them together
    return pd.concat([df.reset_index(drop=True), data_df], axis=1)
df = sentiment_analysis(df, 'name')

In [None]:
# Cell 5
# Show the first five rows, now including the sentiment columns
display(df.head(5))


In [None]:
# Cell 6
def one_hot_encode_column(df: pd.DataFrame, name):
    """Replace column ``name`` of ``df`` with one indicator column per distinct value.

    The distinct values are taken in order of first appearance and printed.
    Each one yields a ``uint8`` column called ``{name}_{value}`` that holds 1
    in the rows carrying that value and 0 elsewhere.

    Args:
        df: Frame containing the column to encode.
        name: Name of the column to encode.

    Returns:
        A new frame with a fresh 0..n-1 index: every column of ``df`` except
        ``name``, followed by the indicator columns. ``df`` is not modified.
    """
    # The existing column
    column = df[name]

    # Data frame without the existing column
    remaining = df.drop(columns=[name]).reset_index(drop=True)

    # Distinct values, in order of first appearance
    unique = column.unique()
    print(unique)

    # Position of every row's value inside `unique`. The index lookup is
    # vectorized and matches NaN with NaN; the former per-row dict lookup
    # raised KeyError for missing values in float columns.
    codes = pd.Index(unique).get_indexer(column)

    # One row per sample, one column per distinct value
    row_count = len(column)
    encoded = np.zeros((row_count, len(unique)), dtype=np.uint8)
    encoded[np.arange(row_count), codes] = 1

    # Generate column names
    column_names = [f"{name}_{str(value)}" for value in unique]
    data_df = pd.DataFrame(encoded, columns=column_names)

    return pd.concat([remaining, data_df], axis=1)

# One-hot encode every categorical column, in this order
categorical_columns = [
    "category",
    "main_category",
    "currency",
    "country",
    "start_month",
    "end_month",
]
for column_name in categorical_columns:
    df = one_hot_encode_column(df, column_name)


In [None]:
# Cell 7
# Features: everything except the free text, the label and the raw dates.
# drop() returns a new frame, so df itself stays untouched.
df_features = df.drop(columns=['name', 'state', 'deadline', 'launched'])

# Standardize goal and duration (subtract the mean, divide by the standard deviation)
for column_name in ('goal', 'duration'):
    column_values = df_features[column_name]
    df_features[column_name] = (column_values - column_values.mean()) / column_values.std()

# Min-max scale the launch and deadline years into [0, 1]
for column_name in ('start_year', 'end_year'):
    column_values = df_features[column_name]
    year_span = column_values.max() - column_values.min()
    df_features[column_name] = (column_values - column_values.min()) / year_span

# Labels: the id plus a 0/1 flag that is 1 for 'successful' rows.
# The explicit copy makes df_labels an independent frame; assigning into the
# bare df[['id', 'state']] selection raised SettingWithCopyWarning.
df_labels = df[['id', 'state']].copy()
df_labels['state'] = (df['state'] == 'successful').astype(np.uint8)

In [None]:
# Cell 8
# Preview the labels and the features, then print the earliest and latest launch year
display(df_labels.head())
display(df_features.head())
launch_years = df['launched'].dt.year
print(launch_years.min())
print(launch_years.max())

In [None]:
# Cell 9
def success_failure_summary(labels):
    """Print how many rows of ``labels`` are successes and failures.

    The ``state`` column is summed to count the successes, so it is expected
    to hold 1 for a success and 0 for a failure. Counts are printed first,
    then the share of each outcome as a percentage rounded to two decimals.
    """
    total = labels.shape[0]
    success_count = labels['state'].sum()
    fail_count = total - success_count

    outcomes = (("success", success_count), ("fail", fail_count))
    for outcome, count in outcomes:
        print(f"# of {outcome}: {count}")
    for outcome, count in outcomes:
        print(f"Percent containing {outcome}: {round(count / total * 100, 2)}%")

print(" -- Before --")
success_failure_summary(df_labels)
print()

# Split the rows by outcome
success_mask = df_labels['state'] == 1
fail_mask = df_labels['state'] != 1

# Size of the smaller class
min_length = min(success_mask.sum(), fail_mask.sum())

# Keep the first `min_length` rows of each outcome so both classes end up equally large
success_features = df_features[success_mask].head(min_length)
success_labels = df_labels[success_mask].head(min_length)
fail_features = df_features[fail_mask].head(min_length)
fail_labels = df_labels[fail_mask].head(min_length)

# Create final features and labels (successes first, failures after them)
final_labels = pd.concat([success_labels, fail_labels], axis=0)
final_features = pd.concat([success_features, fail_features], axis=0)

# Uncomment if you do not want to even the success and failed
# final_labels = df_labels
# final_features = df_features

print(df_features.shape)
print(final_features.shape)
display(final_features.head())

print(" -- After --")
success_failure_summary(final_labels)
print()

# Fixed seed: without it every run shuffles differently, so the splits and
# all metrics reported further down could not be reproduced.
SPLIT_SEED = 42

# 80% train; the remaining 20% is halved into test and validation
X_train, X_holdout, y_train, y_holdout = train_test_split(
    final_features, final_labels, test_size=0.20, random_state=SPLIT_SEED
)
X_test, X_validation, y_test, y_validation = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED
)

# Renumber the rows of every split from zero
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
X_validation = X_validation.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
y_validation = y_validation.reset_index(drop=True)

# Share of the balanced data and class balance of each split
split_report = (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
    ("Validation", X_validation, y_validation),
)
for split_name, split_features, split_labels in split_report:
    print(f" -- {split_name} --")
    print(f"Percent of data {round(split_features.shape[0] / final_features.shape[0] * 100, 3)}%")
    success_failure_summary(split_labels)
    print()

In [None]:
# Cell 10
# Lastly remove the id column from every feature and label split
final_X_train, final_X_test, final_X_validation = (
    split.drop(columns=['id']) for split in (X_train, X_test, X_validation)
)
final_y_train, final_y_test, final_y_validation = (
    split.drop(columns=['id']) for split in (y_train, y_test, y_validation)
)

In [None]:
# Cell 11
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class OurDataset(Dataset):
    """Dataset that pairs a feature matrix with its label vector."""

    def __init__(self, features, labels):
        # Both containers are stored as given; nothing is copied or converted
        self.labels = labels
        self.features = features

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features[idx, :], labels[idx])`` for sample ``idx``."""
        sample = self.features[idx, :]
        target = self.labels[idx]
        return sample, target

# Renumber the rows of every split from zero
final_X_train = final_X_train.reset_index(drop=True)
final_X_test = final_X_test.reset_index(drop=True)
final_X_validation = final_X_validation.reset_index(drop=True)
final_y_test = final_y_test.reset_index(drop=True)
final_y_train = final_y_train.reset_index(drop=True)
final_y_validation = final_y_validation.reset_index(drop=True)


def _as_float32_dataset(features, labels):
    """Wrap a feature frame and a label frame as an OurDataset of float32 arrays."""
    return OurDataset(
        features.values.astype(np.float32),
        labels.values.ravel().astype(np.float32),
    )


# Load up data into dataset
training_data = _as_float32_dataset(final_X_train, final_y_train)
test_data = _as_float32_dataset(final_X_test, final_y_test)
validation_data = _as_float32_dataset(final_X_validation, final_y_validation)
batch_size = 64

# One loader per split, all using the same batch size
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)

# Print shape and dtype of the first training batch only
for X, y in train_dataloader:
    print(f"Shape of X [N, F]: {X.shape} {X.dtype}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

In [None]:
# Cell 12
# Get cpu or gpu device for training.
import torch
from torch import nn
# Prefer CUDA, then Apple MPS, and fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# device = "cpu"
print(f"Using {device} device")

# Number of input features, i.e. the width of the first layer of both networks
feature_count = final_X_train.shape[1]

# Define model
class ComplexNeuralNetwork(nn.Module):
    """Deeper fully connected binary classifier.

    Seven linear layers with LeakyReLU between them, every hidden layer being
    ``3 * feature_count`` wide, followed by a sigmoid on the single output unit.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        hidden = 3 * feature_count

        # Input layer, five hidden-to-hidden layers, then the output unit.
        # The modules are created in the same order as they are applied.
        stack = [nn.Linear(feature_count, hidden), nn.LeakyReLU()]
        for _ in range(5):
            stack.extend([nn.Linear(hidden, hidden), nn.LeakyReLU()])
        stack.extend([nn.Linear(hidden, 1), nn.Sigmoid()])
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the sigmoid outputs (not raw logits) flattened into a 1-D tensor."""
        probabilities = self.layers(self.flatten(x))
        return torch.flatten(probabilities)
class NeuralNetwork(nn.Module):
    """Smaller fully connected binary classifier.

    Five linear layers with LeakyReLU between them: one ``feature_count`` wide
    layer, three ``2 * feature_count`` wide layers and a single output unit
    followed by a sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        hidden = 2 * feature_count

        # Consecutive pairs of these widths give the input/output size of each
        # hidden linear layer
        widths = [feature_count, feature_count, hidden, hidden, hidden]
        stack = []
        for in_width, out_width in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(in_width, out_width))
            stack.append(nn.LeakyReLU())
        stack.append(nn.Linear(hidden, 1))
        stack.append(nn.Sigmoid())
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the sigmoid outputs (not raw logits) flattened into a 1-D tensor."""
        probabilities = self.layers(self.flatten(x))
        return torch.flatten(probabilities)

# Build the small network first, then the deep one, and print both architectures
simple_model = NeuralNetwork().to(device)
print("--- Simple ---")
print(simple_model)

complex_model = ComplexNeuralNetwork().to(device)
print("--- Complex ---")
print(complex_model)

# Binary cross-entropy on the sigmoid outputs, shared by both networks
loss_fn = nn.BCELoss()

# SGD with momentum for the small network, AdamW with a smaller step for the deep one
sgd_settings = {"lr": 1e-3, "weight_decay": 1e-4, "momentum": 0.9}
adamw_settings = {"lr": 1e-5, "weight_decay": 1e-4}
optimizer = torch.optim.SGD(simple_model.parameters(), **sgd_settings)
complex_optimizer = torch.optim.AdamW(complex_model.parameters(), **adamw_settings)
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimization pass of ``model`` over every batch of ``dataloader``.

    Batches are moved to the module-level ``device``. The loss of every
    100th batch (starting with the first) is printed together with the number
    of samples processed so far.
    """
    total_samples = len(dataloader.dataset)
    model.train()
    for batch_index, (features, targets) in enumerate(dataloader):
        features = features.to(device)
        targets = targets.to(device)

        # Forward pass and prediction error
        batch_loss = loss_fn(model(features), targets)

        # Back propagation
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if batch_index % 100 != 0:
            continue
        seen = (batch_index + 1) * len(features)
        print(f"loss: {batch_loss.item():>7f}  [{seen:>5d}/{total_samples:>5d}]")
def test(dataloader, model, loss_fn, name="Test"):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and average loss.

    The model is switched to eval mode and run without gradient tracking.
    A prediction counts as correct when its rounded output equals the label.
    The loss is averaged over batches, the accuracy over samples; ``name``
    prefixes the printed line.
    """
    sample_count = len(dataloader.dataset)
    batch_count = len(dataloader)
    model.eval()

    loss_total = 0
    hits = 0
    with torch.no_grad():
        for features, targets in dataloader:
            features = features.to(device)
            targets = targets.to(device)
            outputs = model(features)
            loss_total += loss_fn(outputs, targets).item()
            hits += (outputs.round() == targets).type(torch.float).sum().item()

    mean_loss = loss_total / batch_count
    accuracy = hits / sample_count
    print(f"{name} Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {mean_loss:>8f} \n")
epochs = 20

# (heading, network, its optimizer) in the order they are trained and reported
contenders = (
    ("Simple:", simple_model, optimizer),
    ("Complex:", complex_model, complex_optimizer),
)

# Each epoch trains a network on the train split and then scores it on the test split
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    for heading, candidate, candidate_optimizer in contenders:
        print(heading)
        train(train_dataloader, candidate, loss_fn, candidate_optimizer)
        test(test_dataloader, candidate, loss_fn)
print("Done!")

# Final score of both networks on the validation split
for heading, candidate, candidate_optimizer in contenders:
    print(heading)
    test(validation_dataloader, candidate, loss_fn, name="Validation")

In [None]:
# Cell 13
def _report_nn_on_validation(model, title):
    """Score ``model`` on the validation split and show its report.

    Prints the shape of the predictions, ``title`` and the classification
    report, then plots the confusion matrix normalized by the total number of
    samples.

    Returns:
        The rounded (0/1) predictions as a NumPy array.
    """
    model.eval()
    inputs = torch.tensor(final_X_validation.values.astype(np.float32)).to(device)
    # Inference only: without no_grad the autograd graph for the whole
    # validation split would be built and kept in memory for nothing.
    with torch.no_grad():
        predictions = model(inputs).round().to('cpu').numpy()

    expected = final_y_validation.values.ravel()
    print(predictions.shape)
    print(title)
    print(classification_report(expected, predictions))

    matrix = confusion_matrix(expected, predictions)
    ConfusionMatrixDisplay(matrix / matrix.sum()).plot()
    plt.show()
    return predictions


predict_y_validation = _report_nn_on_validation(simple_model, " --- Simple NN Report ---")
predict_y_validation = _report_nn_on_validation(complex_model, " --- Complex NN Report ---")

In [None]:
# Cell 14
class RandomModel:
    """Baseline that ignores the content of its input and guesses at random."""

    def __init__(self) -> None:
        # Nothing to configure
        pass

    def predict(self, values):
        """Return a ``(len(values), 1)`` array of 0.0/1.0 drawn from NumPy's global random state."""
        sample_count = values.shape[0]
        draws = np.random.random((sample_count, 1))
        return draws.round()

    def predict_proba(self, values):
        """Return a ``(len(values), 2)`` array in which every entry is 0.5."""
        return np.full((values.shape[0], 2), 0.5)

random_model = RandomModel()

# predict() returns an (n, 1) column; flatten it so the predictions are a 1-D
# vector like the labels and like the predictions of the other models.
predict_y_validation = random_model.predict(final_X_validation).ravel()

# The shape used to be printed twice in a row; once is enough
print(predict_y_validation.shape)
print(" --- Random Report ---")
print(classification_report(final_y_validation.values.ravel(), predict_y_validation))

# Confusion matrix as a fraction of all validation samples
m = confusion_matrix(final_y_validation.values.ravel(), predict_y_validation)
m = m / m.sum()
ConfusionMatrixDisplay(m).plot()
plt.show()


In [None]:
# Cell 15
# Two KNN classifiers that differ only in how the neighbours are weighted
knn_neighbors = 518
knn_train_X = final_X_train.values
knn_train_y = final_y_train.values.ravel()

uniform_knn = KNeighborsClassifier(n_neighbors=knn_neighbors, weights='uniform', metric='l1')
uniform_knn.fit(knn_train_X, knn_train_y)

distance_knn = KNeighborsClassifier(n_neighbors=knn_neighbors, weights='distance', metric='l1')
distance_knn.fit(knn_train_X, knn_train_y)

# Report on the validation split: uniform weighting first, distance weighting second
for weighting, knn_model in (('uniform', uniform_knn), ('distance', distance_knn)):
    predict_y_validation = knn_model.predict(final_X_validation.values)
    print(predict_y_validation.shape)
    print(f" --- ({weighting}) KNN Report ---")
    print(classification_report(final_y_validation.values.ravel(), predict_y_validation))

    # Confusion matrix as a fraction of all validation samples
    m = confusion_matrix(final_y_validation.values.ravel(), predict_y_validation)
    m = m / m.sum()
    ConfusionMatrixDisplay(m).plot()
    plt.show()


In [None]:
# Cell 16
import lime
from lime import lime_tabular

# LIME explainer built from the training features; class 0 is shown as
# 'failed' and class 1 as 'successful'
lime_training_data = np.array(final_X_train)
lime_class_names = ['failed', 'successful']
explainer = lime_tabular.LimeTabularExplainer(
    training_data=lime_training_data,
    mode='classification',
    class_names=lime_class_names,
    feature_names=final_X_train.columns,
)

In [None]:
# Cell 17
# Find one validation row per class that all four models classify correctly.
# skip_zero / skip_one let you pass over that many matching rows first.
y_zero_index = None
y_one_index = None
skip_zero = 0
skip_one = 0
for index, y in enumerate(final_y_validation.values):
    y_expected = y[0]

    # Single row as a (1, n_features) array
    x = np.array([final_X_validation.iloc[index]])

    simple_nn_y = simple_model(torch.tensor(x.astype(np.float32)).to(device))
    simple_nn_y = int(simple_nn_y.round().to('cpu').detach().numpy()[0])
    complex_nn_y = complex_model(torch.tensor(x.astype(np.float32)).to(device))
    complex_nn_y = int(complex_nn_y.round().to('cpu').detach().numpy()[0])
    uniform_knn_y = uniform_knn.predict(x)[0]
    distance_knn_y = distance_knn.predict(x)[0]
    values = [simple_nn_y, complex_nn_y, uniform_knn_y, distance_knn_y]

    # Rows on which at least one model is wrong are of no interest
    if any(value != y_expected for value in values):
        continue

    if y_expected == 0 and y_zero_index is None:
        if skip_zero > 0:
            skip_zero -= 1
        else:
            print(index, y_expected, values)
            y_zero_index = index
    elif y_expected == 1 and y_one_index is None:
        if skip_one > 0:
            skip_one -= 1
        else:
            print(index, y_expected, values)
            y_one_index = index

    # Stop as soon as an example of each class has been found
    if y_one_index is not None and y_zero_index is not None:
        break
assert y_zero_index is not None
assert y_one_index is not None

def _nn_failed_successful_proba(model, x):
    """Turn the sigmoid output of ``model`` into a two-column probability array.

    Args:
        model: Network whose forward pass returns one value per row of ``x``.
        x: 2-D NumPy array of feature rows.

    Returns:
        Array of shape ``(len(x), 2)``: column 0 is ``1 - output`` (failed),
        column 1 is the output itself (successful).
    """
    inputs = torch.tensor(x.astype(np.float32)).to(device)
    # Inference only: LIME sends thousands of perturbed rows per explanation,
    # so no autograd graph should be built for them.
    with torch.no_grad():
        success = model(inputs).to('cpu').numpy()
    # Use the model output directly for the second column instead of
    # recomputing it as 1 - (1 - output), which only adds rounding error.
    return np.column_stack([1 - success, success])


def complex_func(x):
    """Class probabilities of ``complex_model`` in the ``predict_fn`` format LIME expects."""
    return _nn_failed_successful_proba(complex_model, x)


def simple_func(x):
    """Class probabilities of ``simple_model`` in the ``predict_fn`` format LIME expects."""
    return _nn_failed_successful_proba(simple_model, x)


In [None]:
# Cell 18
# Explain the agreed-upon class-0 example with every model, in this order:
# random baseline, uniform KNN, distance KNN, simple NN, complex NN
zero_predict_fns = (
    random_model.predict_proba,
    uniform_knn.predict_proba,
    distance_knn.predict_proba,
    simple_func,
    complex_func,
)
for predict_fn in zero_predict_fns:
    exp = explainer.explain_instance(
        data_row=np.array(final_X_validation.iloc[y_zero_index]),
        predict_fn=predict_fn,
        num_features=10,
        num_samples=10000
    )
    exp.show_in_notebook(show_table=True)

In [None]:
# Cell 19
# Explain the agreed-upon class-1 example with every model, in this order:
# random baseline, uniform KNN, distance KNN, simple NN, complex NN
one_predict_fns = (
    random_model.predict_proba,
    uniform_knn.predict_proba,
    distance_knn.predict_proba,
    simple_func,
    complex_func,
)
for predict_fn in one_predict_fns:
    exp = explainer.explain_instance(
        data_row=np.array(final_X_validation.iloc[y_one_index]),
        predict_fn=predict_fn,
        num_features=10,
        num_samples=10000
    )
    exp.show_in_notebook(show_table=True)

In [None]:
# Inject a CSS rule that sets the width of `.container` elements to 90%
from IPython.display import display, HTML
display(HTML("<style>.container { width: 90% !important; }</style>"))