In [None]:
# Cell 0
# All the needed imports

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt
from IPython.display import display


In [None]:
# Cell 1
# Read the raw CSV and keep only the rows that have a value in every column.
original_df = pd.read_csv('ks-projects-201801.csv').dropna()

# Preview the first few complete rows.
display(original_df.head())

In [None]:
# Cell 2
# Reorganize the data

# Drop the columns that are not carried forward: pledged, usd pledged,
# usd_pledged_real, backers, and the original goal column (usd_goal_real is
# kept and takes over the name "goal" in the rename below).
df = original_df.drop(
    columns=["pledged", "usd pledged", "usd_pledged_real", "goal", "backers"]
)

# Rename only the columns whose names actually change; every other column
# already carries its final name.
df = df.rename(columns={"ID": "id", "usd_goal_real": "goal"})

# Convert the launch and deadline to year-month-day timestamps.
# normalize() resets the time-of-day to midnight directly, instead of
# round-tripping every value through a Python date object and re-parsing it.
df['launched'] = pd.to_datetime(df['launched'], format="%Y-%m-%d %H:%M:%S").dt.normalize()
# The deadline format carries no time component, so it is already at midnight.
df['deadline'] = pd.to_datetime(df['deadline'], format="%Y-%m-%d")

# Number of whole days between launch and deadline.
# NOTE: the column name 'durration' is misspelled, but later cells reference
# it by this exact name, so it is kept as-is here.
df['durration'] = (df['deadline'] - df['launched']).dt.days

# Calendar features derived from the two dates.
df['start_month'] = df['launched'].dt.month_name()
df['end_month'] = df['deadline'].dt.month_name()
df['start_year'] = df['launched'].dt.year
df['end_year'] = df['deadline'].dt.year
display(df.head())

In [None]:
# Shape before filtering
print(df.shape)

# Rows to keep: state is either failed or successful ...
has_final_state = df['state'].isin(['failed', 'successful'])
# ... and country is not the literal string 'N,0"'
# (presumably a malformed entry in the raw file -- verify against the data).
has_valid_country = df['country'] != 'N,0"'

# Apply both filters and renumber the remaining rows from zero
df = df[has_final_state & has_valid_country].reset_index(drop=True)

# Shape after filtering
print(df.shape)

# Final data before one hot encoding everything
display(df.head())

In [None]:
def one_hot_encode_column(df: pd.DataFrame, name) -> pd.DataFrame:
    """Replace column ``name`` of ``df`` with one 0/1 indicator column per distinct value.

    The indicator columns are named ``f"{name}_{value}"`` and appear in the
    order in which the values first occur in the column. The distinct values
    are also printed.

    Args:
        df: Input frame; it is not modified.
        name: Label of the column to encode.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing every column of
        ``df`` except ``name``, followed by the ``uint8`` indicator columns.
    """
    # The existing column
    column = df[name]

    # Distinct values in order of first appearance
    unique = column.unique()
    print(unique)

    # For every row, the position of its value within `unique`. A single
    # vectorized lookup replaces the former Python-level loop over all rows.
    codes = pd.Index(unique).get_indexer(column)

    # Allocate the indicator matrix directly as uint8 (the dtype of the final
    # columns) rather than building a float64 matrix and casting it afterwards.
    row_count = len(column)
    encoded = np.zeros((row_count, len(unique)), dtype=np.uint8)
    encoded[np.arange(row_count), codes] = 1

    # Generate column names
    column_names = [f"{name}_{str(value)}" for value in unique]
    data_df = pd.DataFrame(encoded, columns=column_names)

    # Reset the index of the remaining columns so they line up row-for-row
    # with the freshly built indicator frame when concatenated side by side.
    new_df = df.drop(columns=[name]).reset_index(drop=True)
    return pd.concat([new_df, data_df], axis=1)

# One-hot encode each categorical column in turn; the order of this tuple
# determines the order in which the indicator columns are appended to df.
for categorical_column in (
    "category",
    "main_category",
    "currency",
    "country",
    "start_month",
    "end_month",
):
    df = one_hot_encode_column(df, categorical_column)


In [None]:
# Feature frame: everything except the project name, the target column and the
# two raw date columns. drop() already returns a new frame, so df is untouched.
# 'id' stays in for now; a later cell removes it after the train/test split.
df_features = df.drop(columns=['name', 'state', 'deadline', 'launched'])

# Standardize goal, duration and the two year columns to zero mean / unit variance.
# NOTE(review): these statistics are computed over all rows, before the
# train/test split performed in a later cell, so test rows influence the scaling.
for numeric_column in ['goal', 'durration', 'start_year', 'end_year']:
    values = df_features[numeric_column]
    df_features[numeric_column] = (values - values.mean()) / values.std()

# Labels: 1 for successful, 0 otherwise. Take an explicit copy before assigning
# so the write does not target a selection of df (avoids SettingWithCopyWarning).
df_labels = df[['id', 'state']].copy()
df_labels['state'] = (df['state'] == 'successful').astype(np.uint8)

In [None]:
# Preview the feature frame, then print the earliest and latest launch year
display(df_features.head())
launch_years = df['launched'].dt.year
print(launch_years.min())
print(launch_years.max())

In [None]:
def success_failure_summary(labels):
    """Print the number of successes and failures and each group's percentage share.

    Args:
        labels: DataFrame with a ``state`` column whose values are 1 for a
            success and 0 for a failure (the column is summed to count successes).

    Returns:
        None. The four summary lines are written to stdout.
    """
    total = labels.shape[0]

    # Convert the sum to a plain Python int: subtracting an unsigned NumPy
    # scalar from a Python int can promote to float on older NumPy versions,
    # which made the failure count print as e.g. "1.0".
    success_count = int(labels['state'].sum())
    fail_count = total - success_count

    def percent(count):
        # An empty frame has no meaningful share; report nan instead of
        # raising ZeroDivisionError.
        if total == 0:
            return float('nan')
        return round(count / total * 100, 2)

    print(f"# success: {success_count}")
    print(f"# fail: {fail_count}")
    print(f"Percent containin success: {percent(success_count)}")
    print(f"Percent containin fail: {percent(fail_count)}")
# Class balance of the full data set
success_failure_summary(df_labels)

# Fixed seed so the 80/20 split -- and every metric computed from it -- is
# identical on each Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42

# Split features and labels together, then renumber each part from zero.
X_train, X_test, y_train, y_test = [
    part.reset_index(drop=True)
    for part in train_test_split(
        df_features, df_labels, test_size=0.20, random_state=SPLIT_SEED
    )
]
display(X_train.head())
display(y_train.head())

In [None]:
# Lastly Drop all ids
final_X_train = X_train.drop(columns=['id'])
final_X_test = X_test.drop(columns=['id'])
final_y_train = y_train.drop(columns=['id'])
final_y_test = y_test.drop(columns=['id'])

# Tunable settings for this cell
N_COMPONENTS = 50  # number of principal directions kept
N_NEIGHBORS = 15   # k for the KNN classifier

# PCA
# Center the training data and take the leading left singular vectors of the
# (features x samples) matrix as the projection basis G.
train_mean = np.mean(final_X_train, axis=0)
X = (final_X_train - train_mean).values
U, _, _ = np.linalg.svd(X.T, full_matrices=False)
G = U[:, :N_COMPONENTS]

# Project both sets onto G. The test set is centered with the TRAINING mean:
# centering it with its own mean would shift it relative to the training
# points and project the two sets from different origins.
final_X_test = np.dot((final_X_test - train_mean).values, G)
final_X_train = np.dot(X, G)

# PCA code
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS, weights='uniform', metric='l2')
knn.fit(final_X_train, final_y_train.values.ravel())
predict_y_test = knn.predict(final_X_test)
# Quick sanity check: first ten predictions next to the first ten true labels
print(f"{predict_y_test[0:10]} vs {final_y_test['state'].values[0:10]}")

# Non-PCA variant: skip the PCA section above (so final_X_train / final_X_test
# are still DataFrames) and fit / predict on final_X_train.values and
# final_X_test.values instead.


In [None]:
# Sizes of the full data set and of the two label sets
print(df.shape)
print(final_y_train.shape)
print(final_y_test.shape)

# Class balance of each split
for heading, split_labels in (
    (" --- Training Data ---", y_train),
    (" --- Testing Data ---", y_test),
):
    print(heading)
    success_failure_summary(split_labels)
    print()

# Per-class precision / recall / F1 of the KNN predictions on the test set
y_true = final_y_test.values.ravel()
print(" --- KNN Report ---")
print(classification_report(y_true, predict_y_test))

# Confusion matrix expressed as a fraction of all test samples
counts = confusion_matrix(y_true, predict_y_test)
fractions = counts / counts.sum()
ConfusionMatrixDisplay(fractions).plot()
plt.show()
None