In [1]:
# import main libraries
import pandas as pd
import numpy as np

# make pairplots feature vs state
import seaborn as sns
import matplotlib.pyplot as plt

# to evaluate the model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# import data from csv
df = pd.read_csv('data/kickstarter_projects.csv')


# import KNN classifier
from sklearn.neighbors import KNeighborsClassifier

# import Pipeline and basic preprocessing tools
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

#import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

class NameEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = None

    def fit(self, X, y=None):
        self.model = SentenceTransformer(self.model_name)
        return self

    def transform(self, X):
        # ✅ Robust handling of DataFrame, Series, or ndarray
        if isinstance(X, pd.DataFrame):
            # Use the first column (e.g., df[['Name']] → df['Name'])
            X = X.iloc[:, 0]
        elif isinstance(X, np.ndarray) and X.ndim == 2:
            X = X[:, 0]  # From 2D to 1D array

        X_list = X.tolist()  # Final input: list of strings
        embeddings = self.model.encode(X_list)
        return embeddings

In [3]:
#drop collums that are useless or cause data leakage
df = df.drop(columns=['ID', 'Pledged', 'Backers'])

df = df.query('State != "Live" and State != "Suspended" and State != "Canceled" and State != "Unknown"').reset_index(drop=True)



In [12]:
# i want to take a sample of 10% of the data, for testing
df_sample = df.sample(frac=0.01, random_state=42)
df_sample.reset_index(drop=True, inplace=True)

In [13]:
#TODO: export column names as features and target

features = df.columns.tolist()  
target = 'State'
features.remove(target)

num_features = (['Goal'])
name_features = (['Name'])
date_features = (['Launched', 'Deadline'])

cat_features = features.copy()
cat_features.remove(num_features[0])
cat_features.remove(name_features[0])
cat_features = [f for f in cat_features if f not in date_features]

In [14]:
print('features: ', features)
print('target: ', target)
print('num_features: ', num_features)
print('name_features: ', name_features)
print('date_features: ', date_features)
print('cat_features: ', cat_features)

features:  ['Name', 'Category', 'Subcategory', 'Country', 'Launched', 'Deadline', 'Goal']
target:  State
num_features:  ['Goal']
name_features:  ['Name']
date_features:  ['Launched', 'Deadline']
cat_features:  ['Category', 'Subcategory', 'Country']


In [15]:
#TODO: build pipeline numerical, name and categorical transformers'

num_pipeline = 'passthrough'
date_pipeline = 'passthrough'
name_pipeline = Pipeline([
    ('embed', NameEmbeddingTransformer())
])

cat_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [16]:
print('Num features:', num_features)
print('Date features:', date_features)
print('Name features:', name_features)
print('Cat features:', cat_features)

Num features: ['Goal']
Date features: ['Launched', 'Deadline']
Name features: ['Name']
Cat features: ['Category', 'Subcategory', 'Country']


In [20]:
#TODO: build preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
       # ('date', date_pipeline, date_features),
        ('name', name_pipeline, name_features),
        ('cat', cat_pipeline, cat_features)
    ],
    remainder='drop'
)

In [21]:
#TODO: Set up the pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsClassifier())  
])

In [22]:
# X and y and split the data
X = df_sample[features]
y = df_sample[target]
print('X_type:', type(X), 'y_type:', type(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train_type:', type(X_train), 'y_train_type:', type(y_train))
print('X_test_type:', type(X_test), 'y_test_type:', type(y_test))

X_type: <class 'pandas.core.frame.DataFrame'> y_type: <class 'pandas.core.series.Series'>
X_train_type: <class 'pandas.core.frame.DataFrame'> y_train_type: <class 'pandas.core.series.Series'>
X_test_type: <class 'pandas.core.frame.DataFrame'> y_test_type: <class 'pandas.core.series.Series'>


In [None]:
#TODO: Fit the pipeline
pipe.fit(X_train, y_train)

In [27]:
#TODO: Create a parameter grid for the KNN classifier
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9],
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean'],
    'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=1)

In [None]:
search.fit(X_train, y_train)
print("Best parameters found: ", search.best_params_)
print("Best score found: ", search.best_score_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [14]:
#print the evaluation metrics
y_pred = search.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))

Accuracy:  0.6209653092006033
Classification report: 
               precision    recall  f1-score   support

      Failed       0.67      0.72      0.69      3949
  Successful       0.54      0.47      0.50      2681

    accuracy                           0.62      6630
   macro avg       0.60      0.60      0.60      6630
weighted avg       0.61      0.62      0.62      6630

