# Data Import

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Location of the cleaned Kickstarter CSV export.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# only runs where this exact file exists.
data_path = "C:\\Users\\EAFle\\U3S4_BW\\GitHub_Repo\\Kickstarter\\data-modeling-1\\KickstarterCleanedv3.csv"

# Load the CSV, using its 'id' column as the row index.
raw_df = pd.read_csv(filepath_or_buffer=data_path, index_col="id")

# Exploratory Data Analysis

In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218361 entries, 1629235715 to 807310529
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         218361 non-null  int64  
 1   backers_count      218361 non-null  int64  
 2   blurb              218353 non-null  object 
 3   category           218361 non-null  object 
 4   country            218361 non-null  object 
 5   created_at         218361 non-null  object 
 6   deadline           218361 non-null  object 
 7   goal               218361 non-null  float64
 8   launched_at        218361 non-null  object 
 9   name               218361 non-null  object 
 10  pledged            218361 non-null  float64
 11  spotlight          218361 non-null  int64  
 12  staff_pick         218361 non-null  int64  
 13  state              218361 non-null  int64  
 14  state_changed_at   218361 non-null  object 
 15  usd_exchange_rate  218361 non-null  flo

In [4]:
from pandas_profiling import ProfileReport

In [5]:
# raw_df.profile_report()

# Dataframe Cleaning

In [6]:
# Applying the necessary data cleaning and feature engineering
# in one wrangle function.

def wrangle(df):
    """Clean the raw Kickstarter frame and add engineered features.

    The function works on a copy, so the frame passed in is never modified
    and the calling cell can be re-run safely.

    Steps, in order: rename the raw columns (by position) to reader-friendly
    names, title-case the category columns, parse the date columns, drop the
    ID/currency columns, keep only US campaigns (relabelled 'USA'), fill
    missing subcategories with the primary category, drop rows that still
    contain a missing value, then add per-backer / per-day ratio features and
    the time between campaign creation and launch.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame whose columns are in the same positional order as
        ``column_names`` below.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. Per-backer ratios are NaN for campaigns with
        zero backers and per-day ratios are NaN for zero-length campaigns
        (instead of +/-inf from a division by zero).

    Raises
    ------
    ValueError
        If ``df`` does not have exactly as many columns as ``column_names``.
    """
    # Defining new reader friendly column names (assigned by position)
    column_names = [
        'Kickstarter_id', 'Backers Count', 'Campaign Description', 'Primary Category',
        'Country', 'Campaign Created', 'Deadline', 'Campaign Goal', 'Campaign Start',
        'Product Name', 'Amount Pledged', 'Product Spotlight', 'Staff Pick', 'Status',
        'Goal Reached Date', 'USD Exchange Rate', 'USD Pledged', 'Description Length',
        'Goal in USD', 'Campaign Length', 'Subcategory',
    ]
    if len(df.columns) != len(column_names):
        raise ValueError(
            'wrangle expects {} columns, got {}'.format(len(column_names), len(df.columns))
        )

    # Work on a private copy so the caller's frame keeps its original
    # columns and rows.
    df = df.copy()
    df.columns = column_names

    # Capitalizing the category and subcategory data points for an easier read
    for col in ('Primary Category', 'Subcategory'):
        df[col] = df[col].str.title()

    # Converting date columns to datetime dtype
    date_columns = ['Campaign Created', 'Deadline', 'Campaign Start', 'Goal Reached Date']
    df[date_columns] = df[date_columns].apply(pd.to_datetime)

    # Dropping columns and rows not applicable to an American audience.
    # A boolean mask is used (rather than dropping by index label) so that
    # duplicated ids cannot remove US rows by accident.
    non_essential_columns = ['Kickstarter_id', 'USD Exchange Rate', 'USD Pledged', 'Goal in USD']
    df = df.drop(columns=non_essential_columns)
    df = df.loc[df['Country'] == 'US'].copy()

    # Correcting country name with ISO country code name
    df['Country'] = df['Country'].replace('US', 'USA')

    # Filling absent subcategory values with primary category values.
    # We don't want to discredit a product because it cannot qualify
    # for a granular subcategory.
    # The fill must happen BEFORE the cast to str: casting first turns the
    # missing values into the literal text 'nan', leaving nothing to fill.
    df['Subcategory'] = df['Subcategory'].fillna(df['Primary Category'])
    df[['Primary Category', 'Subcategory']] = df[['Primary Category', 'Subcategory']].astype(str)

    # Dropping rows with no campaign description (and any other row that
    # still has a missing value), as they may not be serious campaigns.
    # This data may also provide future data leakage indicating campaign failure.
    df = df.dropna(axis=0).copy()

    # Creating features to indicate goal and pledge amounts per day and per backer.
    # These features can assist campaign success measurements.
    # Zero denominators are masked to NaN so the ratios never become inf.
    backers = df['Backers Count'].where(df['Backers Count'] != 0)
    days = df['Campaign Length'].where(df['Campaign Length'] != 0)
    df['Goal Amount Per Backer'] = (df['Campaign Goal'] / backers).round()
    df['Pledge Amount Per Backer'] = (df['Amount Pledged'] / backers).round()
    df['Goal Amount Per Day'] = (df['Campaign Goal'] / days).round()
    df['Pledge Amount Per Day'] = (df['Amount Pledged'] / days).round()

    # Creating data to quantify the time from campaign creation to start
    df['Campaign Launch Length'] = df['Campaign Start'] - df['Campaign Created']

    return df

In [7]:
# Build the cleaned, feature-engineered frame that the following cells use.
df = wrangle(raw_df)

# Preview the first rows of the wrangled frame.
df.head()

Unnamed: 0_level_0,Backers Count,Campaign Description,Primary Category,Country,Campaign Created,Deadline,Campaign Goal,Campaign Start,Product Name,Amount Pledged,...,Status,Goal Reached Date,Description Length,Campaign Length,Subcategory,Goal Amount Per Backer,Pledge Amount Per Backer,Goal Amount Per Day,Pledge Amount Per Day,Campaign Launch Length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1629235715,63,A Steve Lafler graphic novel,Comics,USA,2020-08-27,2020-09-24,599.0,2020-09-01,1956 Book One: Sweet Sweet Little Ramona,1942.0,...,1,2020-09-24,5.0,23,Graphic Novels,10.0,31.0,26.0,84.0,5 days
1593905291,132,What happens when two drug-fueled lowlifes fin...,Comics,USA,2017-01-16,2017-03-16,2000.0,2017-02-14,Modern Animals,3097.0,...,1,2017-03-16,20.0,30,Graphic Novels,15.0,23.0,67.0,103.0,29 days
1341470613,6,A pillow meant for two.,Crafts,USA,2015-02-01,2015-03-05,500.0,2015-02-03,Couples Couch Pillow,211.0,...,0,2015-03-05,5.0,30,Diy,83.0,35.0,17.0,7.0,2 days
510157690,16,Professional conservation of the 1880's mural ...,Art,USA,2018-10-24,2019-04-24,17000.0,2019-03-10,"Downtown Mural Restoration (Ann Arbor, MI)",1368.0,...,0,2019-04-24,18.0,45,Painting,1062.0,86.0,378.0,30.0,137 days
147824964,44,"We are trying to raise $2,500 for an art proje...",Art,USA,2012-06-28,2013-04-05,2500.0,2013-02-04,The Trade Parade of Ecuador,2506.0,...,1,2013-04-05,14.0,60,Performance Art,57.0,57.0,42.0,42.0,221 days


In [8]:
# Summarise the wrangled frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148206 entries, 1629235715 to 807310529
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype          
---  ------                    --------------   -----          
 0   Backers Count             148206 non-null  int64          
 1   Campaign Description      148206 non-null  object         
 2   Primary Category          148206 non-null  object         
 3   Country                   148206 non-null  object         
 4   Campaign Created          148206 non-null  datetime64[ns] 
 5   Deadline                  148206 non-null  datetime64[ns] 
 6   Campaign Goal             148206 non-null  float64        
 7   Campaign Start            148206 non-null  datetime64[ns] 
 8   Product Name              148206 non-null  object         
 9   Amount Pledged            148206 non-null  float64        
 10  Product Spotlight         148206 non-null  int64          
 11  Staff Pick                148206 non-nul

In [40]:
# Count how often each product name occurs (most frequent first) to spot repeated names.
df['Product Name'].value_counts()

The Nest                                                      5
The Awakening                                                 5
Macbeth                                                       5
The Other Side                                                5
I AM                                                          4
                                                             ..
3rd Street Blackout                                           1
Emergence AS8 - The World's Most Innovative Speaker System    1
Future Skills Development                                     1
Tinker's Treasury at FantasyCon                               1
GADDMUSIC by Regine Louis                                     1
Name: Product Name, Length: 132840, dtype: int64

In [45]:
# Count how often each campaign description occurs (most frequent first) to spot repeated blurbs.
df['Campaign Description'].value_counts()

ALL-NEW SEXY BADGIRL characters from comic book INDIE legend Everette Hartsoe. 100% artwork in book                                     38
A beautiful natural Fine art nude book exemplifying the female form presented by female producer Nina Vain.                             28
Hard Enamel Pins                                                                                                                        20
The Decentralized Dance Party was founded on the belief that Partying is an art form that has the power to change the world.            17
Wearing masks to express our inner feelings and desires.                                                                                12
                                                                                                                                        ..
My first studio EP is almost complete - please help me finish the cd and share these songs - straight from my heart & soul.              1
An atheist performance arti

In [36]:
# Total number of rows in the wrangled frame.
print('Length of the dataframe:', len(df))

Length of the dataframe: 148206


In [39]:
# Number of distinct product names; compare with the row count to gauge duplicates.
print('Length of Product Name counts:', df['Product Name'].nunique())

Length of Product Name counts: 132840


In [46]:
# Number of distinct campaign descriptions; compare with the row count to gauge duplicates.
# The label now names the column that is actually being counted.
print('Length of Campaign Description counts:', len(df['Campaign Description'].value_counts()))

Length of Product Name counts: 132010


## Feature Engineering


# Data Visualization


In [9]:
# Bar chart of primary category, subcategories, campaign length https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.bar.html

# Whisker plot of campaign goal, amount pledged, backers count https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.boxplot.html

# Modeling


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, validation_curve, RandomizedSearchCV


In [11]:
# Establishing feature matrix and y vector.

# Column holding the label the model will predict.
target = 'Status'

# y vector: the label column on its own.
y = df[target]
# Feature matrix: every remaining column.
X = df.drop(columns=[target])

# Sanity check: both pieces must describe the same number of rows.
print(X.shape)
print(y.shape)
print("The matrix and vector are the same length:", (X.shape[0] == y.shape[0]))

(148206, 21)
(148206,)
The matrix and vector are the same length: True


# NLP Assessment

In [48]:
# Base
from collections import Counter

# SKLearn 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

In [None]:
# Download spaCy's 'en_core_web_md' model so the spacy.load call at the end of this cell can find it.
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which
# may not be the kernel's own environment — confirm before relying on it.
!python -m spacy download en_core_web_md
# NLP Libraries
import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer
import re
# Load the downloaded pipeline; `nlp` is the shared spaCy object for this notebook.
nlp = spacy.load('en_core_web_md')

In [None]:
# Install squarify into the environment of the running kernel.
# `%pip` targets the kernel's interpreter, whereas `!pip` uses whichever pip
# is first on PATH and can install into a different environment.
%pip install squarify
# Plotting
import squarify
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# Campaign blurbs as a standalone Series (indexed by campaign id) for the NLP steps.
description_txt = df['Campaign Description']

print(description_txt)

id
1629235715                         A Steve Lafler graphic novel
1593905291    What happens when two drug-fueled lowlifes fin...
1341470613                              A pillow meant for two.
510157690     Professional conservation of the 1880's mural ...
147824964     We are trying to raise $2,500 for an art proje...
                                    ...                        
726996410     The second issue of a surreal and eclectic com...
552263605     Coral Projects is an ecological contemporary a...
1590830103    "The Wanted Man"\n\nInspired by the classic so...
2093445204    A quaint bachelorette party in the open sea go...
807310529     This is the second book in the Analog Missions...
Name: Campaign Description, Length: 148206, dtype: object


In [21]:
# Product names as a standalone Series (indexed by campaign id) for the NLP steps.
product_name_txt = df['Product Name']

print(product_name_txt)

id
1629235715             1956 Book One: Sweet Sweet Little Ramona
1593905291                                       Modern Animals
1341470613                                 Couples Couch Pillow
510157690            Downtown Mural Restoration (Ann Arbor, MI)
147824964                           The Trade Parade of Ecuador
                                    ...                        
726996410                            Days Off #2 - An Anthology
552263605     Coral Projects: Underwater Artwork for Corals ...
1590830103                            Wretched Realms: Issue #1
2093445204    THE DEEP DARK - A "Lovecraftian" tale of Aquat...
807310529     Analog Missions: Oberon A Unique Science Ficti...
Name: Product Name, Length: 148206, dtype: object
