# Data Import

In [3]:
import pandas as pd
import numpy as np
import datetime as dt

In [83]:
data_path = ("C:\\Users\\EAFle\\U3S4_BW\\GitHub_Repo\\Kickstarter\\data-modeling-1\\KickstarterCleanedv3.csv")

raw_df = pd.read_csv(data_path, index_col='id')

# Exploratory Data Analysis

In [3]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218361 entries, 1629235715 to 807310529
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         218361 non-null  int64  
 1   backers_count      218361 non-null  int64  
 2   blurb              218353 non-null  object 
 3   category           218361 non-null  object 
 4   country            218361 non-null  object 
 5   created_at         218361 non-null  object 
 6   deadline           218361 non-null  object 
 7   goal               218361 non-null  float64
 8   launched_at        218361 non-null  object 
 9   name               218361 non-null  object 
 10  pledged            218361 non-null  float64
 11  spotlight          218361 non-null  int64  
 12  staff_pick         218361 non-null  int64  
 13  state              218361 non-null  int64  
 14  state_changed_at   218361 non-null  object 
 15  usd_exchange_rate  218361 non-null  flo

In [4]:
from pandas_profiling import ProfileReport

In [5]:
# raw_df.profile_report()

# Dataframe Cleaning

In [84]:
# Applying the necessary data cleaning and featuring engineering
# in one wrangle function.

def wrangle(df):

    # Defining new reader friendly column names
    column_names = ['Kickstarter_id', 'Backers Count', 'Campaign Description', 'Primary Category', \
        'Country', 'Campaign Created', 'Deadline', 'Campaign Goal', 'Campaign Start', \
        'Product Name', 'Amount Pledged', 'Product Spotlight', 'Staff Pick', 'Status', \
        'Goal Reached Date', 'USD Exchange Rate', 'USD Pledged', 'Description Length', 'Goal in USD', \
        'Campaign Length', 'Subcategory']
    df.columns = column_names


    # Capitalizing the category and subcategory data points for an easier read
    df['Primary Category'] = df['Primary Category'].str.title()
    df['Subcategory'] = df['Subcategory'].str.title()


    # Converting date columns to datetime dtype
    date_columns = ['Campaign Created', 'Deadline', 'Campaign Start', 'Goal Reached Date']
    df[date_columns] = df[date_columns].apply(pd.to_datetime)


    # Dropping columns and rows not applicable to an American audience
    non_essential_columns = ['Kickstarter_id', 'USD Exchange Rate', 'USD Pledged', 'Goal in USD']
    df.drop(columns=non_essential_columns, axis=1, inplace=True)
    df.drop(df.loc[df['Country']!='US'].index, inplace=True)


    # Correcting country name with ISO country code name
    df['Country'] = df['Country'].replace('US', 'USA')


    # Filling absent subcategory values with primary category values
    # We don't want to discredit a product because it cannot qualify
    # For a granular subcategory
    df[['Primary Category', 'Subcategory']] = df[['Primary Category', 'Subcategory']].astype(str)
    df['Subcategory'].fillna(df['Primary Category'], inplace=True)


    # Dropping rows with no campaign description
    # As they may not be serious campaigns
    # This data may also provide future data leakage indicating campaign failure.
    df.dropna(axis=0, inplace=True)


    # Creating features to indicate goal and pledge amounts per day and per backer
    # These features can assist campaign success measurements
    df['Goal Amount Per Backer'] = round(df['Campaign Goal']/df['Backers Count'])
    df['Pledge Amount Per Backer'] = round(df['Amount Pledged']/df['Backers Count'])
    df['Goal Amount Per Day'] = round(df['Campaign Goal']/df['Campaign Length'])
    df['Pledge Amount Per Day'] = round(df['Amount Pledged']/df['Campaign Length'])


    # Creating data to quantify the time from campagin creation to start
    df['Campaign Launch Length'] = df['Campaign Start'].sub(df['Campaign Created'], axis=0)

    # Dropping duplicate rows at the subset level
    # Using two entries to double verify across both
    df.drop_duplicates(subset=['Product Name'], keep='first', inplace=True)
    df.drop_duplicates(subset=['Campaign Description'], keep='first', inplace=True)

    return df

In [85]:
df = wrangle(raw_df)

df.head()

Unnamed: 0_level_0,Backers Count,Campaign Description,Primary Category,Country,Campaign Created,Deadline,Campaign Goal,Campaign Start,Product Name,Amount Pledged,...,Status,Goal Reached Date,Description Length,Campaign Length,Subcategory,Goal Amount Per Backer,Pledge Amount Per Backer,Goal Amount Per Day,Pledge Amount Per Day,Campaign Launch Length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1629235715,63,A Steve Lafler graphic novel,Comics,USA,2020-08-27,2020-09-24,599.0,2020-09-01,1956 Book One: Sweet Sweet Little Ramona,1942.0,...,1,2020-09-24,5.0,23,Graphic Novels,10.0,31.0,26.0,84.0,5 days
1593905291,132,What happens when two drug-fueled lowlifes fin...,Comics,USA,2017-01-16,2017-03-16,2000.0,2017-02-14,Modern Animals,3097.0,...,1,2017-03-16,20.0,30,Graphic Novels,15.0,23.0,67.0,103.0,29 days
1341470613,6,A pillow meant for two.,Crafts,USA,2015-02-01,2015-03-05,500.0,2015-02-03,Couples Couch Pillow,211.0,...,0,2015-03-05,5.0,30,Diy,83.0,35.0,17.0,7.0,2 days
510157690,16,Professional conservation of the 1880's mural ...,Art,USA,2018-10-24,2019-04-24,17000.0,2019-03-10,"Downtown Mural Restoration (Ann Arbor, MI)",1368.0,...,0,2019-04-24,18.0,45,Painting,1062.0,86.0,378.0,30.0,137 days
147824964,44,"We are trying to raise $2,500 for an art proje...",Art,USA,2012-06-28,2013-04-05,2500.0,2013-02-04,The Trade Parade of Ecuador,2506.0,...,1,2013-04-05,14.0,60,Performance Art,57.0,57.0,42.0,42.0,221 days


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131714 entries, 1629235715 to 2093445204
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype          
---  ------                    --------------   -----          
 0   Backers Count             131714 non-null  int64          
 1   Campaign Description      131714 non-null  object         
 2   Primary Category          131714 non-null  object         
 3   Country                   131714 non-null  object         
 4   Campaign Created          131714 non-null  datetime64[ns] 
 5   Deadline                  131714 non-null  datetime64[ns] 
 6   Campaign Goal             131714 non-null  float64        
 7   Campaign Start            131714 non-null  datetime64[ns] 
 8   Product Name              131714 non-null  object         
 9   Amount Pledged            131714 non-null  float64        
 10  Product Spotlight         131714 non-null  int64          
 11  Staff Pick                131714 non-nu

In [23]:
df['Product Name'].value_counts()

REMASTER Neutral Milk Hotel's "Invent Yourself a Shortcake"    1
Santa Is Coming To Town!                                       1
Shining Some Light on the Dark                                 1
Embracing Change--Time For Naughty Girls To Move or Expand     1
Abe's Swing Ring--Change your ring in the blink of an eye!     1
                                                              ..
Mycaricom.org: Public Data for the Caribbean Community.        1
Because My History Matters                                     1
THE ORION EXPERIENCE: CHILDREN OF THE STARS                    1
Attack on Titan Hard Enamel Pins, Stickers & Mini Prints!      1
Designer Shoes YOU Customize on the Fly -Create 100+ Looks     1
Name: Product Name, Length: 131714, dtype: int64

In [24]:
df['Campaign Description'].value_counts()

INDOOR SKATE PARK! San Antonio needs this! Location secured, featuring skate park, pool table, lounge area, shop etc...                  1
She needs some help to pull together financing to finalize this powerful collection. Step up! Be part of shaping our art and history.    1
10 Years in the making. An extensively researched comic book documentary on the 1960s Underground Press!                                 1
1" Banana Fish Hard Enamel Pins!                                                                                                         1
A redesign of my Might of Mythology installation on the universality of imagination as it is relayed through myths.                      1
                                                                                                                                        ..
Breadfruit Chips is the tropical sensation with a great taste and nutritional goodness for your guilt -free snacking pleasures.          1
A collection of songs based

In [11]:
df_length = len(df)

print('Length of the dataframe:', df_length)

Length of the dataframe: 131714


In [12]:
product_counts_len = len(df['Product Name'].value_counts())

print('Length of Product Name counts:', product_counts_len)

Length of Product Name counts: 131714


In [13]:
description_counts_length = len(df['Campaign Description'].value_counts())

print('Length of Product Name counts:', description_counts_length)

Length of Product Name counts: 131714


In [14]:
print(df_length == product_counts_len)
print(df_length == description_counts_length)

True
True


# Data Visualization


# Modeling

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, validation_curve, RandomizedSearchCV


In [16]:
# Establishing feature matrix and y vector.

target = 'Status'

X = df.drop(columns=target)
y = df[target]

print(X.shape)
print(y.shape)
print("The matrix and vector are the same length:", (len(y) == len(X)))

(131714, 21)
(131714,)
The matrix and vector are the same length: True


# NLP Assessment

## NLP Library Imports

In [43]:
# Base
from collections import Counter

# SKLearn 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

In [42]:
#!python -m spacy download en_core_web_md
# NLP Libraries
import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer
import re
#nlp = spacy.load('en_core_web_md')

In [44]:
#!pip install squarify
# Plotting
import squarify
import matplotlib.pyplot as plt
import seaborn as sns

## NLP Data Wrangling

In [86]:
 def nlp_wrangle(df):

    non_text_cols = ['Backers Count', 'Primary Category', \
        'Country', 'Campaign Created', 'Deadline', 'Campaign Goal', 'Campaign Start', \
        'Amount Pledged', 'Product Spotlight', 'Staff Pick', \
        'Goal Reached Date', 'Description Length', 'Campaign Length', 'Subcategory', \
        'Goal Amount Per Backer', 'Pledge Amount Per Backer', 'Goal Amount Per Day', \
        'Pledge Amount Per Day', 'Campaign Launch Length']
    df.drop(columns=non_text_cols, axis=1, inplace=True)

    df['Campaign Description'] = df['Campaign Description'].values.astype(str)

    df['Product Name'] = df['Product Name'].values.astype(str)

    return df

In [87]:
txt_df = nlp_wrangle(df)

In [88]:
txt_df.head()

Unnamed: 0_level_0,Campaign Description,Product Name,Status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1629235715,A Steve Lafler graphic novel,1956 Book One: Sweet Sweet Little Ramona,1
1593905291,What happens when two drug-fueled lowlifes fin...,Modern Animals,1
1341470613,A pillow meant for two.,Couples Couch Pillow,0
510157690,Professional conservation of the 1880's mural ...,"Downtown Mural Restoration (Ann Arbor, MI)",0
147824964,"We are trying to raise $2,500 for an art proje...",The Trade Parade of Ecuador,1


## NLP EDA

In [81]:
txt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131714 entries, 1629235715 to 2093445204
Data columns (total 3 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Campaign Description  131714 non-null  object
 1   Product Name          131714 non-null  object
 2   Status                131714 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.0+ MB


In [29]:
max_Campaign_Description = txt_df['Campaign Description'].apply(len).max()
print('Max Campaign Description length:', max_Campaign_Description)
txt_df[txt_df['Campaign Description'].apply(len)==196]

Max Campaign Description length: 196


Unnamed: 0_level_0,Campaign Description,Product Name,Status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
933275417,An experimental dance film exploring identity ...,Fruition of FORMS OF IDENTIFICATION,1


In [30]:
min_Campaign_Description = txt_df['Campaign Description'].apply(len).min()
print('Min Campaign Description length:', min_Campaign_Description)
txt_df[txt_df['Campaign Description'].apply(len)==1]

Min Campaign Description length: 1


Unnamed: 0_level_0,Campaign Description,Product Name,Status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1711857183,.,Beacons To Guide Me Home - a new album by Gayl...,1
1813785118,1,a (Canceled),0
224219885,-,"""Reflections"" Composition Debut Album by Karen...",1
970987098,2,co (Canceled),0
1689759121,I,I (Canceled),0
1702555614,C,test campaign (Canceled),0
332892786,q,investor (Canceled),0
892070971,x,x,0


In [32]:
max_Product_Name = txt_df['Product Name'].apply(len).max()
print('Max Product Name length:', max_Product_Name)
txt_df[txt_df['Product Name'].apply(len)==96]

Max Product Name length: 96


Unnamed: 0_level_0,Campaign Description,Product Name,Status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
502572445,Create collaborative designs for a community M...,Design for Mosaic Sculpture Playground and Mos...,0


In [36]:
min_Product_Name = txt_df['Product Name'].apply(len).min()
print('Min Product Name length:', min_Product_Name)
txt_df[txt_df['Product Name'].apply(len)==1]

Min Product Name length: 1


Unnamed: 0_level_0,Campaign Description,Product Name,Status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
892070971,x,x,0


## NLP Feature Engineering

In [62]:
txt_df['Product Name Char Length'] = txt_df['Product Name'].apply(len)

txt_df.head()

Unnamed: 0_level_0,Campaign Description,Product Name,Status,Product Name Char Length,Description Char Length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1629235715,A Steve Lafler graphic novel,1956 Book One: Sweet Sweet Little Ramona,1,40,28
1593905291,What happens when two drug-fueled lowlifes fin...,Modern Animals,1,14,130
1341470613,A pillow meant for two.,Couples Couch Pillow,0,20,23
510157690,Professional conservation of the 1880's mural ...,"Downtown Mural Restoration (Ann Arbor, MI)",0,42,120
147824964,"We are trying to raise $2,500 for an art proje...",The Trade Parade of Ecuador,1,27,73


In [57]:
txt_df['Description Char Length'] = txt_df['Campaign Description'].apply(len)

txt_df.head()

Unnamed: 0_level_0,Campaign Description,Product Name,Status,Product Name Char Length,Description Char Length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1629235715,A Steve Lafler graphic novel,1956 Book One: Sweet Sweet Little Ramona,1,40,28
1593905291,What happens when two drug-fueled lowlifes fin...,Modern Animals,1,14,130
1341470613,A pillow meant for two.,Couples Couch Pillow,0,20,23
510157690,Professional conservation of the 1880's mural ...,"Downtown Mural Restoration (Ann Arbor, MI)",0,42,120
147824964,"We are trying to raise $2,500 for an art proje...",The Trade Parade of Ecuador,1,27,73


In [90]:
type(txt_df['Product Name'])

pandas.core.series.Series

In [94]:
type(df['Product Name'].iloc[0])

str

In [92]:
df['Product Name'].astype('str').dtypes

dtype('O')

In [93]:
df['Product Name'].astype(str).dtypes

dtype('O')

In [91]:
txt_df['Product Name'][0]

KeyError: 0

In [61]:
re.search("$", txt_df['Campaign Description'][0])

KeyError: 0

In [None]:
variable = (txt_df['Campaign Description'][0]).astype(str)  

answer = (re.findall(r'\w+', variable))

print(answer)

- Includes emojis
- Includes capitalization
- Character length, of each
- Character ratio
- Word quantity, of each
- Word ratio
- Includes $
- Includes #s
- Includes all caps