# Content:
## Formatting Data
## Models
## Results

---

# Formatting Data:

In [2]:
# Box 0
# All imports

import numpy as np
import pandas as pd
import datetime
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Box 1
# Load the data as is

data_frame = pd.read_csv('ks-projects-201801.csv')
data_frame.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [4]:
# Box 2
# Format the data to certain columns

labels = data_frame.values[:, 9]
features = data_frame.values[:, :]
#print(features[0])

#Clean Up
#OLD COLUMN ORDER: category, main_category, deadline, launched, usd_pledged_real, usd_gaol_real
features = np.delete(features, [0,1,4,6,8,9,11,12], 1) #remove certain rows

#swaps deadline and launch so launch is listed first
def Swap(arr, start_index, last_index):
    arr[:, [start_index, last_index]] = arr[:, [last_index, start_index]]

#NEW COLUMN ORDER: category, main_category, launched, deadline, usd_pledged_real, usd_gaol_real
Swap(features, 2, 3)


#the states we don't want
unwantedLabels = np.array(['canceled', 'live', 'suspended', 'undefined'])

#row index of the with states we don't want
rowIndexToDelete = np.zeros(labels.shape)
for i  in range(len(unwantedLabels)):
  rowIndexToDelete += labels == unwantedLabels[i]

#convert to bool array
rowIndexToDelete = rowIndexToDelete.astype(bool)


#delete those rows
labels = np.delete(labels, rowIndexToDelete, axis=0)
features = np.delete(features, rowIndexToDelete, axis=0)

print('array of labels: shape ' + str(np.shape(labels)))
print('array of feature matrix: shape ' + str(np.shape(features)))

# np.split(data_frame, [9], axis=1)[1].head()

array of labels: shape (331675,)
array of feature matrix: shape (331675, 7)


In [5]:
print(features[0])
x = datetime.datetime.strptime(features[0][2], "%Y-%m-%d %H:%M:%S")
y = datetime.datetime.strptime(features[0][3], "%Y-%m-%d")

print( (y-x).days)

['Poetry' 'Publishing' '2015-08-11 12:12:28' '2015-10-09' 0 0.0 1533.95]
58


In [6]:
#box 3

#***********************************
#Formating data to read time legibly
#***********************************

#COLUMN ORDER: category, main_category, launched, deadline, backers, usd_pledged_real, usd_gaol_real


for i in range(len(features)):
  #convert launched(string) to time in seconds
  features[i][2] = datetime.datetime.strptime(features[i][2], "%Y-%m-%d %H:%M:%S")
  #convert deadline(string) to time in seconds
  features[i][3] = datetime.datetime.strptime(features[i][3], "%Y-%m-%d")

#print(features[0][2])
#print(features[0][3])

In [7]:
#box 4

#******************************************************
#Hot One Encoding for category and Main category columns
#******************************************************


#COLUMN ORDER: sub_category, main_category, launched, deadline, backers, usd_pledged_real, usd_gaol_real

MAIN_CAT_SIZE =  np.unique(features[:,1]).size
SUB_CAT_SIZE = np.unique(features[:,0]).size

#new shape should be (331675, 179)
#Hot One Encoding for both sub_cat and main_cat FOR FEATURES
nf = np.zeros((len(features), 179), dtype=object)
print(nf.shape)
subCatUnique = np.unique(features[:,0])
mainCatUnique = np.unique(features[:,1])
for i in range(len(features)):
  subMatch = (subCatUnique == features[i][0]).astype(int)
  mainMatch = (mainCatUnique == features[i][1]).astype(int)
  x = np.insert(features[i],1,subMatch)
  y = np.insert(x,-5, mainMatch)
  y = y[1:]
  #print(y)
  nf[i] = np.append(y[:SUB_CAT_SIZE],y[SUB_CAT_SIZE+1:])

 

(331675, 179)


In [8]:
#box 5

#***************************
#Hot one encoding for labels
#***************************

uniqueLabels = np.unique(labels)
print(uniqueLabels)

mask = labels == uniqueLabels[1]

labels = mask.astype(int)


#0 == failed
#1 == successful

['failed' 'successful']


In [9]:
#box 6

#*********************************
#Maintains an old copy of features
#*********************************

oldFeatures = features;
features = nf

In [10]:
#box 7 

#*************************************************************************
#Replace Launched and Deadline with Duration (Added to the end of the row)
#*************************************************************************

nf = np.zeros( (len(features), 178), dtype=object)
print(nf.shape)
for i in range(len(features)):
    launched = features[i][-5]
    deadline = features[i][-4]
    duration = (deadline - launched).days
    x = np.copy(features[i])
    x = np.delete(x, [-4, -5])
    #print(x)
    nf[i] = np.append(x, duration)

#new shape should be (331675, 178)

(331675, 178)


In [11]:
#box 6

#*****************************************************
#Maintains an old copy of features with datetime in it
#*****************************************************
oldFeaturesTime = features
features = nf


In [1]:
#box 7

#***********************************************************
#Generates %80 training data set and %20 validation data set
#***********************************************************

np.random.seed(seed=1)
def MakeSets (features, labels):
  #20% of the data
  validAmount = len(features) * 0.2

  #list of all possible indicies inisde features
  indices = np.arange(0, len(features))

  #random choose indicies (20% of features)
  validIndicies = np.random.choice(indices, size=int(validAmount), replace=False)
  validIndicies = np.sort(validIndicies)

  #list of all possible indicies inisde features
  indices = np.arange(0, len(features))

  #random choose indicies (20% of features)
  validIndicies = np.random.choice(indices, size=int(validAmount), replace=False)

  #The Validation Set
  validSet = np.empty((int(validAmount), len(features[0])))
  validLabels = np.empty((int(validAmount)))

  #add the respective indicies for the validation set
  for i in range(len(validSet)):
      validSet[i] = np.copy(features[validIndicies[i]])
      validLabels[i] = np.copy(labels[validIndicies[i]])

  #remove those indicies from features
  trainingSet = np.delete(features, validIndicies, axis=0)
  trainingLabels = np.delete(labels, validIndicies);
  return (validSet, validLabels, trainingSet, trainingLabels)




NameError: name 'np' is not defined

(178,)
[0 1 0 0 1 0 0 0 0 0] vs [0. 1. 0. 0. 1. 0. 0. 0. 0. 0.]


0.9994271500716062

# Models
## insert all models here

### Feauture Selection:
### PCA:


## KNN

In [16]:
#box 8
validSet, validLabels, trainingSet, trainingLabels = MakeSets(features, labels)
print(test[5].shape)

knn = KNeighborsClassifier(n_neighbors=10, weights='uniform', p=2)
knn.fit(trainingSet, trainingLabels)
print(f"{knn.predict(validSet[0:10])} vs {validLabels[0:10]}")
knn.score(TvalidSet, TvalidLabels)

(178,)
[0 1 0 0 1 0 0 0 0 0] vs [0. 1. 0. 0. 1. 0. 0. 0. 0. 0.]


# Results