# Train Validation Test Set & Sklearn performance

**Tabel of Contents**
1. Load Data
2. Data Preporcessing
3. Train Validation & Test Set
4. Decision Tree Comparison
5. Random Forest Comparison

## 0. Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

# Self written Decision Tree & Random Forest
from project import DecisionTree, RandomForest

## 1. Load Data

In [2]:
data = pd.read_csv('Data/ks-projects-201801.csv')

## 2. Data Preprocessing

In [3]:
data.describe()

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real
count,378661.0,378661.0,378661.0,378661.0,374864.0,378661.0,378661.0
mean,1074731000.0,49080.79,9682.979,105.617476,7036.729,9058.924,45454.4
std,619086200.0,1183391.0,95636.01,907.185035,78639.75,90973.34,1152950.0
min,5971.0,0.01,0.0,0.0,0.0,0.0,0.01
25%,538263500.0,2000.0,30.0,2.0,16.98,31.0,2000.0
50%,1075276000.0,5200.0,620.0,12.0,394.72,624.33,5500.0
75%,1610149000.0,16000.0,4076.0,56.0,3034.09,4050.0,15500.0
max,2147476000.0,100000000.0,20338990.0,219382.0,20338990.0,20338990.0,166361400.0


In [4]:
data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [5]:
data.shape

(378661, 15)

In [6]:
data.dtypes

ID                    int64
name                 object
category             object
main_category        object
currency             object
deadline             object
goal                float64
launched             object
pledged             float64
state                object
backers               int64
country              object
usd pledged         float64
usd_pledged_real    float64
usd_goal_real       float64
dtype: object

In [7]:
data.drop(['ID', 'name', 'deadline', 'launched'], axis=1, inplace=True)

In [8]:
data.dropna(inplace=True)

In [9]:
data['state'].value_counts()

failed        197614
successful    133851
canceled       38757
live            2798
suspended       1844
Name: state, dtype: int64

In [10]:
data = data[(data['state'] == 'failed') | (data['state'] == 'successful')]
data.shape

(331465, 11)

In [11]:
data_random = shuffle(data, n_samples=20000, random_state=42) # random_state -> seed to sample same random data
data_random.head()

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
255238,Graphic Novels,Comics,USD,20000.0,8.0,failed,4,US,0.0,8.0,20000.0
152363,Shorts,Film & Video,MXN,28000.0,4300.0,failed,4,MX,26.5,227.55,1481.72
195873,Video Games,Games,USD,10000.0,3285.0,failed,136,US,214.0,3285.0,10000.0
232435,Product Design,Design,USD,30000.0,184.0,failed,2,US,184.0,184.0,30000.0
371834,Web,Technology,EUR,25000.0,25195.0,successful,47,DE,28165.22,27631.25,27417.39


In [12]:
data_random.shape

(20000, 11)

### Catigorize

In [13]:
lb_make = LabelEncoder()
data_random["state"] = lb_make.fit_transform(data_random["state"])
data_random["category"] = lb_make.fit_transform(data_random["category"])
data_random["main_category"] = lb_make.fit_transform(data_random["main_category"])
data_random["currency"] = lb_make.fit_transform(data_random["currency"])
data_random["country"] = lb_make.fit_transform(data_random["country"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [14]:
data_random.shape

(20000, 11)

In [15]:
data_random.head()

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
255238,66,1,13,20000.0,8.0,0,4,21,0.0,8.0,20000.0
152363,129,6,8,28000.0,4300.0,0,4,15,26.5,227.55,1481.72
195873,148,8,13,10000.0,3285.0,0,136,21,214.0,3285.0,10000.0
232435,113,4,13,30000.0,184.0,0,2,21,184.0,184.0,30000.0
371834,151,13,4,25000.0,25195.0,1,47,5,28165.22,27631.25,27417.39


In [16]:
y = data_random['state']
X = data_random.drop('state', axis=1)
X.shape, y.shape

((20000, 10), (20000,))

## 3. Train Validation Test Set

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [18]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((12800, 10), (12800,), (4000, 10), (4000,), (3200, 10), (3200,))

In [19]:
X_train = X_train.values
y_train = y_train.values

X_val = X_val.values
y_val = y_val.values

X_test = X_test.values
y_test = y_test.values

## 4. Decision Tree
### Data Fitting Scikit-Learn vs Self-programmed Decision Tree Runtime

In [20]:
# Scikit-Learn Decision Tree
scikitLearn_DT = DecisionTreeClassifier(criterion='gini', 
                                        max_depth=10)
%time scikitLearn_DT.fit(X_train, y_train)

# Our Decision Tree
%time our_DT = DecisionTree(X_train, y_train, depth=10)

CPU times: user 43.8 ms, sys: 2.84 ms, total: 46.7 ms
Wall time: 44.7 ms
CPU times: user 46 s, sys: 457 ms, total: 46.5 s
Wall time: 45.8 s


In [21]:
# Profiling on Our Decision Tree
%prun our_DT = DecisionTree(X_train, y_train, depth=10)

 

### Data Fitting Scikit-Learn vs Self-programmed Decision Tree Accuracy
**Validation**

In [22]:
# Scikit-Learn Decision Tree
print("Scikit-Learn Validataion Accuracy:",
      scikitLearn_DT.score(X_val, y_val))

# Our Decision Tree
print("Self-Programmed Validation Accuracy:",
      our_DT.score(X_val, y_val))

Scikit-Learn Validataion Accuracy: 0.9925
Self-Programmed Validation Accuracy: 0.992


**Training** 

To see if the accuracy is different from the validation dataset so we can conclude over or underfitting

In [33]:
# Scikit-Learn Decision Tree on Training Set
print("Scikit-Learn Validataion Accuracy:",
      scikitLearn_DT.score(X_train, y_train))

# Our Decision Tree on Training Set
print("Self-Programmed Validation Accuracy:",
      our_DT.score(X_train, y_train))

Scikit-Learn Validataion Accuracy: 0.9996875
Self-Programmed Validation Accuracy: 0.99890625


 **Final Testing**
 
 Since the accuracy is already high on the validation dataset, we directly test the accuracy on the test dataset.

In [23]:
# Scikit-Learn Decision Tree
print("Scikit-Learn Final Accuracy:",
      scikitLearn_DT.score(X_test, y_test))

# Our Decision Tree
print("Self-Programmed Final Accuracy:",
      our_DT.score(X_test, y_test))

Scikit-Learn Final Accuracy: 0.995625
Self-Programmed Final Accuracy: 0.995625


## 5. Random Forest
### Data Fitting Scikit-Learn vs Self-programmed Random Forest Runtime

In [24]:
# Scikit-Learn Random Forest
scikitLearn_RF = RandomForestClassifier(n_estimators=10, 
                                        criterion='gini', 
                                        max_depth=10)
%time scikitLearn_RF.fit(X_train, y_train)

# Our Random Forest
%time our_RF = RandomForest(X_train, y_train, 
                            tree_amount=10, 
                            max_depth=10)

CPU times: user 112 ms, sys: 3.89 ms, total: 116 ms
Wall time: 115 ms
CPU times: user 1min 22s, sys: 544 ms, total: 1min 22s
Wall time: 1min 21s


In [29]:
# Profiling on Our Random Forest
%prun our_RF = RandomForest(X_train, y_train, tree_amount=10, max_depth=10)

 

### Data Fitting Scikit-Learn vs Self-programmed Random Forest Accuracy
**Validation**

In [30]:
# Scikit-Learn Random Forest on Validation Set
print("Scikit-Learn Validataion Accuracy:",
      scikitLearn_RF.score(X_val, y_val))

# Our Random Forest on Validation Set
print("Self-Programmed Validation Accuracy:",
      our_RF.score(X_val, y_val))

Scikit-Learn Validataion Accuracy: 0.99675
Self-Programmed Validation Accuracy: 0.99225


**Training** 

To see if the accuracy is different from the validation dataset so we can conclude over or underfitting

In [31]:
# Scikit-Learn Random Forest on Training Set
print("Scikit-Learn Validataion Accuracy:",
      scikitLearn_RF.score(X_train, y_train))

# Our Random Forest on Training Set
print("Self-Programmed Validation Accuracy:",
      our_RF.score(X_train, y_train))

Scikit-Learn Validataion Accuracy: 0.998828125
Self-Programmed Validation Accuracy: 0.994375


**Final Testing**

Since the accuracy is already high on the validation dataset, we directly test the accuracy on the test dataset. 

In [35]:
# Scikit-Learn Random Forest on Test Set
print("Scikit-Learn Final Accuracy:",
      scikitLearn_RF.score(X_test, y_test))

# Our Random Forest on Test Set
print("Self-Programmed Final Accuracy:",
      our_RF.score(X_test, y_test))

Scikit-Learn Final Accuracy: 0.9959375
Self-Programmed Final Accuracy: 0.9953125
