# KICKSTARTER DATASET [HACKEREARTH]

## Loading Libraries and data

In [116]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
import xgboost as xg
from sklearn.grid_search import GridSearchCV

In [117]:
train = pd.read_csv('train.csv')

In [118]:
train.shape

(108129, 14)

In [119]:
train.describe()

Unnamed: 0,goal,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
count,108129.0,108129.0,108129.0,108129.0,108129.0,108129.0,108129.0
mean,36726.23,1380248000.0,1380153000.0,1374037000.0,1377299000.0,123.516661,0.319627
std,971902.7,42702220.0,42664020.0,42723100.0,42944210.0,1176.745162,0.466334
min,0.01,1241334000.0,1241334000.0,1240335000.0,1240603000.0,0.0,0.0
25%,2000.0,1346732000.0,1346695000.0,1340058000.0,1343917000.0,2.0,0.0
50%,5000.0,1393628000.0,1393567000.0,1384445000.0,1390870000.0,17.0,0.0
75%,13000.0,1415719000.0,1415548000.0,1409623000.0,1412807000.0,65.0,1.0
max,100000000.0,1433097000.0,1433097000.0,1432325000.0,1432658000.0,219382.0,1.0


#### As you can see from the table above, there are no missing values. There may be outliers though.

In [120]:
train.head(10)

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,1240600507,1240602723,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,USD,1242429000,1242432018,1240960224,1240975592,2,0
2,kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,USD,1243027560,1243027818,1242163613,1242164398,0,0
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,help-me-write-my-second-novel,False,US,USD,1243555740,1243556121,1240963795,1240966730,18,1
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,1243769880,1243770317,1241177914,1241180541,1,0
5,kkst1085176748,daily digest,I'm a fledgling videoblogger living in Brookly...,700.0,daily-digest,False,US,USD,1243815600,1243816219,1241050799,1241464468,14,0
6,kkst1468954715,iGoozex - Free iPhone app,I am an independent iPhone developer that beli...,250.0,igoozex-free-iphone-app,False,US,USD,1243872000,1243872028,1241725172,1241736308,2,0
7,kkst194050612,Drive A Faster Car 2.0,Drive A Faster Car (http://www.driveafastercar...,1000.0,drive-a-faster-car-20,False,US,USD,1244088000,1244088022,1241460541,1241470291,32,1
8,kkst708883590,"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...","Opening Friday, June 5 2009, on view through S...",5000.0,lostles-at-tinys-giant,False,US,USD,1244264400,1244264422,1241415164,1241480901,44,0
9,kkst890976740,Choose Your Own Adventure - A Robot Painting S...,This project is for a Choose Your Own Adventur...,3500.0,choose-your-own-adventure-a-robot-painting-series,False,US,USD,1244946540,1244946632,1242268157,1242273460,18,0


In [121]:
train.columns

Index([u'project_id', u'name', u'desc', u'goal', u'keywords',
       u'disable_communication', u'country', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at', u'backers_count',
       u'final_status'],
      dtype='object')

In [122]:
features = train[['goal','deadline']].values
labels = train['final_status'].values

#### Here, we used 'goal', and 'deadline' as features to predict the final status.This is just the initial phase.Later we will improve the features

In [123]:
features_train, features_test, labels_train,labels_test = train_test_split(features,labels,test_size = 0.2,random_state=2)

In [124]:
clf = RandomForestClassifier()

In [125]:
clf.fit(features_train,labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [126]:
pred = clf.predict(features_test)

In [127]:
accuracy = accuracy_score(pred,labels_test)

In [128]:
print(accuracy)

0.614491815407


#### Got a accuracy of 61% which is too low, anything above a 80% is considered a good model.So, let's work on it.

In [129]:
test = pd.read_csv('test.csv')

In [130]:
test.describe()

Unnamed: 0,goal,deadline,state_changed_at,created_at,launched_at
count,63465.0,63465.0,63465.0,63465.0,63465.0
mean,35323.72,1459009000.0,1458278000.0,1451771000.0,1456135000.0
std,1206678.0,16388500.0,15712110.0,19838270.0,16419210.0
min,1.0,1433117000.0,1428069000.0,1266343000.0,1427940000.0
25%,2000.0,1444522000.0,1444419000.0,1438098000.0,1441756000.0
50%,6000.0,1458415000.0,1458254000.0,1452243000.0,1455635000.0
75%,20000.0,1472587000.0,1470672000.0,1466366000.0,1469650000.0
max,100000000.0,1490916000.0,1490915000.0,1490228000.0,1490297000.0


In [131]:
test.head(10)

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at
0,kkst917493670,Bràthair.,"My first film, of many to come. Trying to purs...",7000.0,brathair,False,US,USD,1449619185,1449619185,1446002581,1446159585
1,kkst1664901914,THE SCREENWRITER,A young man that has earned his master's in sc...,35000.0,the-screenwriter,False,US,USD,1453435620,1453435620,1450297323,1450411620
2,kkst925125077,The Hornets Nest the Fairmont Heights Story,Film about a high school constructed for negro...,49500.0,the-hornets-nest-the-fairmont-heights-story,False,US,USD,1451780700,1451780700,1448581356,1448672128
3,kkst1427645275,BROTHERS Season 2 - Groundbreaking Transgender...,The acclaimed series about a group of transgen...,40000.0,brothers-season-2-groundbreaking-transgender-male,False,US,USD,1445021518,1445021530,1440966830,1442429518
4,kkst1714249266,Blackdom the movie,Blackdom's history offers a new narrative tha...,20000.0,blackdom-the-movie,False,US,USD,1462068840,1462068844,1455765276,1458334890
5,kkst994744324,Hero's Battle: The Movie produced by Fish4Him ...,Sexual immorality is Satan's weapon to wage wa...,10000.0,heros-battle-the-movie,False,US,USD,1461777994,1461777994,1458171626,1459185994
6,kkst366471810,Limbo Film Project - 213 lives of Julia,Film focuses on connection between social alie...,1000.0,limbo-film-project-213-lives-of-julia,False,IT,EUR,1450087423,1450087423,1446994619,1447495423
7,kkst1686645245,Traffic (A Short Film),Paris is hired by a jazz singer to kill an old...,300.0,traffic-a-short-film,False,GB,GBP,1456516800,1456516801,1450964837,1451340184
8,kkst1009612119,Modern Gangsters,new web series created by jonney terry,6000.0,modern-gangsters,False,US,USD,1444337940,1444337941,1441745957,1441750564
9,kkst774947236,KISS ME GOODBYE - A REFRESHING VOICE IN INDIE ...,A martyr faces execution at the hands of the S...,8000.0,kiss-me-goodbye-a-new-voice-in-indie-filmmaking,False,US,USD,1444144222,1444144223,1438886415,1441120222


In [132]:
train.columns

Index([u'project_id', u'name', u'desc', u'goal', u'keywords',
       u'disable_communication', u'country', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at', u'backers_count',
       u'final_status'],
      dtype='object')

In [133]:
test.columns

Index([u'project_id', u'name', u'desc', u'goal', u'keywords',
       u'disable_communication', u'country', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at'],
      dtype='object')

In [134]:
new_data = train.copy()

#### Making a copy of train which will be the cleaned data i.e will have better columns to act as features.

In [135]:
new_data['currency'].unique()

array(['USD', 'GBP', 'CAD', 'AUD', 'NZD', 'EUR', 'SEK', 'NOK', 'DKK'], dtype=object)

In [136]:
new_data['currency'][new_data['currency']=='USD'] = 1
new_data['currency'][new_data['currency']=='GBP'] = 1.551 
new_data['currency'][new_data['currency']=='CAD'] = 0.8971
new_data['currency'][new_data['currency']=='AUD'] = 0.8755
new_data['currency'][new_data['currency']=='NZD'] = 0.7429
new_data['currency'][new_data['currency']=='EUR'] = 1.2826
new_data['currency'][new_data['currency']=='SEK'] = 0.1374
new_data['currency'][new_data['currency']=='NOK'] = 0.1546
new_data['currency'][new_data['currency']=='DKK'] = 0.1722

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [137]:
new_data['currency'].head(10)

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: currency, dtype: object

In [138]:
train['currency'].head(10)

0    USD
1    USD
2    USD
3    USD
4    USD
5    USD
6    USD
7    USD
8    USD
9    USD
Name: currency, dtype: object

In [139]:
new_data['money'] = new_data['goal']*new_data['currency']

In [140]:
new_data[['goal','currency','money']][new_data['currency']==1.551].head(10)

Unnamed: 0,goal,currency,money
30408,300.0,1.551,465.3
30596,100.0,1.551,155.1
30651,1200.0,1.551,1861.2
30692,25000.0,1.551,38775.0
30735,500.0,1.551,775.5
30763,6000.0,1.551,9306.0
30796,3500.0,1.551,5428.5
30797,15000.0,1.551,23265.0
30822,7000.0,1.551,10857.0
30953,2000.0,1.551,3102.0


#### Now we have combined the goal and currency column, so there would be no differences ,all of them are now in USD. Now, we can drop the goal and currency column, we use the money column as a feature

In [141]:
new_data[['goal','currency','money']][new_data['currency']==0.8971].head(10)

Unnamed: 0,goal,currency,money
46213,51923.0,0.8971,46580.1
46483,1867.0,0.8971,1674.89
46496,3250.0,0.8971,2915.57
46564,2172.0,0.8971,1948.5
46825,15000.0,0.8971,13456.5
46904,10000.0,0.8971,8971.0
46905,7500.0,0.8971,6728.25
46906,3000.0,0.8971,2691.3
46908,12000.0,0.8971,10765.2
46909,9500.0,0.8971,8522.45


In [142]:
new_data.drop(['goal','currency'],axis=1,inplace=True)

In [143]:
new_data.columns

Index([u'project_id', u'name', u'desc', u'keywords', u'disable_communication',
       u'country', u'deadline', u'state_changed_at', u'created_at',
       u'launched_at', u'backers_count', u'final_status', u'money'],
      dtype='object')

In [144]:
new_data['disable_communication'].unique()

array([False,  True], dtype=bool)

#### Bool column can be directly used as a feature

#### The time given to fulfill the goal i.e the difference between the deadline and launched_at

In [145]:
new_data['time_given'] = new_data['deadline'] - new_data['launched_at']

In [146]:
new_data[['deadline','launched_at','time_given']].head()

Unnamed: 0,deadline,launched_at,time_given
0,1241333999,1240602723,731276
1,1242429000,1240975592,1453408
2,1243027560,1242164398,863162
3,1243555740,1240966730,2589010
4,1243769880,1241180541,2589339


In [147]:
new_data.describe()

Unnamed: 0,deadline,state_changed_at,created_at,launched_at,backers_count,final_status,time_given
count,108129.0,108129.0,108129.0,108129.0,108129.0,108129.0,108129.0
mean,1380248000.0,1380153000.0,1374037000.0,1377299000.0,123.516661,0.319627,2949493.0
std,42702220.0,42664020.0,42723100.0,42944210.0,1176.745162,0.466334,1120409.0
min,1241334000.0,1241334000.0,1240335000.0,1240603000.0,0.0,0.0,86400.0
25%,1346732000.0,1346695000.0,1340058000.0,1343917000.0,2.0,0.0,2592000.0
50%,1393628000.0,1393567000.0,1384445000.0,1390870000.0,17.0,0.0,2592000.0
75%,1415719000.0,1415548000.0,1409623000.0,1412807000.0,65.0,1.0,3184273.0
max,1433097000.0,1433097000.0,1432325000.0,1432658000.0,219382.0,1.0,7945200.0


#### Now we add another column time_start,the difference between created_at and launched_at, which depicts the time taken to launch the project after it was posted on the website

In [148]:
new_data['time_start'] = new_data['launched_at'] - new_data['created_at']

In [149]:
new_data.columns

Index([u'project_id', u'name', u'desc', u'keywords', u'disable_communication',
       u'country', u'deadline', u'state_changed_at', u'created_at',
       u'launched_at', u'backers_count', u'final_status', u'money',
       u'time_given', u'time_start'],
      dtype='object')

In [150]:
test.columns

Index([u'project_id', u'name', u'desc', u'goal', u'keywords',
       u'disable_communication', u'country', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at'],
      dtype='object')

#### The backers count will be a great feature to predict,however it isn't present in the test data. Hence there is no use to keep it in the train data too.Dropping the backers_count column.

In [151]:
new_data.drop(['backers_count'],inplace=True,axis=1)

In [152]:
new_data.columns

Index([u'project_id', u'name', u'desc', u'keywords', u'disable_communication',
       u'country', u'deadline', u'state_changed_at', u'created_at',
       u'launched_at', u'final_status', u'money', u'time_given',
       u'time_start'],
      dtype='object')

### Converting the country column to integer

In [153]:
new_data['country'].unique()

array(['US', 'GB', 'CA', 'AU', 'NZ', 'NL', 'SE', 'IE', 'NO', 'DK', 'DE'], dtype=object)

### Since the classifier only take integer values , we convert the country column to integer values, assigning countries to numbers:
#### US-1 GB-2 CA-3 AU-4 NZ-5 NL-6 SE-7 IE-8 NO-9 DK-10 DE-11

In [154]:
test['country'].unique()

array(['US', 'IT', 'GB', 'AU', 'NO', 'NL', 'SE', 'ES', 'DK', 'CA', 'DE',
       'IE', 'FR', 'NZ', 'CH', 'BE', 'AT', 'LU', 'HK', 'SG', 'MX'], dtype=object)

### Now we have got a problem, there are more countries in the test_data then the train_data.So, i don't know how it will classify.

In [155]:
new_data['country'][new_data['country']=='US']=1
new_data['country'][new_data['country']=='GB']=2
new_data['country'][new_data['country']=='CA']=3
new_data['country'][new_data['country']=='AU']=4
new_data['country'][new_data['country']=='NZ']=5
new_data['country'][new_data['country']=='NL']=6
new_data['country'][new_data['country']=='SE']=7
new_data['country'][new_data['country']=='IE']=8
new_data['country'][new_data['country']=='NO']=9
new_data['country'][new_data['country']=='DK']=10
new_data['country'][new_data['country']=='DE']=11

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [156]:
new_data['country'].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype=object)

### Let's also change the country column in the test data

In [157]:
test['country'].unique()

array(['US', 'IT', 'GB', 'AU', 'NO', 'NL', 'SE', 'ES', 'DK', 'CA', 'DE',
       'IE', 'FR', 'NZ', 'CH', 'BE', 'AT', 'LU', 'HK', 'SG', 'MX'], dtype=object)

In [158]:
test['country'][test['country']=='US']=1
test['country'][test['country']=='GB']=2
test['country'][test['country']=='CA']=3
test['country'][test['country']=='AU']=4
test['country'][test['country']=='NZ']=5
test['country'][test['country']=='NL']=6
test['country'][test['country']=='SE']=7
test['country'][test['country']=='IE']=8
test['country'][test['country']=='NO']=9
test['country'][test['country']=='DK']=10
test['country'][test['country']=='DE']=11
test['country'][test['country']=='IT']=12
test['country'][test['country']=='ES']=13
test['country'][test['country']=='FR']=14
test['country'][test['country']=='CH']=15
test['country'][test['country']=='BE']=16
test['country'][test['country']=='AT']=17
test['country'][test['country']=='LU']=18
test['country'][test['country']=='HK']=19
test['country'][test['country']=='SG']=20
test['country'][test['country']=='MX']=21

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [159]:
test['country'].unique()

array([1, 12, 2, 4, 9, 6, 7, 13, 10, 3, 11, 8, 14, 5, 15, 16, 17, 18, 19,
       20, 21], dtype=object)

## Let's predict now until further changes

In [160]:
new_data.columns

Index([u'project_id', u'name', u'desc', u'keywords', u'disable_communication',
       u'country', u'deadline', u'state_changed_at', u'created_at',
       u'launched_at', u'final_status', u'money', u'time_given',
       u'time_start'],
      dtype='object')

In [161]:
features = new_data[['disable_communication','country','launched_at','state_changed_at','money','time_given','time_start']].values

In [162]:
labels = new_data['final_status'].values

In [163]:
features_train, features_test, labels_train, labels_test = train_test_split(features,labels,test_size=0.2,random_state=2)

In [164]:
clf.fit(features_train,labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [165]:
pred = clf.predict(features_test)

In [166]:
accuracy = accuracy_score(pred,labels_test)
print(accuracy)

0.666882456303


## Feature Selection

In [167]:
rfe = RFE(clf,3)

In [168]:
rfe_fit = rfe.fit(features_train,labels_train)

In [169]:
print('Feature Ranking :{0}'.format(rfe_fit.ranking_))
print('Selected Features;{0}'.format(rfe_fit.support_))

Feature Ranking :[5 4 1 1 2 3 1]
Selected Features;[False False  True  True False False  True]


### So, according to rfe, the top 3 features for prediction is 'launched_at', 'state_changed_at','time_start'. Well, atleast the column 'time_start' came to use, or else all the work would have gone to waste. However, money should also be a important feature, so i will also consider it while predicting.

## Using XGBoost to predict and also finding the perfect parameters for the xgboostclassifier

In [170]:
features = new_data[['launched_at','state_changed_at','money','time_start']].values
labels = new_data['final_status'].values

In [171]:
features_train,features_test,labels_train,labels_test = train_test_split(features,labels,test_size=0.2,random_state=2)

In [172]:
cv_params = {'learning_rate':[0.1,0.01],'subsample':[0.7,0.8,0.9]}
ind_params = {'n_estimators': 1000,'random_state':0,'colsample_bytree':0.8,'max_depth':3,'min_child_weight':1}

In [173]:
optimized_XGB = GridSearchCV(xgb.XGBClassifier(**ind_params),cv_params,scoring='accuracy',cv=5,n_jobs=-1)

In [174]:
optimized_XGB.fit(features_train,labels_train)

KeyboardInterrupt: 

In [None]:
optimized_XGB.grid_scores_

### Highest accuracy in subsample: 0.9 and learning rate:0.01

In [None]:
cv_params = {'max_depth':[3,5,7],'min_child_weight':[1,3,5]}
ind_params = {'learning rate':0.01,'n_estimators':1000,'subsample':0.9,'colsample_bytree':0.8}

In [None]:
optimized_XGB = GridSearchCV(xgb.XGBClassifier(**ind_params),cv_params,scoring='accuracy',cv=5,n_jobs=-1)

In [None]:
optimized_XGB.fit(features_train,labels_train)

In [None]:
optimized_XGB.grid_scores_

### Best parameters: sub-sample:0.9,learning_rate =0.01 , max_depth:3, min_child_weight:3

### Now, we have used the money feature too, let's remove it and check the accuracy using the above parameters.

In [None]:
features_2 = new_data[['launched_at','state_changed_at','time_start','money']].values

In [None]:
features_train, features_test, labels_train, labels_test = train_test_split(features_2, labels, test_size=0.2, random_state=42)

In [None]:
params = {'sub_sample':0.9,'learning_rate':0.9,'max_depth':3,'min_child_weight':3}
clf_xgb = xgb.XGBClassifier(**params)

In [None]:
clf_xgb.fit(features_train,labels_train)

In [None]:
pred = clf_xgb.predict(features_test)

In [None]:
accuracy = accuracy_score(pred,labels_test)
print(accuracy)

## Cleaning the test data as i did with train data.

In [177]:
test.columns

Index([u'project_id', u'name', u'desc', u'goal', u'keywords',
       u'disable_communication', u'country', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at'],
      dtype='object')

In [178]:
test['currency'].unique()

array(['USD', 'EUR', 'GBP', 'AUD', 'NOK', 'SEK', 'DKK', 'CAD', 'NZD',
       'CHF', 'HKD', 'SGD', 'MXN'], dtype=object)

In [179]:
test['currency'][test['currency']=='USD'] = 1
test['currency'][test['currency']=='GBP'] = 1.551 
test['currency'][test['currency']=='CAD'] = 0.8971
test['currency'][test['currency']=='AUD'] = 0.8755
test['currency'][test['currency']=='NZD'] = 0.7429
test['currency'][test['currency']=='EUR'] = 1.2826
test['currency'][test['currency']=='SEK'] = 0.1374
test['currency'][test['currency']=='NOK'] = 0.15460
test['currency'][test['currency']=='DKK'] = 0.1722
test['currency'][test['currency']=='CHF'] = 1.0244
test['currency'][test['currency']=='HKD'] = 0.1288
test['currency'][test['currency']=='SGD'] = 0.7479
test['currency'][test['currency']=='MXN'] = 0.0745

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [180]:
test['currency'].unique()

array([1, 1.2826, 1.551, 0.8755, 0.1546, 0.1374, 0.1722, 0.8971, 0.7429,
       1.0244, 0.1288, 0.7479, 0.0745], dtype=object)

In [181]:
test['money'] = test['goal'] * test['currency']

In [182]:
test[['goal','currency','money']].head()

Unnamed: 0,goal,currency,money
0,7000.0,1,7000
1,35000.0,1,35000
2,49500.0,1,49500
3,40000.0,1,40000
4,20000.0,1,20000


In [183]:
test.columns

Index([u'project_id', u'name', u'desc', u'goal', u'keywords',
       u'disable_communication', u'country', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at', u'money'],
      dtype='object')

In [184]:
test.drop(['goal','currency'],inplace=True,axis=1)

In [185]:
test.columns

Index([u'project_id', u'name', u'desc', u'keywords', u'disable_communication',
       u'country', u'deadline', u'state_changed_at', u'created_at',
       u'launched_at', u'money'],
      dtype='object')

In [186]:
test['time_given'] = test['deadline'] - test['launched_at']

In [187]:
test['time_start'] = test['launched_at'] - test['created_at']

In [188]:
test.columns

Index([u'project_id', u'name', u'desc', u'keywords', u'disable_communication',
       u'country', u'deadline', u'state_changed_at', u'created_at',
       u'launched_at', u'money', u'time_given', u'time_start'],
      dtype='object')

In [189]:
new_data.columns

Index([u'project_id', u'name', u'desc', u'keywords', u'disable_communication',
       u'country', u'deadline', u'state_changed_at', u'created_at',
       u'launched_at', u'final_status', u'money', u'time_given',
       u'time_start'],
      dtype='object')

In [190]:
features_train = new_data[['disable_communication','country','created_at','money','time_given','time_start']].values

In [191]:
features_test = test[['disable_communication','country','created_at','money','time_given','time_start']].values

In [192]:
labels = new_data['final_status'].values

In [193]:
clf.fit(features_train,labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [194]:
pred = clf.predict(features_test)

In [195]:
project_id = np.array(test['project_id'])

In [196]:
solution = pd.DataFrame(pred,project_id,columns=['final_status'])

In [197]:
solution.to_csv('solution1.csv',index_label=['project_id'])

### The above solution got me a accuracy of 64% .Got to improve it.

### Using AdaBoostClassifier to predict

In [198]:
from sklearn.ensemble import AdaBoostClassifier

In [199]:
clf_ada = AdaBoostClassifier(n_estimators=100)

In [200]:
clf_ada.fit(features_train,labels)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [201]:
pred_ada = clf_ada.predict(features_test)

In [202]:
solution_ada = pd.DataFrame(pred_ada,project_id,columns=['final_status'])

In [203]:
solution_ada.to_csv('solution_ada.csv',index_label=['project_id'])

## XGBoost classifier

In [211]:
features_train = new_data[['disable_communication','country','time_given','money','launched_at','created_at','time_start']].values
features_test = test[['disable_communication','country','time_given','money','launched_at','created_at','time_start']].values
labels = new_data['final_status'].values

In [212]:
params = {'sub_sample':0.9,'learning_rate':0.01,'max_depth':3,'min_child_weight':3}
clf_xgb = xgb.XGBClassifier(**params)

In [213]:
clf_xgb.fit(features_train,labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=3, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, sub_sample=0.9, subsample=1)

In [214]:
pred = clf_xgb.predict(features_test)

In [215]:
project_id = np.array(test['project_id'])
solution_xgb = pd.DataFrame(pred,project_id,columns=['final_status'])

In [216]:
solution_xgb.to_csv('solution_xgb.csv',index_label=['project_id'])

### Well, all this xgboost upto no use, the accuracy is even less than that of the adaboost classifier(66.5) and xgboost(66%). I think so next i should improve the params of the adaboost classifier. And also use the words column to predict, which will make me learn better.