## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn import datasets
from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder() 
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## Loading Datasets for training and testing

In [2]:
dataset = pd.read_csv('./train.csv')
dtest=pd.read_csv('./test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Viewing first 5 lines of datasets  

In [3]:
dataset.head()

Unnamed: 0,ID,comp_name,website,op_status,domain,founded_on,hq_country_code,hq_state_code,hq_region,hq_city,...,last_funding_utc,successful_investment,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,0,Nanotecture,http://www.nanotecture.co.uk,closed,Nanotechnology,,GBR,M4,London,Southampton,...,12907,0,,,,,,,,
1,1,Sybari,http://www.sybari.com,acquired,Email|Security|Software,1995-01-01,USA,NY,Long Island,East Northport,...,11411,1,,,,,,,,
2,2,Futurederm,http://www.futurederm.com,operating,Beauty|Cosmetics|E-Commerce|Social Media,2009-06-07,USA,PA,Pittsburgh,Pittsburgh,...,16217,0,,,,,,,,
3,3,Mercora,http://mercora.com,closed,Curated Web,,USA,CA,SF Bay Area,Sunnyvale,...,12784,0,,,,,,,,
4,4,Datos IO,http://www.datos.io,operating,Big Data|Cloud Computing|Databases|Enterprise ...,2014-06-01,USA,CA,SF Bay Area,San Jose,...,16693,0,,,,,,,,


In [4]:
dtest.head()

Unnamed: 0,ID,comp_name,website,op_status,domain,founded_on,hq_country_code,hq_state_code,hq_region,hq_city,total_funding_usd,funding_rounds,first_funding_date,last_funding_date,num_investors,funding_duration,first_funding_utc,last_funding_utc,Unnamed: 18
0,0,MDLIVE,http://mdlive.com,operating,Health Care,2009-01-01,USA,FL,Ft. Lauderdale,Fort Lauderdale,73600000.0,2,2014-01-22,2015-06-24,5,518,16092,16610,
1,1,APX Labs,http://apx-labs.com,operating,Software,2010-01-01,USA,VA,"Washington, D.C.",Herndon,28612936.0,3,2014-04-02,2015-11-18,7,595,16162,16757,
2,2,9Cookies,http://www.9cookies.com/,acquired,Point of Sale|Restaurants,2012-01-01,DEU,16,Berlin,Berlin,,1,2013-04-18,2013-04-18,1,0,15813,15813,
3,3,Confluence Solar,http://www.confluencesolar.com,acquired,Clean Technology,,USA,MO,St. Louis,Hazelwood,12700000.0,1,2008-09-24,2008-09-24,3,0,14146,14146,
4,4,Dazo,http://dazo.in,operating,Mobile,2014-01-01,IND,19,Bangalore,Bangalore,,1,2015-04-01,2015-04-01,7,0,16526,16526,


## Cleaning of Datasets

In [5]:
#Droping unnecessary columns in train dataset
dataset = dataset.iloc[:,:-8]
dataset = dataset[['op_status','domain','total_funding_usd','funding_rounds','num_investors','funding_duration','successful_investment']]

In [6]:
#Droping unnecessary columns in test dataset
dtest = dtest.iloc[:,:-1]
dtest = dtest[['op_status','domain','total_funding_usd','funding_rounds','num_investors','funding_duration']]

In [7]:
#Checking null values in train dataset
dataset.isnull().any()

op_status                False
domain                    True
total_funding_usd         True
funding_rounds            True
num_investors            False
funding_duration         False
successful_investment    False
dtype: bool

In [8]:
#Checking null values in test dataset
dtest.isnull().any()

op_status            False
domain                True
total_funding_usd     True
funding_rounds       False
num_investors        False
funding_duration     False
dtype: bool

In [9]:
#Removal of unwanted rows from op_status from train dataset
for key,value in dataset['op_status'].iteritems():
    if (value == 'http://centers.fortodo.com') or (value.strip() == 'http//www.dreamlines.com.br') or (value == 'http://www.fortodo.com'):
        print(key)
        dataset = dataset.drop(key,axis = 0)

7652
17070
31258


In [10]:
#Changing value of rows from num_investors from test dataset
for key,value in dtest['num_investors'].iteritems():
    if (value == '2013-11-01'):
        dtest['num_investors'][2380] = 1
        
print(dtest['num_investors'][2380])

1


In [11]:
#Converting string to int in total_funding_usd column
dataset['total_funding_usd']=pd.to_numeric(dataset['total_funding_usd'])
dtest['total_funding_usd']=pd.to_numeric(dtest['total_funding_usd'])

In [12]:
#Filling null values with mean
dataset.fillna(dataset.mean(),inplace=True)
dtest.fillna(dataset.mean(),inplace=True)

In [13]:
#Filling null values with preceding values in domain column
dataset.fillna(method="ffill",inplace=True)
dtest.fillna(method="ffill",inplace=True)

In [14]:
#Checking Null values again in train dataset
dataset.isnull().any()

op_status                False
domain                   False
total_funding_usd        False
funding_rounds           False
num_investors            False
funding_duration         False
successful_investment    False
dtype: bool

In [15]:
#Checking Null values again in test dataset
dtest.isnull().any()

op_status            False
domain               False
total_funding_usd    False
funding_rounds       False
num_investors        False
funding_duration     False
dtype: bool

## Handling of non-numeric dataset

In [16]:
dataset['domain']=label_encoder.fit_transform(dataset['domain'])
dtest['domain']=label_encoder.fit_transform(dtest['domain'])

In [17]:
dataset['op_status']=label_encoder.fit_transform(dataset['op_status'])
dtest['op_status']=label_encoder.fit_transform(dtest['op_status'])

## Splitting of dataset into test and train

In [18]:
#Splitting of training dataset to x and y
x_train = dataset.iloc[:,0:6]
y_train = dataset.iloc[:,6]
x_test=dtest.iloc[:,0:6]

In [19]:
print(x_train.shape,y_train.shape)

(40210, 6) (40210,)


In [20]:
print(y_train)

0        0
1        1
2        0
3        0
4        0
        ..
40208    0
40209    0
40210    0
40211    0
40212    0
Name: successful_investment, Length: 40210, dtype: int64


In [21]:
#Splitting training dataset into train and test
X_train,X_test,Y_train,Y_test = train_test_split(x_train,y_train,test_size=0.3,random_state=0)

## Applying Gradient Boosting Classifier

In [22]:
model = GradientBoostingClassifier(n_estimators=80)
model.fit(X_train,Y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=80,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [23]:
Y_pred = model.predict(X_test)
result = model.predict(x_test)

In [24]:
#Checking accuracy of train dataset
print("Accuracy:",metrics.accuracy_score(Y_test,Y_pred))

Accuracy: 1.0


In [25]:
#Checking test dataset predictions
result=pd.DataFrame(result)
print(result)

      0
0     0
1     0
2     1
3     1
4     0
...  ..
4464  0
4465  0
4466  0
4467  0
4468  0

[4469 rows x 1 columns]


In [26]:
#Exporting to csv file
export_csv = result.to_csv ('./result.csv', index = True, header=True)