In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

# Reading csv and converted values in status column : acquired = 1 and else = 0
df = pd.read_csv('data.csv', converters={
                 'status': lambda x: int(x == 'acquired')})
df.head()


Unnamed: 0.1,Unnamed: 0,state_code,latitude,longitude,zip_code,id,city,Unnamed: 6,name,labels,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,1005,CA,42.35888,-71.05682,92101,c:6669,San Diego,,Bandsintown,1,...,c:6669,0,1,0,0,0,0,1.0,0,1
1,204,CA,37.238916,-121.973718,95032,c:16283,Los Gatos,,TriCipher,1,...,c:16283,1,0,0,1,1,1,4.75,1,1
2,1001,CA,32.901049,-117.192656,92121,c:65620,San Diego,San Diego CA 92121,Plixi,1,...,c:65620,0,0,1,0,0,0,4.0,1,1
3,738,CA,37.320309,-122.05004,95014,c:42668,Cupertino,Cupertino CA 95014,Solidcore Systems,1,...,c:42668,0,0,0,1,1,1,3.3333,1,1
4,1002,CA,37.779281,-122.419236,94105,c:65806,San Francisco,San Francisco CA 94105,Inhale Digital,0,...,c:65806,1,1,0,0,0,0,1.0,1,0


In [68]:
# how many null values are present
df.isnull().sum().sort_values(ascending=False)

closed_at                   588
Unnamed: 6                  493
age_last_milestone_year     152
age_first_milestone_year    152
state_code.1                  1
Unnamed: 0                    0
is_biotech                    0
is_software                   0
is_web                        0
is_mobile                     0
is_enterprise                 0
is_advertising                0
is_gamesvideo                 0
is_ecommerce                  0
is_othercategory              0
is_consulting                 0
is_otherstate                 0
object_id                     0
has_VC                        0
has_angel                     0
has_roundA                    0
has_roundB                    0
has_roundC                    0
has_roundD                    0
avg_participants              0
is_top500                     0
category_code                 0
is_NY                         0
is_TX                         0
first_funding_at              0
latitude                      0
longitud

In [69]:
# dropped columns
df = df.drop(['Unnamed: 0','Unnamed: 6', 'name', 'founded_at', 'closed_at','first_funding_at', 'last_funding_at', 'id', 'object_id', 'labels', 'latitude', 'longitude', 'zip_code', 'city', 'state_code.1', 'state_code', 'category_code'], axis=1)

# replaced NA/null values
df['age_first_milestone_year'] = df['age_first_milestone_year'].fillna(value=0)
df['age_last_milestone_year'] = df['age_last_milestone_year'].fillna(value=0)
df.sample()

Unnamed: 0,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,is_NY,...,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
309,3.8849,5.863,2.8356,6.1123,9,2,6700000,4,0,1,...,1,1,0,0,0,0,0,8.0,1,1


In [70]:
# extracted feature columns and target column
features = df.loc[:, df.columns != 'status']
print(features.columns)
target = df.loc[:, 'status']

# preprocessing - converted string(object) values into incremental value
feature_name = ['age_first_funding_year', 'relationships', 'funding_total_usd']
for feature in feature_name:
    features[feature] = MinMaxScaler().fit_transform(df[feature].values.reshape(len(features), 1))

# dataframe to array
features_array = features.values
target = target.values

# split the dataset into test and train  train:test=80:20
X_train, X_test, y_train, y_test  = train_test_split(features, target, train_size=0.8, random_state=42)

# print confusion matrix and calculate accuracy rate
def print_performance(pred,actual):
    actual_array = np.array(actual)
    unique_label = np.unique([actual, pred])
    cf = pd.DataFrame(
        confusion_matrix(actual_array, pred, labels=unique_label), 
        index=['Actual:{:}'.format(x) for x in unique_label], 
        columns=['Pred:{:}'.format(x) for x in unique_label]
    )
    print(cf)
    print('Percent Acquired correctly predicted: ', cf['Pred:1'][1]/(cf['Pred:0'][1] +cf['Pred:1'][1])*100)
    print('Percent Not Acquired correctly predicted: ', cf['Pred:0'][0]/(cf['Pred:0'][0] +cf['Pred:1'][0])*100)

Index(['age_first_funding_year', 'age_last_funding_year',
       'age_first_milestone_year', 'age_last_milestone_year', 'relationships',
       'funding_rounds', 'funding_total_usd', 'milestones', 'is_CA', 'is_NY',
       'is_MA', 'is_TX', 'is_otherstate', 'is_software', 'is_web', 'is_mobile',
       'is_enterprise', 'is_advertising', 'is_gamesvideo', 'is_ecommerce',
       'is_biotech', 'is_consulting', 'is_othercategory', 'has_VC',
       'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD',
       'avg_participants', 'is_top500'],
      dtype='object')


In [71]:
# Logistic Regression
logistic_clf = LogisticRegression(solver='liblinear')
logistic_clf.fit(X_train, y_train)
y_pred = logistic_clf.predict(X_test)
print_performance(y_pred, y_test)
print("Accuracy of the test set: ", accuracy_score(y_test, y_pred))

          Pred:0  Pred:1
Actual:0      39      34
Actual:1      13      99
Percent Acquired correctly predicted:  88.39285714285714
Percent Not Acquired correctly predicted:  53.42465753424658
Accuracy of the test set:  0.745945945945946


In [72]:
from sklearn.ensemble import AdaBoostClassifier

ab = AdaBoostClassifier(random_state=42)
ab.fit(X_train, y_train)
y_pred_ada = ab.predict(X_test)
print_performance(y_pred_ada, y_test)
print("Accuracy of the test set: ", accuracy_score(y_test, y_pred_ada))

          Pred:0  Pred:1
Actual:0      47      26
Actual:1       7     105
Percent Acquired correctly predicted:  93.75
Percent Not Acquired correctly predicted:  64.38356164383562
Accuracy of the test set:  0.8216216216216217


In [123]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=9, max_depth=6, criterion="gini")
forest.fit(X_train, y_train)
y_test_pred_forest = forest.predict(X_test)
print_performance(y_test_pred_forest, y_test)
print("Accuracy of the test set: ", accuracy_score(y_test, y_test_pred_forest))  

          Pred:0  Pred:1
Actual:0      36      37
Actual:1       7     105
Percent Acquired correctly predicted:  93.75
Percent Not Acquired correctly predicted:  49.31506849315068
Accuracy of the test set:  0.7621621621621621


In [124]:
from sklearn.svm import SVC

svc = SVC()

svc.fit(X_train, y_train)
y_test_pred_svc = svc.predict(X_test)
print_performance(y_test_pred_svc, y_test)
print("Accuracy of the test set: ", accuracy_score(y_test, y_test_pred_svc))  

          Pred:0  Pred:1
Actual:0      35      38
Actual:1      10     102
Percent Acquired correctly predicted:  91.07142857142857
Percent Not Acquired correctly predicted:  47.94520547945205
Accuracy of the test set:  0.7405405405405405
