In [1]:
#Load packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
from xgboost import XGBClassifier
pd.set_option('display.max_columns', None)
data=pd.read_excel('./int303-big-data-analysis/startup_train.xlsx')
test_xlsx=pd.read_excel('./int303-big-data-analysis/startup_test_new.xlsx')
# Convert closed to 0 and acquired to 1
data['status'] = data.status.map({'acquired':1, 'closed':0})
ID_list=test_xlsx['ID']

In [2]:
data.duplicated().sum()
data.duplicated(subset=['name']).sum()
data=data.drop_duplicates(subset=['name'])
data.duplicated(subset=['name']).sum()
data=data.drop(['Unnamed: 0','name','Unnamed: 6','zip_code',
                'latitude','longitude','id','state_code','state_code.1','category_code','founded_at',
                'first_funding_at', 'last_funding_at','object_id','city'],axis=1)
data['age_first_milestone_year'] = data['age_first_milestone_year'].fillna(0)
data['age_last_milestone_year'] = data['age_last_milestone_year'].fillna(0)
data=data.drop(data[data.age_first_funding_year<0].index)
data=data.drop(data[data.age_last_funding_year<0].index)
data=data.drop(data[data.age_first_milestone_year<0].index)
data=data.drop(data[data.age_last_milestone_year<0].index)

In [3]:
# do the same operation for testing data
# Drop duplicated data
test_xlsx=test_xlsx.drop_duplicates(subset=['name'])
# Remove useless attributes
test_xlsx=test_xlsx.drop(['Unnamed: 0','name','Unnamed: 6','zip_code',
                'latitude','longitude','id','state_code','state_code.1','category_code','founded_at',
                'first_funding_at', 'last_funding_at','object_id','city','ID','status'],axis=1)
# Fill null values
test_xlsx['age_first_milestone_year'] = test_xlsx['age_first_milestone_year'].fillna(0)
test_xlsx['age_last_milestone_year'] = test_xlsx['age_last_milestone_year'].fillna(0)

In [4]:
#split the dataset into train and test sets
train_data=data.loc[:,data.columns!='status']
train_target=data.loc[:,'status']
X_train, X_test, y_train, y_test = train_test_split(train_data, train_target,test_size=0.1)

In [5]:
#Create KNN Classifier
knn = KNeighborsClassifier(n_neighbors=8) # Hyperparameter that can be changed
#Train the model using the training sets
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
#Import scikit-learn metrics module for accuracy calculation

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6233766233766234


In [24]:
# Random Forest
# import class, instantiate estimator, fit with training set
forest = RandomForestClassifier(n_estimators=100, max_depth=4, criterion="gini")
forest.fit(X_train, y_train)
# make predictions
y_pred = forest.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7922077922077922


In [12]:
# GridSearchCV for Adaboost
ab = AdaBoostClassifier()
params = {
    'n_estimators': np.linspace(5, 300, 10,dtype=int),
    'learning_rate': np.linspace(0.05,0.5,5,dtype=float)
}
cv = GridSearchCV(ab, params, cv = 5, n_jobs = -1)
cv.fit(X_train, y_train)
pd.DataFrame(cv.cv_results_).sort_values('mean_test_score', ascending = False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,0.150208,0.0031,0.011658,0.002643,0.1625,103,"{'learning_rate': 0.1625, 'n_estimators': 103}",0.827338,0.784173,0.798561,0.84058,0.775362,0.805203,0.024973,1
11,0.057955,0.000582,0.003739,1.8e-05,0.1625,37,"{'learning_rate': 0.1625, 'n_estimators': 37}",0.820144,0.791367,0.784173,0.826087,0.797101,0.803774,0.016423,2
9,0.440929,0.009686,0.035201,0.00368,0.05,300,"{'learning_rate': 0.05, 'n_estimators': 300}",0.820144,0.798561,0.791367,0.84058,0.76087,0.802304,0.026958,3
4,0.207894,0.009065,0.017909,0.003451,0.05,136,"{'learning_rate': 0.05, 'n_estimators': 136}",0.820144,0.791367,0.784173,0.833333,0.775362,0.800876,0.022114,4
5,0.263061,0.013836,0.021834,0.00605,0.05,168,"{'learning_rate': 0.05, 'n_estimators': 168}",0.81295,0.791367,0.791367,0.84058,0.768116,0.800876,0.024398,4


In [18]:
ab = AdaBoostClassifier(n_estimators=200, learning_rate=0.2)
ab.fit(X_train, y_train)
y_pred = ab.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8441558441558441


In [8]:
# XGBoost
xgc=XGBClassifier(learning_rate=0.2,n_estimators=50, max_depth=5,
                  objective= 'binary:logistic',colsample_bytree = 0.8,subsample=0.8,use_label_encoder=False) 
xgc.fit(X_train, y_train)
y_pred=xgc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7922077922077922


In [21]:
status_list=forest.predict(test_xlsx)
result=pd.DataFrame()
result['ID']=ID_list
result['status']=status_list
result['status'] = result.status.map({1:'acquired', 0:'closed'})
result.to_csv("subrandomforest.csv",index=False)

In [19]:
status_list=ab.predict(test_xlsx)
result=pd.DataFrame()
result['ID']=ID_list
result['status']=status_list
result['status'] = result.status.map({1:'acquired', 0:'closed'})
result.to_csv("subadaboost.csv",index=False)

In [11]:
status_list=xgc.predict(test_xlsx)
result=pd.DataFrame()
result['ID']=ID_list
result['status']=status_list
result['status'] = result.status.map({1:'acquired', 0:'closed'})
result.to_csv("subxgboost.csv",index=False)