# imports

In [53]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
import pickle

sns.set()
warnings.filterwarnings('ignore')

# SVM Pipeline

In [54]:
data = pd.read_csv('cleaned_data.csv')
data

Unnamed: 0.1,Unnamed: 0,status,founded_at,funding_rounds,funding_total_usd,milestones,relationships,lat,lng,advertising,...,ESP,FRA,GBR,IND,ISR,NLD,USA,other_country,isClosed,active_days
0,0,1,2007,2.0,4561781.0,2.0,2.0,30.427755,-9.598107,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,5110.0
1,1,1,2008,2.0,4561781.0,1.0,2.0,35.686975,-105.937799,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,4745.0
2,2,2,2007,1.0,5000000.0,3.0,14.0,37.386052,-122.083851,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0
3,3,1,2008,2.0,4561781.0,1.0,3.0,33.078655,-116.601964,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,4745.0
4,4,2,2008,1.0,4561781.0,4.0,9.0,37.441883,-122.143019,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43356,43356,1,2007,2.0,4561781.0,2.0,5.0,37.774929,-122.419415,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,5110.0
43357,43357,1,2007,1.0,750000.0,1.0,14.0,37.338208,-121.886329,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,5110.0
43358,43358,1,1959,2.0,4561781.0,3.0,44.0,38.882334,-77.171091,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,22630.0
43359,43359,1,2008,2.0,4561781.0,2.0,1.0,34.052234,-118.243685,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,4745.0


In [55]:
#dropping 'Unnamed: 0' column
data.drop('Unnamed: 0',axis=1,inplace=True)
data = data[['founded_at','funding_rounds','funding_total_usd','milestones','relationships','status']]
data

Unnamed: 0,founded_at,funding_rounds,funding_total_usd,milestones,relationships,status
0,2007,2.0,4561781.0,2.0,2.0,1
1,2008,2.0,4561781.0,1.0,2.0,1
2,2007,1.0,5000000.0,3.0,14.0,2
3,2008,2.0,4561781.0,1.0,3.0,1
4,2008,1.0,4561781.0,4.0,9.0,2
...,...,...,...,...,...,...
43356,2007,2.0,4561781.0,2.0,5.0,1
43357,2007,1.0,750000.0,1.0,14.0,1
43358,1959,2.0,4561781.0,3.0,44.0,1
43359,2008,2.0,4561781.0,2.0,1.0,1


In [56]:
# balancing the data, so that the model won't be biased
operating = data[data['status'] == 1]
acquired = data[data['status'] == 2]
closed = data[data['status'] == 3]
ipo = data[data['status'] == 4]

In [57]:
operating = operating.sample(acquired.shape[0],random_state=0)
closed = closed.sample(acquired.shape[0],replace=True,random_state=0)
ipo = ipo.sample(acquired.shape[0],replace=True,random_state=0)
balanced_data = pd.concat([operating,acquired,ipo,closed],axis=0)

In [58]:
x = balanced_data.copy()
y = x.pop('status')

In [59]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [60]:
SVM_pipeline = Pipeline(steps=[
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=5)),
    ('model',SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo'))
])

In [61]:
SVM_pipeline.fit(x_train,y_train)
preds = SVM_pipeline.predict(x_test)

In [62]:
cr = classification_report(y_test,preds)
print(cr)

              precision    recall  f1-score   support

           1       0.58      0.50      0.54       540
           2       0.51      0.44      0.47       567
           3       0.63      0.76      0.69       560
           4       0.76      0.82      0.79       560

    accuracy                           0.63      2227
   macro avg       0.62      0.63      0.62      2227
weighted avg       0.62      0.63      0.62      2227



In [63]:
scores = cross_val_score(SVM_pipeline,x,y,cv=5,scoring='accuracy')
print(scores)
print(scores.mean())

[0.6259542  0.62954648 0.63701707 0.61814915 0.58760108]
0.6196535938298696


In [64]:
pickle_out = open("SVM_pipeline.pkl", "wb")
pickle.dump(SVM_pipeline, pickle_out)
pickle_out.close()

# Naive Bayes Pipeline

In [65]:
naive_bayes_pipeline = Pipeline(steps=[
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=5)),
    ('min_max_scaler',MinMaxScaler()),
    ('model',naive_bayes.MultinomialNB())
])

In [66]:
naive_bayes_pipeline.fit(x_train,y_train)
preds = naive_bayes_pipeline.predict(x_test)

In [67]:
cr = classification_report(y_test,preds)
print(cr)

              precision    recall  f1-score   support

           1       0.39      0.66      0.49       540
           2       0.28      0.04      0.08       567
           3       0.55      0.63      0.59       560
           4       0.57      0.58      0.57       560

    accuracy                           0.48      2227
   macro avg       0.45      0.48      0.43      2227
weighted avg       0.45      0.48      0.43      2227



In [68]:
scores = cross_val_score(naive_bayes_pipeline,x,y,cv=5,scoring='accuracy')
print(scores)
print(scores.mean())

[0.48854962 0.48001796 0.50449236 0.49281222 0.475292  ]
0.4882328331015541


In [69]:
pickle_out = open("naive_bayes_pipeline.pkl", "wb")
pickle.dump(naive_bayes_pipeline, pickle_out)
pickle_out.close()