# Imports

In [7]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

sns.set()
warnings.filterwarnings('ignore')

# Creating SVM Model

In [8]:
data = pd.read_csv('modelling_data.csv')
data

Unnamed: 0.1,Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,status
0,0,0.159229,-0.562613,-0.591805,-0.157964,0.659951,0.282487,-0.708394,1
1,1,0.025409,-0.545583,-0.628715,0.360402,-0.721383,-0.049150,0.534937,1
2,2,-1.181523,4.085948,1.818133,0.832832,1.918683,-0.973053,-0.373344,2
3,3,0.041926,-0.528871,-0.607936,0.369352,-0.656534,-0.026201,0.592552,1
4,4,-1.295015,4.279216,2.080705,1.216708,2.516421,-0.710619,-1.778471,2
...,...,...,...,...,...,...,...,...,...
43356,43356,0.309856,-0.169297,-0.140077,0.966977,-0.007443,0.463643,-0.465648,1
43357,43357,-1.322762,-0.004236,1.989456,-0.253900,-0.595255,0.384888,1.250554,1
43358,43358,5.909159,-2.173778,4.289523,0.962860,2.417385,0.277445,0.019121,1
43359,43359,0.142677,-0.183913,-0.318695,0.774432,-0.014608,0.327352,-0.660462,1


In [9]:
# removing index column
data.drop('Unnamed: 0',axis=1,inplace=True)
data

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,status
0,0.159229,-0.562613,-0.591805,-0.157964,0.659951,0.282487,-0.708394,1
1,0.025409,-0.545583,-0.628715,0.360402,-0.721383,-0.049150,0.534937,1
2,-1.181523,4.085948,1.818133,0.832832,1.918683,-0.973053,-0.373344,2
3,0.041926,-0.528871,-0.607936,0.369352,-0.656534,-0.026201,0.592552,1
4,-1.295015,4.279216,2.080705,1.216708,2.516421,-0.710619,-1.778471,2
...,...,...,...,...,...,...,...,...
43356,0.309856,-0.169297,-0.140077,0.966977,-0.007443,0.463643,-0.465648,1
43357,-1.322762,-0.004236,1.989456,-0.253900,-0.595255,0.384888,1.250554,1
43358,5.909159,-2.173778,4.289523,0.962860,2.417385,0.277445,0.019121,1
43359,0.142677,-0.183913,-0.318695,0.774432,-0.014608,0.327352,-0.660462,1


In [10]:
x = data.drop('status',axis=1)
y = data['status']

##### 1 -> operating
##### 2 -> acquired
##### 3 -> closed
##### 4 -> ipo

In [11]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

i will try different kernels and select the one with highest performance

In [12]:
from sklearn.svm import SVC

linear_model = SVC(kernel='linear',decision_function_shape="ovo")
linear_model.fit(x_train,y_train)

rbf_model = SVC(kernel='rbf',decision_function_shape="ovo")
rbf_model.fit(x_train,y_train)

poly_model = SVC(kernel="poly", degree=3,decision_function_shape="ovo")
poly_model.fit(x_train,y_train)

In [13]:
# predictions
linear_pred = linear_model.predict(x_test)
rbf_pred = rbf_model.predict(x_test)
poly_pred = poly_model.predict(x_test)

In [14]:
from sklearn.metrics import classification_report
linear_report = classification_report(y_test,linear_pred)
print(linear_report)

              precision    recall  f1-score   support

           1       0.99      1.00      1.00      7782
           2       0.89      0.92      0.90       566
           3       0.80      0.74      0.77       249
           4       0.00      0.00      0.00        76

    accuracy                           0.98      8673
   macro avg       0.67      0.66      0.67      8673
weighted avg       0.97      0.98      0.97      8673



In [15]:
rbf_report = classification_report(y_test,rbf_pred)
print(rbf_report)

              precision    recall  f1-score   support

           1       0.99      1.00      1.00      7782
           2       0.84      0.94      0.89       566
           3       0.83      0.61      0.70       249
           4       0.73      0.11      0.18        76

    accuracy                           0.98      8673
   macro avg       0.85      0.66      0.69      8673
weighted avg       0.97      0.98      0.97      8673



In [16]:
poly_report = classification_report(y_test,poly_pred)
print(poly_report)

              precision    recall  f1-score   support

           1       0.99      1.00      1.00      7782
           2       0.91      0.94      0.93       566
           3       0.85      0.80      0.82       249
           4       0.88      0.09      0.17        76

    accuracy                           0.98      8673
   macro avg       0.91      0.71      0.73      8673
weighted avg       0.98      0.98      0.98      8673



In [17]:
from sklearn.metrics import confusion_matrix

linear_matrix = confusion_matrix(y_test,linear_pred)
print(linear_matrix)

[[7782    0    0    0]
 [   0  519   47    0]
 [   0   65  184    0]
 [  76    0    0    0]]


In [18]:
rbf_matrix = confusion_matrix(y_test,rbf_pred)
print(rbf_matrix)

[[7779    0    0    3]
 [   1  533   32    0]
 [   0   98  151    0]
 [  68    0    0    8]]


In [19]:
poly_matrix = confusion_matrix(y_test,poly_pred)
print(poly_matrix)

[[7781    0    0    1]
 [   0  531   35    0]
 [   0   50  199    0]
 [  69    0    0    7]]


the linear model has percesion of 0 in case of IPO !!

the best model among the three models is the polynomial one. So, we can further inhance it to achieve better recall.

# Balancing

In [20]:
# get the value counts of the target column
y.value_counts()

1    38986
2     2783
3     1183
4      409
Name: status, dtype: int64

we will use the value count of acquired to undersample operating and oversample closed and ipo

In [21]:
operating = data[data['status'] == 1]
acquired = data[data['status'] == 2]
closed = data[data['status'] == 3]
ipo = data[data['status'] == 4]

In [22]:
operating = operating.sample(acquired.shape[0],random_state=0)
operating.shape

(2783, 8)

In [23]:
closed = closed.sample(acquired.shape[0],replace=True,random_state=0)
closed.shape

(2783, 8)

In [24]:
ipo = ipo.sample(acquired.shape[0],replace=True,random_state=0)
ipo.shape

(2783, 8)

In [25]:
balanced_data = pd.concat([operating,acquired,ipo,closed],axis=0)
balanced_data

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,status
6894,-1.575368,-0.677618,0.541785,0.345049,-0.602037,0.257419,0.565867,1
29151,1.118219,1.825567,-0.359798,-0.878922,-1.028346,0.498862,0.700482,1
21491,-0.075359,-0.445215,-0.658759,0.522238,-0.876709,-0.032855,0.504502,1
30039,-0.247741,-0.270872,-0.795076,0.345761,0.123727,0.541835,-0.751444,1
2812,2.688831,0.461711,4.411604,-0.949799,0.299542,0.230913,0.495600,1
...,...,...,...,...,...,...,...,...
35307,-2.357082,1.340915,1.125432,0.122043,0.683153,-2.250059,1.385992,3
26916,-2.765246,1.196318,1.168855,0.438767,0.474660,-2.311518,1.257432,3
9540,-2.788241,1.383168,0.896831,0.195943,0.533606,-2.218173,1.243913,3
5122,-1.198625,0.986140,-1.435735,1.193200,0.655007,-2.178689,1.132531,3


In [26]:
balanced_data['status'].value_counts()

1    2783
2    2783
4    2783
3    2783
Name: status, dtype: int64

we didn't use RandomOverSampler or SMOTE to oversample the data because oversampling ipo from 409 to 38986 will cause overfitting

In [27]:
x = balanced_data.drop('status',axis=1)
y = balanced_data['status']

In [28]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [29]:
final_model = SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(x_train, y_train)

In [30]:
model_pred = final_model.predict(x_test)

In [31]:
model_report = classification_report(y_test,model_pred)
print(model_report)

              precision    recall  f1-score   support

           1       0.93      0.90      0.92       540
           2       0.90      0.88      0.89       567
           3       0.88      0.91      0.89       560
           4       0.91      0.93      0.92       560

    accuracy                           0.91      2227
   macro avg       0.91      0.91      0.91      2227
weighted avg       0.91      0.91      0.91      2227



In [32]:
model_matrix = confusion_matrix(y_test,model_pred)
print(model_matrix)

[[488   3   0  49]
 [  0 498  69   0]
 [  0  52 508   0]
 [ 38   0   0 522]]


the accuracy decreased as we balanced the data but the recall for ipo increased significantly. and the f1-score for all classes is above 0.85

In [33]:
#cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(final_model,x,y,cv=5,scoring='accuracy')
print(scores)
print(scores.mean())

[0.88774136 0.90480467 0.90655885 0.89937107 0.88409704]
0.8965145960443806
