In [1]:
import numpy as np 

import pandas as pd 

import os

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import time
import joblib

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read data/test.csv (path relative to the notebook's working directory) into `bio`.
bio = pd.read_csv('data/test.csv')

In [3]:
# (rows, columns) of the frame loaded from data/test.csv.
bio.shape

(2501, 1776)

In [4]:
# Read data/train.csv into `train`; later cells pull the 'Activity' column out of it as the target.
train = pd.read_csv('data/train.csv')

In [5]:
# (rows, columns) of the frame loaded from data/train.csv.
train.shape

(3751, 1777)

## Read Data

In [6]:
# Preview the first 10 rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of the wrapped plain-text dump that
# print() produces for a wide frame.
bio.head(10)

         D1        D2    D3   D4        D5        D6        D7        D8  \
0  0.366667  0.611765  0.05  0.0  0.110435  0.803973  0.106075  0.473965   
1  0.100000  0.758175  0.30  0.0  0.180128  0.621378  0.287144  0.503919   
2  0.100000  0.658812  0.10  0.0  0.243421  0.640959  0.312765  0.279784   
3  0.100000  0.655752  0.10  0.0  0.226978  0.776996  0.150657  0.336948   
4  0.000000  0.484851  0.00  0.0  0.561200  0.771463  0.244287  0.293096   
5  0.166667  0.599212  0.05  0.0  0.181274  0.815924  0.136971  0.092700   
6  0.000000  0.467971  0.05  0.0  0.165139  0.671504  0.296463  0.512215   
7  0.000000  0.719994  0.05  0.0  0.458446  0.632341  0.373370  0.615524   
8  0.000000  0.442633  0.05  0.0  0.113797  0.622830  0.374005  0.547289   
9  0.100000  0.560006  0.00  0.0  0.218233  0.641587  0.327974  0.520868   

         D9       D10  ...  D1767  D1768  D1769  D1770  D1771  D1772  D1773  \
0  0.835617  0.106452  ...      1      1      1      1      0      1      0   
1  0.

In [7]:
# Total number of null cells across every column of `bio`, followed by its column labels.
print(f"Missing Values {bio.isnull().sum().sum()}")
print(f"Column names {bio.columns}")

Missing Values 0
Column names Index(['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10',
       ...
       'D1767', 'D1768', 'D1769', 'D1770', 'D1771', 'D1772', 'D1773', 'D1774',
       'D1775', 'D1776'],
      dtype='object', length=1776)


## Are missing values imputed/handled appropriately? 

Yes — the check above found 0 missing values in the test set (`bio`), so no imputation is needed there.

## Support Vector Classifier

In [8]:
# Seed NumPy's legacy global RNG for reproducibility. `np` is already imported
# in the notebook's first cell, so it is not re-imported here.
np.random.seed(42)

In [9]:
# Separate the 'Activity' target from the predictor columns, then hold out 35% of
# the rows as a test split, stratified on the target so class proportions match.
y = train['Activity']
X = train.drop(columns='Activity')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    stratify=y,
    random_state=42,
)

In [10]:
# Standalone 500-component PCA: fitted on the training split only, then applied
# unchanged to the held-out split so no test information enters the projection.
# NOTE(review): X_train_pca2 / X_test_pca2 are not referenced by any later cell
# visible here — confirm whether this cell is still needed.
pca2 = PCA(n_components=500,random_state=42)
X_train_pca2 = pca2.fit_transform(X_train)
X_test_pca2 = pca2.transform(X_test)

In [11]:
# Baseline SVC with default hyperparameters, fitted on the raw (un-projected) training features.
# The fit call is the cell's last expression, so the fitted estimator's repr is displayed.
support_vector_Classifier = SVC()
support_vector_Classifier.fit(X_train, y_train)

SVC()

In [12]:
# Baseline bagging ensemble with default hyperparameters on the raw training features.
# Despite the `_reg` suffix in the name, this is a classifier (BaggingClassifier).
# No random_state is passed here, unlike the seeded pipeline version further down.
bagged_reg = BaggingClassifier()
bagged_reg.fit(X_train, y_train)

BaggingClassifier()

In [13]:
from sklearn.pipeline import Pipeline

In [14]:
# Two-step pipeline: PCA projection feeding a support vector classifier.
# Both steps are seeded with 42; the step names 'PCA' and 'SVC' are the prefixes
# used by the hyperparameter search keys.
pipesvu = Pipeline(steps=[
    ('PCA', PCA(random_state=42)),
    ('SVC', SVC(random_state=42)),
])

In [15]:
# Search space for the PCA -> SVC pipeline. Keys follow the '<step>__<param>' form
# so each value is routed to the matching pipeline step.
pipe_params = {
    'PCA__n_components': list(range(300, 801, 100)),  # 300, 400, ..., 800
    'SVC__C': np.logspace(-3, 2, 50),  # 50 values, log-spaced from 1e-3 to 1e2
    # scikit-learn names the polynomial kernel 'poly'. 'polynomial' is not an
    # accepted value, so every candidate that sampled it failed to fit and the
    # search effectively only ever evaluated 'rbf'.
    'SVC__kernel': ['rbf', 'poly']
}

In [16]:
# Randomized hyperparameter search over the PCA -> SVC pipeline: 100 sampled
# candidates, each scored by ROC AUC. No `cv` argument is passed, so the
# library's default cross-validation scheme is used.
svc_model = RandomizedSearchCV(
    estimator=pipesvu,
    param_distributions=pipe_params,
    n_iter=100,
    scoring='roc_auc',
    n_jobs=-2,
    random_state=42,
)

In [17]:
# Run the randomized search on the training split only; the held-out split is not touched here.
svc_model.fit(X_train, y_train)

RandomizedSearchCV(estimator=Pipeline(steps=[('PCA', PCA(random_state=42)),
                                             ('SVC', SVC(random_state=42))]),
                   n_iter=100, n_jobs=-2,
                   param_distributions={'PCA__n_components': [300, 400, 500,
                                                              600, 700, 800],
                                        'SVC__C': array([1.00000000e-03, 1.26485522e-03, 1.59985872e-03, 2.02358965e-03,
       2.55954792e-03, 3.23745754e-03, 4.09491506e-03, 5.17947468e-03,
       6.55128557e-03, 8.2864...
       7.19685673e-01, 9.10298178e-01, 1.15139540e+00, 1.45634848e+00,
       1.84206997e+00, 2.32995181e+00, 2.94705170e+00, 3.72759372e+00,
       4.71486636e+00, 5.96362332e+00, 7.54312006e+00, 9.54095476e+00,
       1.20679264e+01, 1.52641797e+01, 1.93069773e+01, 2.44205309e+01,
       3.08884360e+01, 3.90693994e+01, 4.94171336e+01, 6.25055193e+01,
       7.90604321e+01, 1.00000000e+02]),
                            

In [18]:
# Best mean cross-validated score found by the search (scoring='roc_auc').
svc_model.best_score_

0.8483922325295064

In [19]:
# Hyperparameter combination that achieved the best cross-validated score.
svc_model.best_params_

{'SVC__kernel': 'rbf', 'SVC__C': 1.8420699693267164, 'PCA__n_components': 600}

In [20]:
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before persisting the fitted search object.
os.makedirs('./models', exist_ok=True)
joblib.dump(svc_model,'./models/svc_model.pkl')

['./models/svc_model.pkl']

## Bagging Model

In [21]:
# Two-step pipeline: PCA projection feeding a bagging ensemble classifier.
# Both steps are seeded with 42; the step names 'PCA' and 'bag' are the prefixes
# used by the hyperparameter search keys.
pipe_bag = Pipeline(steps=[
    ('PCA', PCA(random_state=42)),
    ('bag', BaggingClassifier(random_state=42)),
])

In [22]:
# Search space for the PCA -> bagging pipeline. Keys follow the
# '<step>__<param>' form so each value is routed to the matching pipeline step.
# Note: this rebinds `pipe_params`, replacing the SVC search space defined earlier.
pipe_params = {
    'PCA__n_components': list(range(300, 801, 100)),  # 300, 400, ..., 800
    'bag__n_estimators': list(range(10, 151, 10)),    # 10, 20, ..., 150
    # An integer max_samples is an absolute row count, so the previous
    # range(1, 5) trained every base estimator on only 1-4 rows. Floats are
    # interpreted as a fraction of the training set, which is what is searched here.
    'bag__max_samples': [0.25, 0.5, 0.75, 1.0],
    'bag__bootstrap': [True, False]
}

In [23]:
# Randomized hyperparameter search over the PCA -> bagging pipeline: 100 sampled
# candidates scored by ROC AUC, with progress messages enabled (verbose=1).
bag_model = RandomizedSearchCV(
    estimator=pipe_bag,
    param_distributions=pipe_params,
    n_iter=100,
    scoring='roc_auc',
    n_jobs=-2,
    verbose=1,
    random_state=42,
)

In [None]:
# Run the randomized search on the training split only; the held-out split is not touched here.
bag_model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Best mean cross-validated score found by the search (scoring='roc_auc').
bag_model.best_score_

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
bag_model.best_params_

In [None]:
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before persisting the fitted search object.
os.makedirs('./models', exist_ok=True)
joblib.dump(bag_model,'./models/bag_model.pkl')