In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from jupyterthemes import jtplot
from scipy.io import arff
import os

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the jupyterthemes 'gruvboxd' plotting style.
jtplot.style('gruvboxd')

In [2]:
# Despite its name, DATA_DIR holds the full path of the dataset *file*.
# The folder is derived from the current working directory by swapping every
# occurrence of 'Notebooks' in that path string for 'data'.
data_root = os.getcwd().replace('Notebooks', 'data')
DATA_DIR = os.path.join(data_root, 'ct-dataset.csv')

In [3]:
# Parse the dataset with SciPy's ARFF reader. Note that the file name ends in
# '.csv' but it is read as ARFF here. The result is a tuple; its first element
# (the records) is what the following cells use.
data = arff.loadarff(DATA_DIR)

In [4]:
# Inspect what the ARFF loader returned.
type(data)

tuple

In [5]:
# Turn each record in the first element of the loader's result into a plain
# Python list, giving a list of rows.
new_data = [list(record) for record in data[0]]

In [6]:
# Convert the list of rows into a NumPy array (rebinds the same name).
new_data = np.array(new_data)

In [7]:
# Array dimensions: (rows, columns).
new_data.shape

(53500, 386)

In [8]:
# Peek at the first 10 rows of the array.
new_data[:10]

array([[ 0.      ,  0.      ,  0.      , ..., -0.25    , -0.25    ,
        21.803851],
       [ 0.      ,  0.      ,  0.      , ..., -0.25    , -0.25    ,
        21.745726],
       [ 0.      ,  0.      ,  0.      , ..., -0.25    , -0.25    ,
        21.6876  ],
       ...,
       [ 0.      ,  0.      ,  0.      , ..., -0.25    , -0.25    ,
        21.396971],
       [ 0.      ,  0.      ,  0.      , ..., -0.25    , -0.25    ,
        21.28072 ],
       [ 0.      ,  0.      ,  0.      , ..., -0.25    , -0.25    ,
        22.617612]])

In [9]:
# Wrap the array in a DataFrame. No column names are passed, so the columns
# get default integer labels (rebinds the same name once more).
new_data = pd.DataFrame(new_data)

In [10]:
# Show the first rows of the frame.
new_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,376,377,378,379,380,381,382,383,384,385
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.980381,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.803851
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.745726
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.6876
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.629474
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.976833,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.571348


In [11]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
new_data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,376,377,378,379,380,381,382,383,384,385
count,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,...,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0
mean,47.075701,0.059627,0.071558,0.145819,0.218728,0.274762,0.276189,0.204531,0.062281,-0.042025,...,-0.029404,0.182913,0.320112,0.359373,0.342889,0.266091,0.083049,-0.031146,-0.154524,47.028039
std,27.41424,0.174243,0.196921,0.30027,0.359163,0.378862,0.369605,0.351294,0.292232,0.268391,...,0.085817,0.383333,0.463517,0.478188,0.471811,0.437633,0.279734,0.098738,0.122491,22.347042
min,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,...,-0.25,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,1.738733
25%,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,29.891607
50%,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,43.987893
75%,70.0,0.0,0.0,0.0,0.446429,0.684477,0.662382,0.441412,0.0,0.0,...,0.0,0.0,0.996286,0.999677,0.99956,0.949478,0.0,0.0,0.0,63.735059
max,96.0,1.0,1.0,1.0,1.0,0.99879,0.996468,0.999334,1.0,1.0,...,0.961279,1.0,1.0,1.0,1.0,1.0,0.999857,0.996839,0.942851,97.489115


In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

In [13]:
### Get features and targets
# Column 0 is left out of the features; the last column is the regression target.
# NOTE(review): in describe() column 0 ranges 0-96 with integer quartiles, which
# looks like a patient/group identifier. If so, a purely random row split can put
# rows from the same group in both train and test; consider a group-aware split.
# TODO confirm.
X = new_data.iloc[:, 1:-1]
y = new_data.iloc[:, -1]

### Train, test, and valid sets
# Fixed seed so the three partitions are identical on every Restart & Run All.
SPLIT_SEED = 42

# 20% of all rows are held out for testing; 35% of the remaining rows become the
# validation set (about 52% train / 28% valid / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = SPLIT_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size = 0.35, random_state = SPLIT_SEED
)

# Sanity check: the three partitions together cover every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

In [14]:
### Instantiate and train model
# Linear support-vector regressor with C=1 and an iteration cap of 10000.
# NOTE(review): no random_state is passed, and StandardScaler / Pipeline are
# imported but not used in the visible cells, so the features reach the model
# unscaled. Confirm that both are intentional.
model = LinearSVR(C = 1, max_iter = 10000)

# Kept as the last expression so the fitted estimator's repr is displayed.
model.fit(X_train, y_train)

LinearSVR(C=1, max_iter=10000)

In [15]:
### Train R-Squared
# R-squared of the fitted model on the training split. A single computation is
# enough: a regressor's `score` returns this same coefficient of determination,
# so it is not called separately.
r2_score(y_train, model.predict(X_train))

0.8452015806952119

In [16]:
### Valid R-Squared
# R-squared of the fitted model on the validation split.
r2_score(y_val, model.predict(X_val))

0.8485748557037252

In [17]:
### Test R-Squared
# R-squared of the fitted model on the held-out test split.
r2_score(y_test, model.predict(X_test))

0.8416889467050237