In [12]:
# Apply a classification only using textual features in order to see their potential alone.
# as textual features, tfidfs are used and we apply dimensionality reduction later.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    
data_train = pd.read_csv('task1/data/diab_train.csv', index_col=0)
data_test = pd.read_csv('task1/data/diab_test.csv', index_col=0)
data_validation = pd.read_csv('task1/data/diab_validation.csv', index_col=0)
data = pd.concat([data_train, data_validation, data_test], axis=0)
data.reset_index(drop=True, inplace=True)

def extract_textual_features(data, colname):
    corpus = data[colname]
    corpus = corpus.replace(np.NaN, '').values
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
    X = vectorizer.fit_transform(corpus)
    text_features = pd.DataFrame.sparse.from_spmatrix(X)
    return text_features

# names = ['diag_1_desc', 'diag_2_desc', 'diag_3_desc']
# data_original = data.copy()
# for col in names:
#     features = extract_textual_features(data_original, col)
#     data = pd.concat([data, features], axis=1)


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]
vectorizer = TfidfVectorizer(ngram_range=(2,2))
vectorizer = CountVectorizer(ngram_range=(2,2))
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.shape)
X.toarray()

['and this', 'document is', 'first document', 'is the', 'is this', 'second document', 'the first', 'the second', 'the third', 'third one', 'this document', 'this is', 'this the']
(4, 13)


array([[0.        , 0.        , 0.52303503, 0.42344193, 0.        ,
        0.        , 0.52303503, 0.        , 0.        , 0.        ,
        0.        , 0.52303503, 0.        ],
       [0.        , 0.47633035, 0.        , 0.30403549, 0.        ,
        0.47633035, 0.        , 0.47633035, 0.        , 0.        ,
        0.47633035, 0.        , 0.        ],
       [0.49819711, 0.        , 0.        , 0.31799276, 0.        ,
        0.        , 0.        , 0.        , 0.49819711, 0.49819711,
        0.        , 0.39278432, 0.        ],
       [0.        , 0.        , 0.43779123, 0.        , 0.55528266,
        0.        , 0.43779123, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.55528266]])

In [13]:
# apply tfidf to whole data together, and then for each patient take the mean of all 3 diagnoses


textdata = pd.concat([data['diag_1_desc'], data['diag_2_desc'], data['diag_3_desc']], axis=0)
textdata.reset_index(inplace=True, drop=True)
textdata

0        Acute pericarditis in diseases classified else...
1                         Malignant essential hypertension
2              Urinary tract infection, site not specified
3                     Respiratory abnormality, unspecified
4        Coronary atherosclerosis of unspecified type o...
                               ...                        
29995                            Pure hypercholesterolemia
29996    Diabetes mellitus without mention of complicat...
29997                                 Obesity, unspecified
29998                                                  NaN
29999    Diabetes mellitus without mention of complicat...
Length: 30000, dtype: object

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

textdata = textdata.replace(np.NaN, '').values
# vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
vectorizer = CountVectorizer(stop_words='english', max_df=0.5)
X = vectorizer.fit_transform(textdata)
text_features = pd.DataFrame.sparse.from_spmatrix(X)

In [15]:
text_features = X.reshape(10000,-1)
text_features.shape
type(text_features)
text_features = text_features.toarray()

In [16]:
diag1 = text_features[:,:1077]
diag2 = text_features[:,1077:2154]
diag3 = text_features[:, 2154:]

print(diag1.shape, diag2.shape, diag3.shape)

(10000, 1077) (10000, 1077) (10000, 1077)


In [17]:

text_mean = diag1 + diag2 + diag3 
text_mean.shape

#### we can either leave the features as they are. i.e 1077 features or we take the ones that have the highest 
#### variance among all columns #####

from sklearn.decomposition import KernelPCA

transformer = KernelPCA(n_components=200, kernel='rbf', random_state=10)
text_mean = transformer.fit_transform(text_mean)
text_mean.shape


(10000, 200)

In [18]:
# now do SVM
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import f1_score, auc, roc_curve, make_scorer, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV
import auxilary
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = text_mean[:8000,:]
y_train = data['readmitted'].values[:8000]
X_test = text_mean[8000:,:]
y_test = data['readmitted'].values[8000:]


X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


scoring ={'auroc':    make_scorer(roc_auc_score, greater_is_better=True),
        'f1_score': make_scorer(f1_score, average='micro', greater_is_better=True)}


parameters = {'kernel': ['rbf'],
              'C': [0.1, 1, 10, 100],
              'gamma': ['scale'],
              'degree': [3]}


classifier = SVC(random_state=423, class_weight='balanced', max_iter=80000)
grid_classifier = GridSearchCV(classifier, parameters, cv=4, verbose=3, scoring=scoring, n_jobs=3, refit='f1_score')
print('fitting...')
grid_classifier.fit(X_train, y_train)


y_predict = grid_classifier.predict(X_test)
print('best score: ', grid_classifier.best_score_)
print('best parameters: ', grid_classifier.best_params_)

accuracy = accuracy_score(y_test, y_predict)
aucroc_score = roc_auc_score(y_test, y_predict)
f1Score = f1_score(y_test, y_predict)
print('Accuracy: {} AUCROC: {} F1: {}'.format(accuracy, aucroc_score, f1Score))






fitting...
Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  16 out of  16 | elapsed:  2.0min finished


best score:  0.52925
best parameters:  {'C': 100, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.528 AUCROC: 0.5062210664774941 F1: 0.40253164556962023
