## Task1 using a tfidf-vectorizer to convert text to numbers

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
import re
from sklearn.feature_selection import f_classif, chi2

#### Load data into pandas dataframes

In [2]:
df_train = pd.read_csv('data/diab_train.csv')
df_test = pd.read_csv('data/diab_test.csv')
df_val = pd.read_csv('data/diab_validation.csv')

df_train.dropna(inplace=True)
df_test.dropna(inplace=True)
df_val.dropna(inplace=True)

#### Concatenate all three diagnosis to one string

In [3]:
train_txt = (df_train['diag_1_desc'] + ' ' + df_train['diag_2_desc'] + ' ' + df_train['diag_3_desc']).tolist()
test_txt = (df_test['diag_1_desc'] + ' ' + df_test['diag_2_desc'] + ' ' + df_test['diag_3_desc']).tolist()
val_txt = (df_val['diag_1_desc'] + ' ' + df_val['diag_2_desc'] + ' ' + df_val['diag_3_desc']).tolist()

#### Create labels

In [4]:
train_y = df_train['readmitted'].to_numpy()
test_y = df_test['readmitted'].to_numpy()
val_y = df_val['readmitted'].to_numpy()

#### Define preprocessing step which will be later used by the tfidfvectorizer

In [5]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

def preprocess(text):
    stop_words = set(stopwords.words('english')) 
  
    word_tokens = word_tokenize(text) 
  
    text = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(text)
    return WordNetLemmatizer().lemmatize(text)

[nltk_data] Downloading package wordnet to /home/imre/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### N-gram vectorizer using unigrams and bigrams

In [6]:
def ngram_vectorize(train_texts, train_labels, val_texts, test_texts,
	ngram_range = (1,2),
	TOP_K=1000,
	TOKEN_MODE='word',
	MIN_DOCUMENT_FREQUENCY = 2,
	MAX_FEATURES = 1000):
	"""Vectorizes texts as n-gram vectors.

	1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

	# Arguments
		train_texts: list, training text strings.
		train_labels: np.ndarray, training labels.
		val_texts: list, validation text strings.

	# Returns
		x_train, x_val: vectorized training and validation texts
	"""
	# Create keyword arguments to pass to the 'tf-idf' vectorizer.
	kwargs = {
			'ngram_range': ngram_range,  # Use 1-grams + 2-grams.
			'dtype': np.float64,
			'strip_accents': 'unicode',
			'decode_error': 'replace',
			'analyzer': TOKEN_MODE,  # Split text into word tokens.
			'min_df': MIN_DOCUMENT_FREQUENCY,
			'max_features': MAX_FEATURES,
			'preprocessor': preprocess
	}
	vectorizer = TfidfVectorizer(**kwargs)

	# Learn vocabulary from training texts and vectorize training texts.
	x_train = vectorizer.fit_transform(train_texts)

	# Vectorize validation texts.
	x_val = vectorizer.transform(val_texts)
	# Vectorize test set
	x_test = vectorizer.transform(test_texts)

	# Select top 'k' of the vectorized features.
	selector = SelectKBest(score_func=chi2, k=min(TOP_K, x_train.shape[1]))
	selector.fit(x_train, train_labels)
	x_train = selector.transform(x_train).astype('float32')
	x_val = selector.transform(x_val).astype('float32')
	x_test = selector.transform(x_test).astype('float32')

	
	return x_train.todense(), x_val.todense(), x_test.todense(), vectorizer, selector

In [7]:
x_train, x_val, x_test, vectorizer, selector = ngram_vectorize(train_txt, train_y, val_txt, test_txt)

In [8]:
from train_model import train_ngram_model

In [9]:
model = train_ngram_model(x_train, train_y,x_val,val_y)

Train on 4886 samples, validate on 1613 samples
Epoch 1/30
4886/4886 - 1s - loss: 0.6746 - acc: 0.5966 - val_loss: 0.6664 - val_acc: 0.5983
Epoch 2/30
4886/4886 - 0s - loss: 0.6608 - acc: 0.5962 - val_loss: 0.6628 - val_acc: 0.6076
Epoch 3/30
4886/4886 - 0s - loss: 0.6531 - acc: 0.6066 - val_loss: 0.6653 - val_acc: 0.6063
Epoch 4/30
4886/4886 - 0s - loss: 0.6453 - acc: 0.6218 - val_loss: 0.6654 - val_acc: 0.5908
Validation accuracy: 0.590824544429779, loss: 0.6653728665066889


In [10]:
y_pred=model.predict_classes(x_test)

In [11]:
model.evaluate(x_test,test_y)



[0.6755613507965045, 0.57848656]