# Predicting Type of MS

First, we import the relevant packages

In [6]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import pickle

## Importing the free text and MS Type Label

Code adapted from Matt's

In [7]:
# Read in the dataset into a pandas data-frame
df_all = pd.read_csv("FreeTextProcessed_20220126.csv")

# Extract entries which contain responses to the free text question
# of the questionaire 
df_free_txt = df_all[df_all["QuestionnaireKey"] == "v3_unique_textbox"]

# Extract the column which only contains the free text responses
# Convert to np.array (because I like to)
df_free_txt_only = df_free_txt.filter(["Value"]).fillna(' ').to_numpy()
df_ms_at_diagnosis = df_free_txt.filter(["MSAtDiagnosis"]).fillna('Unknown').to_numpy()
df_ms_now = df_free_txt.filter(["MSTypeNow"]).fillna('Unknown').to_numpy()

Validate their shapes

In [8]:
print("Free text shape:", df_free_txt_only.shape)
print("MS at diagnosis shape:", df_ms_at_diagnosis.shape)
print("MS now shape:", df_ms_now.shape)

Free text shape: (6768, 1)
MS at diagnosis shape: (6768, 1)
MS now shape: (6768, 1)


### Labeling the MS Type

Here, we can  use the MS Type now column, unless it is Unknown, in which case we use the one at diagnosis.

In [9]:
Class_label_correspondances = {
    0: 'PPMS',
    1: 'SPMS',
    2: 'RRMS',
    3: 'Benign',
    4: 'Unknown'
}

In [10]:
ms_label = []

for i in range(df_ms_now.size):
    if df_ms_now[i].item() == 'Unknown':
        ms_label.append(df_ms_at_diagnosis[i].item())
    else:
        ms_label.append(df_ms_now[i].item())

ms_label_int = []
inverted_labels = {v:k for k,v in Class_label_correspondances.items()}

for MSType in ms_label:
    ms_label_int.append(inverted_labels[MSType])

free_text = []
for i in range(df_ms_now.size):
    free_text.append(df_free_txt_only[i].item())


In [11]:
# Sort the unknown ones into a different set
dataset, unknown = [], []
data_label, unknown_label = [], []
for i in range(len(ms_label_int)):
    (unknown, dataset)[ms_label_int[i]<4].append(free_text[i])
    (unknown_label, data_label)[ms_label_int[i]<4].append(ms_label_int[i])

### Splitting into train and test set 
Here I chose to use an 80/20 split

In [12]:
def split_data(lendata, percentage = 0.8):
    indexes = np.array(range(0,lendata))
    np.random.shuffle(indexes)
    train_size = int(lendata*percentage)
    
    idxs_train = indexes[:train_size]
    idxs_test = indexes[train_size:]
    
    return idxs_train, idxs_test

In [13]:
LENDATA = len(dataset)
np.random.seed(69420)
idxs_train, idxs_test = split_data(LENDATA, 0.8)

In [14]:
bow = CountVectorizer()
tfidf = TfidfTransformer()
vectorizer = TfidfVectorizer()

In [15]:
# Define a class to hold the data (similar to Dataset class in torch)
# Do I want to put bag of words and tfidf at this stage?? probably.
class MSDataset(object):
    def __init__(self, data, labels, idxs_train, idxs_test):
        self.train_set = [data[i] for i in idxs_train]
        self.train_labels = [labels[i] for i in idxs_train]
        self.test_set = [data[i] for i in idxs_test]
        self.test_labels = [labels[i] for i in idxs_test]

In [16]:
full_dataset = MSDataset(dataset, data_label, idxs_train, idxs_test)

In [20]:
pickle.dump(full_dataset, open('dataset.sav','wb'))

In [17]:
keys = [0,1,2,3]


### Converting to a Bag-of-Words model and using Tfidf


In [18]:
X = bow.fit_transform(full_dataset.train_set)

In [19]:
train_vectors = bow.transform(full_dataset.train_set)
test_vectors = bow.transform(full_dataset.test_set)
# model = MLPClassifier(hidden_layer_sizes=(30,),batch_size=16,max_iter=20)
# classifier = make_pipeline(vectorizer,model)
print(train_vectors.shape)
print(test_vectors.shape)

(5242, 13462)
(1311, 13462)


In [23]:
regression_model = LogisticRegression()
regression_model.fit(train_vectors, full_dataset.train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
score = regression_model.score(test_vectors, full_dataset.test_labels)
print(score)

0.5179252479023646


In [32]:
unknown_vectors = bow.transform(unknown)
unknown_predictions = regression_model.predict(unknown_vectors)
print(Class_label_correspondances[unknown_predictions[0]])

SPMS


In [44]:
# saving the model to be used by the app
model_sav = 'regression_model_bow.sav'
pickle.dump(regression_model, open(model_sav, 'wb'))

In [None]:
train_vectors = vectorizer.transform(full_dataset.train_set)
test_vectors = vectorizer.transform(full_dataset.test_set)
# model = MLPClassifier(hidden_layer_sizes=(30,),batch_size=16,max_iter=20)
# classifier = make_pipeline(vectorizer,model)
print(train_vectors.shape)
print(test_vectors.shape)

### Trying to use keras NN model

In [1]:
# import tensorflow as tf

In [52]:
import os
print(os.path.expanduser('~'))

C:\Users\khook


### Importing the 20 News Groups dataset to use as a negative

In [33]:
twenty_train = pickle.load(open('twenty_train.sav','rb'))

In [34]:
religion_text = []
for i in range(len(twenty_train.target)):
    if twenty_train.target[i] == 19:
        religion_text.append(twenty_train.data[i])

### Training the OneClassSVM on the MS free text data

In [35]:
one_class_svm = OneClassSVM(nu=0.1,kernel='rbf',gamma=0.1)

In [36]:
one_class_svm.fit(train_vectors)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.1, kernel='rbf',
            max_iter=-1, nu=0.1, random_state=None, shrinking=True, tol=0.001,
            verbose=False)

In [37]:
# testing on the train set to see how it predicts on itself
pred = one_class_svm.predict(train_vectors)

In [38]:
unique, count = np.unique(pred, return_counts=True)
print(unique, count)

[-1  1] [1555 3687]


In [39]:
# testing on the test set (which should all be true btw)
pred_test = one_class_svm.predict(test_vectors)

In [40]:
unique1, count1 = np.unique(pred_test, return_counts=True)
print(unique1, count1)

[-1  1] [776 535]


Transforming the random religion set into vector

In [2]:
# religion_and_test = full_dataset.test_set + religion_text
# religion_and_test_labels = np.concatenate(np.ones_like(np.array(full_dataset.test_set)),np.zeros_like(np.array(religion_text)))

In [3]:
# religion_and_test_vectors = vectorizer.transform(religion_and_test)
# print(religion_and_test_vectors.shape)

In [4]:
# final_test = one_class_svm.predict(religion_and_test_vectors)

In [5]:
# unique2, count2 = np.unique(final_test, return_counts=True)
# print(unique2, count2)