In [None]:
import pandas as pd
import numpy as np
import os.path as op
import unicodedata
import sys
from gensim.models import FastText
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict, StratifiedShuffleSplit, train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import classification_report, make_scorer

# Directory that contains the project's "utils" folder, relative to this notebook.
basename = r".." #path to basename file
path_utils = op.join(basename , "utils")
# Put the utils folder first on the import path so the three imports below
# can be resolved (presumably sys_utils, tweet_utils and preprocess live there).
sys.path.insert(0, path_utils)

from sys_utils import load_library
# NOTE(review): star import; `tweet_vectorizer`, used in later cells, is not
# defined anywhere in this notebook and presumably comes from here.
from tweet_utils import *

from preprocess import Preprocess
# Shared preprocessor instance used by preprocess_tweet().
prep = Preprocess()

# Word-embedding model; the path string is a placeholder to be filled in.
model_we = FastText.load(r"fasttext_path")

# Labelled data; the path string is a placeholder to be filled in.
trainingData = pd.read_csv(r"path_labelled_data")


# Quick look at the column types and the first rows.
print(trainingData.dtypes)
trainingData.head(5)

In [None]:
def preprocess_tweet(tweet):
    """Run one tweet through the shared `prep` preprocessor.

    The text is first passed to ``prep.replace_hashtags_URL_USER`` with both
    the URL mode and the mention mode set to "replace", and the result is
    handed to ``prep.tokenize``. Whatever ``tokenize`` returns is returned.
    """
    replaced = prep.replace_hashtags_URL_USER(
        tweet,
        mode_URL="replace",
        mode_Mentions="replace",
    )
    return prep.tokenize(replaced)


def create_history_typeDiabetes_column(row):
    """Combine the history-based diabetes type with the per-tweet label.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys ``"History_TypeDiab"`` and ``"Type_Diabetes"``.

    Returns
    -------
    ``0``, ``1`` or ``2`` when ``History_TypeDiab`` equals that value;
    the row's ``Type_Diabetes`` when ``History_TypeDiab`` is null;
    ``None`` (after printing a diagnostic) for any other value.
    """
    history = row["History_TypeDiab"]

    # Return the integer code rather than the stored value, so 1.0 -> 1.
    for code in (0, 1, 2):
        if history == code:
            return code

    # No history information: fall back to the label of the tweet itself.
    if pd.isnull(history):
        return row["Type_Diabetes"]

    # Report both the label and the unexpected history value (the original
    # message printed Type_Diabetes twice and hid the offending value).
    print("ERROR: Should not occur:  ", row["Type_Diabetes"], ";;;", history)
    return None

# Row-wise merge of the history column and the tweet label into one target column.
trainingData['history_typeDiab_total'] = trainingData.apply(
    create_history_typeDiabetes_column,
    axis=1,
)


In [None]:
#label = "Type_Diabetes"
label = "history_typeDiab_total"

# Take an explicit copy of (at most) the first 30000 rows. Assigning columns
# on a bare slice of `trainingData` is chained assignment: pandas warns
# (SettingWithCopyWarning) and the write may not behave as intended.
data_pd = trainingData[["text", "user_description", "user_name", label]][:30000].copy()

# Replace the raw tweet text by the output of tweet_vectorizer on its tokens.
data_pd["text"] = data_pd["text"].map(
    lambda tweet: tweet_vectorizer(preprocess_tweet(tweet), model_we))

# Same for the user description; float values (NaN, i.e. missing in the CSV)
# and the single-space string are mapped to an all-zero vector instead.
data_pd["user_description"] = data_pd["user_description"].map(
    lambda userDesc: np.zeros((model_we.vector_size, ))
    if isinstance(userDesc, float) or userDesc == " "
    else tweet_vectorizer(preprocess_tweet(userDesc), model_we))


# remove the tweets that are empty because there is no word embedding
data_pd = data_pd[data_pd["text"].apply(lambda x: len(x)>0) ]
print(data_pd.shape)

#data_pd.user_name = data_pd.user_name.map(lambda tweet: prep.remove_non_ascii(tweet))
data_pd.head(10)


In [None]:
class ItemSelect(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a single column from its input.

    ``key`` is the column name used to index the object passed to
    :meth:`transform`.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, data):
        """Return the values of ``data[self.key]`` converted to a NumPy array."""
        column_values = data[self.key].values
        # Going through a Python list lets NumPy build the array from the
        # individual cell values.
        return np.asarray(column_values.tolist())

In [None]:
# choose algo:
#---------------------------------------------------------------------------
from imblearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier  # used by the "MLP" option, was never imported

modelAlgo = "SVC"

# Zero-argument factories, so only the selected estimator is instantiated.
MODEL_FACTORIES = {
    # MultinomialNB has no `random_state` parameter; passing one raises TypeError.
    # NOTE(review): MultinomialNB also rejects negative feature values, so it
    # may not be usable on raw embedding vectors - confirm before selecting it.
    "MultinomialNB": lambda: MultinomialNB(),
    "SVC": lambda: SVC(random_state=0),
    "logReg": lambda: LogisticRegression(random_state=0),
    "RandomForest": lambda: RandomForestClassifier(random_state=0),
    "XGBoost": lambda: XGBClassifier(random_state=0),
    "MLP": lambda: MLPClassifier(early_stopping=True, batch_size=32, random_state=0),
}

# Fail right here on a typo instead of with a NameError on `model` further down.
if modelAlgo not in MODEL_FACTORIES:
    raise ValueError(
        "Unknown modelAlgo %r; expected one of %s" % (modelAlgo, sorted(MODEL_FACTORIES))
    )
model = MODEL_FACTORIES[modelAlgo]()

# Pipeline: concatenate the tweet and user-description vectors, oversample
# with SMOTE, then classify. The imblearn Pipeline is used because it accepts
# a sampler step.
# NOTE(review): recent imbalanced-learn releases deprecate SMOTE's `n_jobs`
# argument - check against the installed version.
pipeline  = Pipeline([
                ('union', FeatureUnion(
                            transformer_list = [
                                ('tweet', Pipeline([
                                    ('tweetsSelector', ItemSelect(key='text')),
                                ])),
                                ('userDesc', Pipeline([
                                    ('userDescSelector', ItemSelect(key='user_description'))
                                ])),
                            ],
                )),
                ('smote', SMOTE(random_state=12, sampling_strategy="auto", n_jobs=-1)), # , ratio = 1.0
                ('model', model),
            ])


# parameter grid for grid search by using fastText embeddings
parameters = {
    # Candidate weightings of the two FeatureUnion branches ('tweet', 'userDesc').
    # The list must not be empty: with every candidate commented out the value
    # was `[]`, which scikit-learn rejects ("need to be a non-empty sequence"),
    # so the grid search failed before fitting anything.
    'union__transformer_weights': [
        {"tweet": 1, "userDesc": 1},
        {"tweet": 1, "userDesc": 0.7},
        {"tweet": 1, "userDesc": 0.5},
        {"tweet": 1, "userDesc": 0.3},
        {"tweet": 1, "userDesc": 0.1},
        {"tweet": 1, "userDesc": 0.0},
        {"tweet": 0, "userDesc": 1},
    ],

    # Weightings that also name a 'userName' branch. Kept for reference only:
    # the union built in this notebook has no such branch.
#       {"tweet": 1, "userDesc":1, "userName":1},
#       {"tweet": 1, "userDesc":1, "userName":0.8},
#       {"tweet": 1, "userDesc":1, "userName":0.5},
#       {"tweet": 1, "userDesc":1, "userName":0.5},
#       {"tweet": 1, "userDesc":1, "userName":0.4},
#       {"tweet": 1, "userDesc":0.8, "userName":0.4},
#       {"tweet": 1, "userDesc":0.8, "userName":0.5},
#       {"tweet": 1, "userDesc":0.8, "userName":0.6},
#       {"tweet": 1, "userDesc":0.7, "userName":0.5},
#       {"tweet": 1, "userDesc":0.9, "userName":0.5},

    # Optional, model-specific search spaces; uncomment the section that
    # matches `modelAlgo`.
#               'smote__k_neighbors' : [3],
#               # param for SVC
#               'model__kernel' : ["linear"],#["linear", "poly", "rbf"],
#               'model__C' : [0.5],
#               'model__tol' : [1e-2],
#               'model__class_weight' : ["balanced", {0:1, 1:1, 2:1}, {0:1, 1:2, 2:1}, {0:1, 1:1, 2:2}, {0:1, 1:2, 2:2}],
#
#               # param for RandomForestClassifier
#               'model__n_estimators' : [50, 100, 150],
#               'model__criterion' : ['gini', 'entropy'],
#               'model__max_features' : ['auto', 'log2'],
#               'model__max_depth' : [ 5, 10, 20, 30]
#
#               # param for XGBoost Best: 0.910828 using {'model__learning_rate': 0.05, 'model__reg_alpha': 0, 'model__max_depth': 3, 'model__reg_lambda': 1.5, 'model__n_estimators': 300}
#               'model__max_depth' : [3,4],
#               'model__learning_rate' : [0.5, 0.1, 0.05],#, 0.01, 0.001],
#               'model__booster' : ["gblinear"], #["gbtree", "gblinear", "dart"],
#               'model__gamma' : [0, 0.01],
#               'model__n_estimators' : [80, 100, 150],
#               'model__reg_alpha' : [0, 0.1],
#               'model__reg_lambda' : [0.5, 1.0]
}


In [None]:
def label_encode(sex):
    """Binary-encode a label: ``-1`` when it equals 0, ``1`` for anything else."""
    return -1 if sex == 0 else 1
    
    
# --- Build the modelling table -------------------------------------------------
print("data before filter out gestational diabetes:", data_pd.shape, type(data_pd))
# Keep every row whose label is not 3.
data_pd_withoutGestational = data_pd.loc[data_pd[label] != 3] # ignore tweets with gestational diabetes as there are too few

#print("data after before filter out gestational diabetes:", data_pd_withoutGestational.shape, type(data_pd_withoutGestational))
# Features: the two vectorised text columns. Target: the column named by `label`.
X = data_pd_withoutGestational[["text", "user_description"]]
y = data_pd_withoutGestational[label]

# Sanity output: feature-table shape and class distribution.
print("X :", X.shape, type(X))
print("y.unique: ", y.unique())
print(y.value_counts())


# Hold out 30% of the rows for testing, with a fixed seed.
# NOTE(review): the split is not stratified (no `stratify=y`) - confirm that
# this is intended given the class counts printed above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 

# Re-wrap the splits as DataFrames restricted to the two feature columns.
X_train_pd = pd.DataFrame(X_train, columns=["text", "user_description"])
X_test_pd = pd.DataFrame(X_test, columns=["text", "user_description"])

# --- Grid search ----------------------------------------------------------------
#from sklearn.metrics import precision_score, roc_auc_score, make_scorer
# Model selection criterion: micro-averaged precision.
prec_scorer = make_scorer(precision_score, average="micro")
print("Start Grid search...")
# 10-fold cross-validation over `parameters`; n_jobs=-2 is passed through to
# scikit-learn's parallel backend.
grid = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-2, verbose=2, scoring=prec_scorer)
grid.fit(X_train_pd, y_train)
print("\nBest: %f using %s" % (grid.best_score_, grid.best_params_))

# --- Evaluation on the held-out 30% ---------------------------------------------
y_pred = grid.best_estimator_.predict(X_test_pd)
#print("F1-Score:", f1_score(y_test, y_pred))
#print("Precision: ",precision_score(y_test, y_pred))
#print("Recall: ", recall_score(y_test, y_pred))    
print("Accuracy: ", accuracy_score(y_test, y_pred))   
print("Performance overall: ")
print(classification_report(y_test, y_pred))

In [None]:
# SVC - SMOTE - precision scoring
 

# Take this one

# SVC - no SMOTE
#Best: 0.733737 using {'model__C': 0.5, 'model__kernel': 'linear', 'model__tol': 0.1, 'union__transformer_weights': {'tweet': 1, 'userDesc': 0.5}}
#Accuracy:  0.7530864197530864
#Performance overall: 
#              precision    recall  f1-score   support

#           0       0.71      0.87      0.78       243
#           1       0.83      0.69      0.75       157
#           2       0.78      0.64      0.70       167

#   micro avg       0.75      0.75      0.75       567
#   macro avg       0.77      0.73      0.75       567
#weighted avg       0.76      0.75      0.75       567


# SVC - SMOTE
#Best: 0.739032 using {'model__C': 0.5, 'model__kernel': 'linear', 'model__tol': 0.01, 'smote__k_neighbors': 3, 'union__transformer_weights': {'tweet': 1, 'userDesc': 0.3}}
#Accuracy:  0.7372134038800705
#Performance overall: 
#              precision    recall  f1-score   support

#           0       0.74      0.78      0.76       243
#           1       0.72      0.69      0.71       157
#           2       0.75      0.72      0.73       167

#   micro avg       0.74      0.74      0.74       567
#   macro avg       0.74      0.73      0.73       567
#weighted avg       0.74      0.74      0.74       567


In [None]:
#save model
import joblib

# Write the best estimator found by the grid search to disk, compression level 1.
# NOTE(review): the filename is an empty string here - presumably a placeholder
# that has to be filled in before this cell can run.
joblib.dump(value=grid.best_estimator_, filename='', compress=1)

In [None]:
#Plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. Labels are used as positions into
        `classes`, so they must be (convertible to) integer codes.
    classes : sequence of str
        Display names, indexed by label value.
    normalize : bool
        If True, every row is divided by its sum (rows sum to 1).
    title : str or None
        Figure title; a default depending on `normalize` is used when empty.
    cmap : matplotlib colormap
        Colormap for the matrix image.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Only use the labels that appear in the data. Passing them explicitly
    # keeps matrix rows/columns and tick labels in the same order.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Cast to int before indexing: float-coded labels (0.0, 1.0, ...) cannot be
    # used as array indices. np.asarray also lets `classes` be a plain list.
    classes = np.asarray(classes)[np.asarray(labels).astype(int)]

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that only occurs in y_pred has an all-zero row; divide it by
        # 1 so the row stays 0 instead of becoming NaN.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations; cells above half
    # of the maximum get white text so they stay readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax



np.set_printoptions(precision=2)

#class_names = np.array(["M", "F", "U"])
class_names = np.array(["Unknown", "Type 1", "Type 2"])

# Two figures from the same predictions: raw counts first, then the
# normalized variant.
for use_normalize, figure_title in (
        (False, 'Confusion matrix, without normalization'),
        (True, 'Normalized confusion matrix'),
):
    plot_confusion_matrix(y_test, y_pred, classes=class_names,
                          normalize=use_normalize,
                          title=figure_title)

plt.show()

# Predict having diabetes

In [None]:
def preprocess_tweet(tweet):
    """Apply the shared `prep` preprocessor to one tweet and tokenize it.

    Calls ``prep.replace_hashtags_URL_USER`` (URL mode and mention mode both
    "replace") and returns the result of ``prep.tokenize`` on its output.
    The body matches the function of the same name defined in an earlier cell.
    """
    return prep.tokenize(
        prep.replace_hashtags_URL_USER(tweet, mode_URL="replace", mode_Mentions="replace")
    )


def create_haveDiab_column(row):
    """Combine the history-based "has diabetes" flag with the per-tweet label.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys ``"History_HasDiab"`` and ``"HasDiabetes"``.

    Returns
    -------
    The row's ``HasDiabetes`` when ``History_HasDiab`` is a string or null;
    ``0`` / ``1`` when ``History_HasDiab`` is numerically 0 / 1 (within 1e-9);
    ``None`` (after printing a diagnostic) for any other value.
    """
    history = row["History_HasDiab"]

    # Strings and nulls carry no usable flag, so fall back to the tweet label.
    # The null test has to come before float(): float(None) raises TypeError.
    if isinstance(history, str) or pd.isnull(history):
        return row["HasDiabetes"]

    value = float(history)
    # Compare by absolute distance. Without abs(), every value <= 1 (0.5, -3,
    # ...) was silently mapped to 1 or 0.
    if abs(value) < 1e-9:
        return 0
    if abs(value - 1) < 1e-9:
        return 1

    print("ERROR: Should not occur:  ", row["HasDiabetes"], ";;;", history)
    return None

def preprocess_tweet(tweet):
    """Replace hashtags/URLs/mentions in a tweet via `prep`, then tokenize it.

    NOTE(review): this repeats the definition given a few lines above in the
    same cell; the later definition is the one that stays bound.
    """
    cleaned_tweet = prep.replace_hashtags_URL_USER(
        tweet, mode_URL="replace", mode_Mentions="replace"
    )
    tokens = prep.tokenize(cleaned_tweet)
    return tokens

def userName_to_vec(name):
    """Return the embedding of the first word of a user name.

    The name is NFKD-normalised and stripped of non-ASCII characters, and the
    part before the first space is looked up in the module-level `model_we`.

    Parameters
    ----------
    name : str (other values, e.g. NaN for a missing name, are accepted)

    Returns
    -------
    The vector for the first name, or a zero vector of length
    ``model_we.vector_size`` when `name` is not a string, the first name is
    empty, or the model has no vector for it.
    """
    zero_vec = np.zeros((model_we.vector_size, ))

    # Missing names arrive as non-strings (e.g. float NaN); nothing to look up.
    if not isinstance(name, str):
        return zero_vec

    ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    firstName = ascii_name.split(" ")[0]
    if not firstName:
        return zero_vec

    # Look the word up on the keyed vectors (`.wv`) when the model has them:
    # Gensim 4 models are no longer subscriptable. With the former bare
    # `except:` that TypeError was swallowed and every name became zeros.
    vectors = getattr(model_we, "wv", model_we)
    try:
        return vectors[firstName]
    except KeyError:
        # No vector available for this word.
        return zero_vec
# Load a dataset; the path string is empty here and has to be filled in.
df = pd.read_csv(r"") #import dataset 
#del df["emotion"]
#del df["__index_level_1__"]
print(df.shape)
print(df.columns)
print(df.head())

# Merge the history flag and the tweet label row by row (see create_haveDiab_column).
df['HaveDiab_merge'] = df.apply (lambda row: create_haveDiab_column(row), axis=1)

# NOTE(review): not the last expression of the cell, so the counts are not displayed.
df.HaveDiab_merge.value_counts()

# NOTE(review): `df` is rebound to a freshly read file here, so the
# 'HaveDiab_merge' column computed above is not carried over - confirm that
# a different dataset is meant to be loaded at this point.
df = pd.read_csv(r"") #import dataset 
print(df.shape)
print(df.columns)
print(df.head())

In [None]:
label = "Type_Diabetes"
data_pd = df[["text", "user_description", "user_name", label]]

# Build the feature frame for prediction from `df`, vectorising the text
# columns the same way as for training.
temp = pd.DataFrame()
temp["text"] = df.text.map(lambda tweet: tweet_vectorizer(preprocess_tweet(tweet), model_we))
# Missing descriptions (None, float NaN) and the single-space string become a
# zero vector. `is None` replaces `== None`: identity is the correct test for None.
temp["user_description"] = df.user_description.map(lambda userDesc: np.zeros((model_we.vector_size, ))
                                                if userDesc is None or isinstance(userDesc, float) or userDesc == " "
                                                else tweet_vectorizer(preprocess_tweet(userDesc), model_we))
# NOTE(review): the pipeline built in this notebook only selects 'text' and
# 'user_description'; confirm whether the loaded model uses 'user_name'.
temp["user_name"] = df.user_name.map(lambda name: userName_to_vec(name))

# Load the persisted classifier; the path string is empty and has to be filled in.
# joblib.load unpickles the file, which can execute arbitrary code - only load
# model files from a trusted source.
gender_classifier = joblib.load("")

# NOTE(review): predictions go to "Type_diabetes" (lower-case d), which is a
# different column from `label` ("Type_Diabetes") - confirm this is intended.
df["Type_diabetes"] = gender_classifier.predict(temp)
df.head()

In [None]:
# Export the DataFrame (including the predicted column added in the previous
# cell) as CSV without the index column. The path string is empty here and has
# to be filled in.
df.to_csv(r"",index=False)