### Table of Contents
* [Preprocessing of Text](#preproc)
* [Wordclouds split by Label](#clouds)
* [Bigrams split by Label](#bigrams)
* [Vectorization](#vectors)
* [Classification Model](#model)
* [Evaluate Model on Test Set](#test)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time 

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# string functions
import string

# wordcloud
from wordcloud import WordCloud

# NLTK
from nltk import word_tokenize, bigrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# ML
import h2o
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators import H2OGradientBoostingEstimator

# other stuff
from collections import Counter

In [None]:
# display configuration: never truncate cell contents when showing data frames
pd.options.display.max_colwidth = None

In [None]:
# stop words: English stop word list from the NLTK corpus, printed for inspection
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# other preparations: Porter stemmer instance, used by extract_tokens() further down
porter = PorterStemmer()

In [None]:
# load data (input is an Excel file) and preview the first rows
df = pd.read_excel('../input/students-anxiety-and-depression-dataset/dataset.xlsx')
df.head()

In [None]:
# data frame overview (columns, dtypes, non-null counts)
df.info()

In [None]:
# drop incomplete rows, renumber the index from 0 and store the label as integer
df = df.dropna().reset_index(drop=True)
df['label'] = df['label'].astype(int)
df.info()

In [None]:
# bar chart of the class counts (1 ~ anxiety/depression)
label_counts = df['label'].value_counts()
label_counts.plot(kind='bar')
plt.title('Label distribution - 1 ~ anxiety/depression')
plt.grid()
plt.show()

<a id='preproc'></a>
# Preprocessing of Text

In [None]:
# cleaning text
def clean_text(i_text):
    """Return i_text in lower case with every string.punctuation character removed."""
    # translation table that maps each punctuation character to "delete"
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return i_text.lower().translate(drop_punctuation)

In [None]:
# extract words
def extract_tokens(i_text):
    """Tokenize i_text, drop stop words and stem the remaining tokens.

    Parameters
    ----------
    i_text : str
        Text to tokenize (this notebook always passes the output of clean_text()).

    Returns
    -------
    list of str
        Porter-stemmed tokens in their original order.
    """
    # Set membership is O(1); testing against the stop word list was a linear
    # scan per token, which adds up on the long concatenated per-label texts.
    stop_set = set(stop_words)
    return [porter.stem(word)
            for word in word_tokenize(i_text)
            if word not in stop_set]

In [None]:
# clean version of texts: apply clean_text() to every entry of the raw text column
df['text_clean'] = df.text.apply(clean_text)

In [None]:
# tokenize texts: each cell of 'tokens' holds the list returned by extract_tokens()
df['tokens'] = df.text_clean.apply(extract_tokens)

In [None]:
# glue each token list back into one blank-separated string (input for TF-IDF)
df['text_tokens'] = df['tokens'].apply(' '.join)

In [None]:
# show results so far (raw text plus the derived text_clean / tokens / text_tokens columns)
df.head()

<a id='clouds'></a>
# Wordclouds split by Label

In [None]:
# one long string per class: all raw texts of that label, joined by blanks
text_0 = " ".join(df.loc[df['label'] == 0, 'text'])
text_1 = " ".join(df.loc[df['label'] == 1, 'text'])

In [None]:
# refine stopwords for wordcloud
# Build a NEW list instead of aliasing stop_words: appending through an alias
# also changed the stop words seen by extract_tokens(), and re-running this
# cell appended the same entries again.
stop_words_cloud = stop_words + [
    "i'm",
    "i'll",
    "i've",
    "can't",
    'ðÿ',  # presumably mis-decoded characters in the raw texts - TODO confirm
    'â',
]

In [None]:
# wordcloud for label=0
cloud_settings = dict(stopwords=stop_words_cloud,
                      max_font_size=50,
                      max_words=250,
                      width=600,
                      height=400,
                      background_color='black')
wordcloud = WordCloud(**cloud_settings).generate(text_0)

# render the cloud as an image without axes
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# wordcloud for label=1
cloud_settings = dict(stopwords=stop_words_cloud,
                      max_font_size=50,
                      max_words=250,
                      width=600,
                      height=400,
                      background_color='black')
wordcloud = WordCloud(**cloud_settings).generate(text_1)

# render the cloud as an image without axes
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

<a id='bigrams'></a>
# Bigrams split by Label

In [None]:
# bigram frequencies for label=0: clean -> tokenize -> pair up -> count
bigram_counter = Counter(bigrams(extract_tokens(clean_text(text_0))))
# one row per bigram, single column holding its count
dcounts = pd.DataFrame.from_dict(dict(bigram_counter), orient='index', columns=['frequency'])
# keep only bigrams occurring at least twice, most frequent first
dcounts = dcounts[dcounts.frequency >= 2]
dcounts = dcounts.sort_values(by='frequency', ascending=False)

# horizontal bar chart of the most frequent bigrams
n_top = 30
top_labels = [str(bigram) for bigram in dcounts.index.tolist()][:n_top]
top_freqs = dcounts.frequency.iloc[:n_top].values
plt.figure(figsize=(8, 12))
plt.barh(y=top_labels, width=top_freqs)
plt.title('Bigrams - Label = 0')
plt.grid()
plt.gca().invert_yaxis()  # largest bar on top
plt.show()

In [None]:
# bigram frequencies for label=1: clean -> tokenize -> pair up -> count
bigram_counter = Counter(bigrams(extract_tokens(clean_text(text_1))))
# one row per bigram, single column holding its count
dcounts = pd.DataFrame.from_dict(dict(bigram_counter), orient='index', columns=['frequency'])
# keep only bigrams occurring at least twice, most frequent first
dcounts = dcounts[dcounts.frequency >= 2]
dcounts = dcounts.sort_values(by='frequency', ascending=False)

# horizontal bar chart of the most frequent bigrams
n_top = 30
top_labels = [str(bigram) for bigram in dcounts.index.tolist()][:n_top]
top_freqs = dcounts.frequency.iloc[:n_top].values
plt.figure(figsize=(8, 12))
plt.barh(y=top_labels, width=top_freqs)
plt.title('Bigrams - Label = 1 (Anxiety/Depression)')
plt.grid()
plt.gca().invert_yaxis()  # largest bar on top
plt.show()

<a id='vectors'></a>
# Vectorization

In [None]:
# run TFIDF analysis
maxfeat = 250  # upper bound on the number of TF-IDF features
# Keep the fitted vectorizer under its own name: rebinding `tfidf` to the
# transformed matrix discarded it, so the generic f<i> feature columns could
# no longer be mapped back to terms (e.g. via tfidf_vectorizer.vocabulary_).
tfidf_vectorizer = TfidfVectorizer(max_features=maxfeat)
# `tfidf` remains the document-term matrix, as the following cells expect
tfidf = tfidf_vectorizer.fit_transform(df.text_tokens)
# NOTE(review): the vectorizer is fitted on all rows, i.e. before the
# train/test split further down - confirm this is acceptable for the evaluation.

In [None]:
# convert to data frame
# Name the columns after the actual matrix width: max_features is only an
# upper bound, so with a vocabulary smaller than maxfeat a fixed
# range(maxfeat) would not match the number of columns and DataFrame() fails.
column_names = ['f' + str(i) for i in range(tfidf.shape[1])]
tfidf_matrix_df = pd.DataFrame(tfidf.toarray(), columns=column_names)

In [None]:
# add vectorization results to data frame (axis=1: the feature columns are appended side by side)
df = pd.concat([df, tfidf_matrix_df], axis=1)
df.head()

<a id='model'></a>
# Classification Model

In [None]:
# start H2O (connects to or launches the H2O backend used for modelling below)
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# select predictors: all TF-IDF feature columns (same list object as column_names)
predictors = column_names
print('Number of predictors: ', len(predictors))

In [None]:
# upload data frame in H2O environment
# only the raw text, the TF-IDF predictors and the label are transferred
df_hex = h2o.H2OFrame(df[['text']+predictors + ['label']])

In [None]:
# convert target to categorical, so the model is trained as a classifier
df_hex['label'] = df_hex['label'].asfactor()

# train / test split (70/30), seeded for reproducibility
# NOTE(review): H2O's split_frame is presumably probabilistic, so the ratio is approximate - confirm
train_hex, test_hex = df_hex.split_frame(ratios=[0.7], seed=123)

# pandas versions of train/test (used for the CSV export and for inspecting predictions)
df_train = train_hex.as_data_frame()
df_test = test_hex.as_data_frame()

In [None]:
# make available for download: write both splits as CSV into the working directory
df_train.to_csv('df_train.csv')
df_test.to_csv('df_test.csv')

In [None]:
# define Gradient Boosting model
# - 25 trees of depth <= 6 with a small learning rate (0.005)
# - every tree sees all rows (sample_rate=1); col_sample_rate=0.5 subsamples the columns
# - nfolds=5 enables 5-fold cross validation
# - scoring after each tree, with AUC-based early stopping (stopping_rounds=5)
# - fixed seed for reproducibility
fit_1 = H2OGradientBoostingEstimator(ntrees = 25,
                                     max_depth=6,
                                     min_rows=15,
                                     learn_rate=0.005, # default: 0.1
                                     sample_rate=1,
                                     col_sample_rate=0.5,
                                     nfolds=5,
                                     score_each_iteration=True,
                                     stopping_metric='AUC',
                                     stopping_rounds=5,
                                     seed=123)

In [None]:
# and train the model on the training split, measuring wall-clock time
t1 = time.time()
fit_1.train(x=predictors,
            y='label',
            training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show scoring history - training vs cross validations
# Iterate over the CV models themselves instead of a hard-coded range(5): the
# loop then follows nfolds automatically and the model list is fetched once.
for i, cv_model_temp in enumerate(fit_1.cross_validation_models()):
    # scoring_history() is the current API name; score_history() is a deprecated alias
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(1+i) + ' - Scoring History [AUC]'
    # training AUC (blue) vs hold-out fold AUC (orange) per number of trees
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_auc,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_auc,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('AUC')
    plt.ylim(0.9,1)  # fixed range so the fold plots are directly comparable
    plt.legend()
    plt.grid()
    plt.show()


In [None]:
# show cross validation metrics (summary over the cross validation models)
fit_1.cross_validation_metrics_summary()

### ROC Curves

In [None]:
# training performance: metrics on the training data, then plot them (ROC curve)
# (the trailing ';' suppresses the notebook's echo of the return value)
perf_train = fit_1.model_performance(train=True)
perf_train.plot();

In [None]:
# cross validation performance: cross-validated metrics, then plot them (ROC curve)
# (the trailing ';' suppresses the notebook's echo of the return value)
perf_cv = fit_1.model_performance(xval=True)
perf_cv.plot();

### Variable Importance

In [None]:
# variable importance - basic version (H2O's built-in importance plot)
fit_1.varimp_plot();

In [None]:
# variable importance using shap values => see direction as well as severity of feature impact
# computed on the training frame; timed because it can take a while
t1 = time.time()
fit_1.shap_summary_plot(train_hex);
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show most influential factor vs target
# Take the top feature from the fitted model instead of a hard-coded column
# name: 'f176' was only valid for one particular run and goes stale when the
# data, seed or model parameters change.
# varimp() lists the variables by decreasing importance; field 0 is the name.
top_feature = fit_1.varimp()[0][0]
sns.boxplot(data=df, y=top_feature, x='label')
plt.grid()
plt.show()

In [None]:
# show confusion matrix at calculated threshold - training
# (no threshold is passed, so H2O picks its default one)
conf_train = fit_1.confusion_matrix(train=True)
conf_train.show()

In [None]:
# store threshold from training
# Derive the max-F1 threshold from the model (F1 is the default criterion of
# confusion_matrix()) instead of pasting the number printed by an earlier
# run, which silently goes stale when data, seed or parameters change.
tt = fit_1.find_threshold_by_max_metric('f1', train=True)

In [None]:
# show confusion matrix at calculated threshold - cross validation
# (re-uses the threshold tt stored from training)
conf_cv_tt = fit_1.confusion_matrix(xval=True, thresholds=tt)
conf_cv_tt.show()

<a id='test'></a>
# Evaluate Model on Test Set

In [None]:
# calc performance on test set (data the model was not trained on)
perf_test = fit_1.model_performance(test_hex)
# ROC Curve - Test Set
perf_test.plot();

In [None]:
# confusion matrix on the test set using threshold from training (tt)
conf_test = perf_test.confusion_matrix(thresholds=tt)
conf_test.show()

In [None]:
# predicted probability of class 1 for every test row, pulled into pandas
pred_test = fit_1.predict(test_hex)['p1'].as_data_frame()['p1']

# attach the probabilities to the pandas copy of the test set
df_test['prediction'] = pred_test

# histogram of the predicted probabilities
plt.hist(pred_test, bins=50)
plt.title('Predictions on Test Set (Probabilities)')
plt.grid()
plt.show()

In [None]:
# show highest predictions: the 5 test rows with the largest predicted probability
df_top = df_test.nlargest(5, columns='prediction')
df_top[['text','label','prediction']]

In [None]:
# show lowest predictions: the 5 test rows with the smallest predicted probability
df_bot = df_test.nsmallest(5, columns='prediction')
df_bot[['text','label','prediction']]

### Thank you for your interest!