In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries in ../input so the available data files are visible
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

**Importing the Datasets**

On carefully examining the given data, the only useful column is 'Description'. This column gives a brief insight (written by the author) into his/her life, personal interests and hobbies, which is what we use to predict whether the person is in any way related to healthcare. Hence, this column was translated from various languages (such as Portuguese, Spanish, Dutch, Italian and Arabic) to English. ***You can find the code that was used for the translation [here](http://hello.com).***


In [None]:
# Read the pre-processed training split from the Kaggle input directory
# and preview its first rows.
author_data = pd.read_csv(filepath_or_buffer='../input/preprocessed_train.csv')
author_data.head(n=5)


Since we are now using the pre-processed version of the original data-set, attributes such as ***Name, Location, Twitter Handle*** have already been removed, as they are irrelevant in deciding whether the social-media account is that of a health-care professional.

In [None]:
# Read the pre-processed test split the same way; it is used for the
# final predictions at the end of the notebook.
author_data_test = pd.read_csv(filepath_or_buffer='../input/preprocessed_test.csv')
author_data_test.head(n=5)

**Checking the Distribution of the Training Set**

We check the distribution, i.e., the number of data-points which are manually classified by the client as ***HEALTH-CARE PROFESSIONALS***. A balanced distribution will make it easier for the models to train.

In [None]:
# Bar chart of how many training rows fall into each HCP_flag value.
# countplot returns the Axes it drew on, so labels are set on it directly.
ax = sns.countplot(x='HCP_flag', data=author_data)
ax.set_xticklabels(('No', 'Yes'))
ax.set_xlabel('Health-Care Professional')
ax.set_ylabel('Count')
ax.set_title('Distribution of Data (Training Set)')
plt.show()

As expected, the number of data-points belonging to health-care professionals is far smaller than the number of ordinary profiles in the social media data-set. However, this is realistic, and we need to proceed accordingly.

In [None]:
# Reduce both frames to the text feature and the label column.
# The test frame gets the same two columns so it can be pushed through the
# same steps later on.
author_data, author_data_test = (
    frame.loc[:, ['Description', 'HCP_flag']]
    for frame in (author_data, author_data_test)
)
author_data.head(n=10)

In [None]:
# dtype pandas inferred for the free-text column
author_data.dtypes['Description']

In [None]:
# Remove the rows whose description is missing (NaN): the word-cloud cells
# build their text with str.join over this column, and str.join cannot
# handle the float NaN values.
# .copy() makes the filtered frame independent of the original, so the later
# column assignments on author_data do not raise SettingWithCopyWarning.
author_data = author_data[author_data['Description'].notna()].copy()
author_data.head()

# TODO: why?

**Visualizing the Author Description**

By using the concept of WordCloud analysis, we will be taking a look at the set of words that were used to describe a health-care professional and the set, which does not describe a health-care professional. 

In [None]:
from wordcloud import WordCloud, STOPWORDS , ImageColorGenerator

# Split the training profiles by label so that each class gets its own cloud.
is_hcp_data = author_data[author_data['HCP_flag'] == 1]
is_not_hcp_data = author_data[author_data['HCP_flag'] == 0]

# (panel title, descriptions to render) for the three stacked panels.
# All three panels show description words, so all three titles say "Words".
panels = [
    ('All Words', author_data['Description']),
    ('Words under Is HCP Class', is_hcp_data['Description']),
    ('Words under Is Not HCP Class', is_not_hcp_data['Description']),
]

# creating the sub-plots (one row per panel)
fig, ax = plt.subplots(len(panels), 1, figsize=(30, 30))

for panel_ax, (panel_title, descriptions) in zip(ax, panels):
    # Join every description of the subset into one string and build the
    # cloud from it; the WordCloud settings are the same for every panel.
    cloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(' '.join(descriptions))
    panel_ax.imshow(cloud, interpolation='bilinear')
    panel_ax.set_title(panel_title, fontsize=30)
    panel_ax.axis('off')
plt.show()

Some useful insights we get from this:-
* We can see that words like **'cancer'**, **'medicine'**, **'physician'**, **'oncology'** and **'surgeon'** are highlighted as **highly frequent** in the profiles of social media handles that are associated with health-care. This seems practical. 
***A doctor is sensible enough to have a meaningful description.***
* A few high-frequency tokens such as **'health'**, **'tweet'** and **'cancer'** are frequently used in both classes. 
***Why would someone have 'cancer' in their description?*** 
    We can conclude this in several ways:-
        - Maybe he/she was a cancer survivor (like Yuvraj, Hugh Jackman)
        - Maybe that is his/her zodiac sign (Psst! Nobody cares.)
* Removing these words along with the stop words would not impact the performance. 

**Pre-processing the data**

Even though we are planning to use a better vectorizer than the usual CountVectorizer class of the scikit-learn library, we need to further remove the noise from the data, so that our prediction is more accurate. We may proceed as follows:-  
**(A)** Removing punctuations
**(B)** Converting to Lower-Case

***As you can see, we have already transformed the data-set, during language translation. But, what else?***

<img src="https://yoast.com/app/uploads/2015/12/Stopwords_in_your_focus_keywords_FI.png">

![Stop words](https://yoast.com/app/uploads/2015/12/Stopwords_in_your_focus_keywords_FI.png)

**STOP WORDS!** 
The definition of what’s a stop word may vary. You may consider a stop word a word that has high frequency on a corpus. Or you can consider every word that’s empty of true meaning given a context.
Words such as articles and some verbs are usually considered stop words because they don’t help us to find the context or the true meaning of a sentence. These are words that can be removed without any negative consequences to the final model that you are training.

In [None]:
# trying to see what stop-words can be removed from our data-set
from nltk.corpus import stopwords

# NLTK's English stop-word list, held as a set for membership checks
stop_words = {*stopwords.words('english')}
print(stop_words)

So, we just get rid of them?
We will be coming back to this step, during the Vectorization phase of the text-data. For now, let's move forward!

In [None]:
# building the raw token list from all descriptions (stop-words are still included here)
import re

import nltk
from nltk.tokenize import word_tokenize

# Pool the train and test descriptions; missing ones become the placeholder 'X'.
merged_desc = pd.concat([author_data['Description'], author_data_test['Description']], axis=0).fillna('X')

# Collapse the corpus into one string, blank out every non-letter character,
# then split it into word tokens.
reviews = re.compile('[^a-zA-Z]').sub(' ', merged_desc.str.cat(sep=' '))
tokens = word_tokenize(reviews)

# number of distinct tokens in the pooled corpus
vocabulary = set(tokens)
print('Total list of tokens: ', len(vocabulary))


In [None]:
# after removal of stop-words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Lower-case every token before filtering. The stop-word comparison is
# case-sensitive, so capitalised stop-words (e.g. "The") would otherwise slip
# through, and the descriptions are lower-cased before being matched against
# `tokens`, so a token that keeps its capitals could never match.
# Words of length <= 2 are dropped as well.
tokens = [w for w in (t.lower() for t in tokens) if w not in stop_words and len(w) > 2]

# `tokens` holds one entry per occurrence in the corpus, so report its size
# and a short sample instead of printing the whole list.
print('Tokens kept: {} ({} distinct)'.format(len(tokens), len(set(tokens))))
print(tokens[:50])


As you can see, we have tried to refine the set of words which we are going to use ***by doing two things***:-
* Removing the stop-words (this can also be done using **TfidfVectorizer**)
* Removing all the words having a length of **2 characters or fewer** (these are not relevant, and removing them will help us to further optimize our models)

**Refining the 'Description' column further for training sets**
We include only those words in the 'Profile Description' that are present in the **tokens** set, recently constructed. Hence, we need not use the CountVectorizer method, and ***can further continue this rather adventurous journey!***

In [None]:
def filter_description(text, allowed_tokens):
    """Return ``text`` reduced to the words that appear in ``allowed_tokens``.

    Parameters
    ----------
    text : object
        One description value. It is converted with ``str()`` first, so a
        missing value (NaN) is processed as the string ``'nan'``.
    allowed_tokens : set of str
        Words to keep; every other whitespace-separated word is dropped.

    Returns
    -------
    str
        The kept words, in their original order, joined by single spaces.
    """
    return ' '.join(w for w in str(text).split() if w in allowed_tokens)


# Build the lookup once as a set: `tokens` is a list with one entry per
# occurrence, so testing `w in tokens` for every word of every description
# would rescan that whole list each time.
# 'tweet' and 'tweets' are excluded here instead of being tested per word.
allowed_tokens = set(tokens) - {'tweet', 'tweets'}

# Lower-case the descriptions, then keep only the allowed words (train and test alike).
author_data['Description'] = author_data['Description'].str.lower().apply(filter_description, args=(allowed_tokens,))
author_data_test['Description'] = author_data_test['Description'].str.lower().apply(filter_description, args=(allowed_tokens,))


In [None]:
# first ten rows of the test frame after the description filtering
author_data_test.head(n=10)

Further visualizing our progress with the word-clouds. Let's see if there is any difference. ***Basically, reduction in noise.***

In [None]:
from wordcloud import WordCloud, STOPWORDS , ImageColorGenerator

# Split the (now filtered) training profiles by label again.
is_hcp_data = author_data[author_data['HCP_flag'] == 1]
is_not_hcp_data = author_data[author_data['HCP_flag'] == 0]

# (panel title, descriptions to render) for the three stacked panels.
# All three panels show description words, so all three titles say "Words".
panels = [
    ('All Words', author_data['Description']),
    ('Words under Is HCP Class', is_hcp_data['Description']),
    ('Words under Is Not HCP Class', is_not_hcp_data['Description']),
]

# creating the sub-plots (one row per panel)
fig, ax = plt.subplots(len(panels), 1, figsize=(30, 30))

for panel_ax, (panel_title, descriptions) in zip(ax, panels):
    # Join every description of the subset into one string and build the
    # cloud from it; black background for this second set of clouds.
    cloud = WordCloud(max_font_size=50, max_words=100, background_color='black').generate(' '.join(descriptions))
    panel_ax.imshow(cloud, interpolation='bilinear')
    panel_ax.set_title(panel_title, fontsize=30)
    panel_ax.axis('off')
plt.show()

***Well! Is that it?***

**Stemming**

In natural language processing, there may come a time when we want the model to recognize that the words “ask” and “asked” are just different tenses of the same verb. This is the idea of reducing different forms of a word to a core root. Words that are derived from one another can be mapped to a central word or symbol, especially if they have the same core meaning.
Maybe this is in an information retrieval setting and you want to boost the algorithm’s recall. Or perhaps you are trying to analyze word usage in a corpus and wish to condense related words so that you don’t have as much variability. Either way, this technique of text normalization may be useful to you.



<img src="https://miro.medium.com/max/660/0*o8l4UfdWOL2KTljk.jpg">

With stemming, words are reduced to their word stems. A word stem need not be the same root as a dictionary-based morphological root, it just is an equal to or smaller form of the word.

***We won't be using this approach, even though we have the description data in English, as we need to preserve the authenticity of the technical (medical) terms!***

**Making the Vectorizer**

We will further create the TfidfVectorizer to generate the sparse-matrix of features that will be taken as input, to the model. There are several reasons for using this, as the Vectorizer:-

**1. Tf** stands for Term Frequency. Let's try to understand, what that means! **It is the ratio of number of times the word appears in a document compared to the total number of words in that document. It increases as the number of occurrences of that word within the document increases. Each document has its own tf.**

**2. idf** stands for Inverse Document Frequency. **It is used to calculate the weight of rare words across all documents in the corpus.**

Hence, unlike CountVectorizer class this model will try to give some importance to the rare-scientific terms like **Neuro-surgeon**, **Pharmacy**, etc. There is a weightage that is involved. It is a measure used to evaluate how important a word is, to a document, in a collection of documents.

In [None]:
from sklearn.model_selection import train_test_split

# getting the features (description text) and the labels (HCP flag)
X, y = author_data['Description'], author_data['HCP_flag']

# hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [None]:
# making the vectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

vectorizer = TfidfVectorizer()

# the vectorizer is fitted on the training split only; the held-out split is
# merely transformed with the fitted vectorizer
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

# (rows, features) of both sparse matrices
print(*(matrix.shape for matrix in (train_vectors, test_vectors)))

Hence, we see that our sparse-matrix contains 11922 different words (as features) which will be used to make the prediction models.

In [None]:
# show the words that were used as feature columns
# vocabulary_ maps every learned term to its column index; printing the whole
# mapping floods the cell output, so report its size and a sorted sample.
vocabulary_terms = sorted(vectorizer.vocabulary_)
print('Vocabulary size:', len(vocabulary_terms))
print(vocabulary_terms[:50])

**Creating and Evaluating Models**

Now that we have created the set of vectors, we try and predict the results, with different models. 

In [None]:
from sklearn import metrics
def plot_roc_curve(y_test, y_prob, model_name):
    """Plot the ROC curve of a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_prob : array-like
        Scores passed to ``roc_curve`` as ``y_score``.
    model_name : str
        Name inserted into the figure title.

    The curve is drawn in red with its AUC in the legend, together with a
    dashed diagonal from (0, 0) to (1, 1). Nothing is returned; the figure is
    shown with ``plt.show()``.
    """
    from sklearn.metrics import auc, roc_curve

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    area = auc(fpr, tpr)

    _, roc_ax = plt.subplots(figsize=(8, 8))
    roc_ax.set_title('Receiver Operating Characteristic {} Model'.format(model_name))
    roc_ax.plot(fpr, tpr, color='red', label='AUC = %0.2f' % area)
    # legend is created before the diagonal is drawn, so it lists the curve only
    roc_ax.legend(loc='lower right')
    roc_ax.plot([0, 1], [0, 1], linestyle='--')
    roc_ax.axis('tight')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_xlabel('False Positive Rate')
    plt.show()
    

In [None]:
def evaluate_model(y_test, y_pred, y_score, model_name):
    """Print the standard evaluation of a binary classifier and plot its ROC curve.

    Prints, in this order, the confusion matrix, the classification report
    (6 digits) and the area under the ROC curve, then calls
    ``plot_roc_curve``.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation samples.
    y_pred : array-like
        Hard class predictions for the same samples (used for the confusion
        matrix and the classification report).
    y_score : array-like
        Scores for the same samples (used for the ROC AUC and the ROC curve).
        Flattened to 1-D first, so both ``(n,)`` and ``(n, 1)`` inputs work.
    model_name : str
        Name inserted into the printed headings and the plot title.
    """
    # Callers in this notebook pass a column slice of predict_proba with
    # shape (n, 1); flatten it once so the metric and the plot get 1-D scores.
    y_score = np.ravel(y_score)

    cm = metrics.confusion_matrix(y_test, y_pred)
    print('Confusion Matrix for {} Model'.format(model_name))
    print(cm)
    print('Classification Report for {} Model'.format(model_name))
    print(metrics.classification_report(y_test, y_pred, digits=6))
    # heading previously read "Area under under ROC curve"
    print('Area under ROC curve for {} Model'.format(model_name))
    print(metrics.roc_auc_score(y_test, y_score))
    plot_roc_curve(y_test, y_score, model_name)

A MultinomialNB Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# fit a Multinomial Naive-Bayes model on the TF-IDF training matrix
model = MultinomialNB()
model = model.fit(train_vectors, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# hard class predictions of the current model on the held-out split;
# the score printed here is computed from these labels, not from probabilities
predicted = model.predict(test_vectors)
print(roc_auc_score(y_true=y_test, y_score=predicted))


In [None]:
# class probabilities on the held-out split; column 1 is kept as an (n, 1) slice
predicted_prob = model.predict_proba(test_vectors)
evaluate_model(
    y_test=y_test,
    y_pred=predicted,
    y_score=predicted_prob[:, 1:2],
    model_name='Multinomial Naive-Bayes',
)

A Random Forest Classifier

In [None]:
# building a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# 10 entropy-criterion trees with a fixed seed; this rebinds `model`
model = RandomForestClassifier(
    random_state=100,
    criterion='entropy',
    n_estimators=10,
)

model.fit(train_vectors, y_train)

In [None]:
# evaluating the random forest classifier
from sklearn.metrics import roc_auc_score

# hard class predictions on the held-out split; the score printed here is
# computed from these labels, not from probabilities
predicted = model.predict(test_vectors)
print(roc_auc_score(y_true=y_test, y_score=predicted))

In [None]:
# evaluation of Random Forests Classifier
# class probabilities on the held-out split; column 1 is kept as an (n, 1) slice
predicted_prob = model.predict_proba(test_vectors)
evaluate_model(
    y_test=y_test,
    y_pred=predicted,
    y_score=predicted_prob[:, 1:2],
    model_name='Random Forests Classifier',
)

An XGBoost Classifier

In [None]:
# making an XGBoost classifier with hyperparameter tuning
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, GridSearchCV  # additional scikit-learn functions

# base estimator: every setting except the two searched ones is fixed here
model = XGBClassifier(
    learning_rate=0.01,
    min_child_weight=3,
    gamma=0.3,
    subsample=0.6,
    colsample_bytree=1.0,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
    n_jobs=4,
)

# search space: a single tree count and four candidate depths
n_estimators = [300]
max_depth = [2, 4, 6, 8]
params = {'n_estimators': n_estimators, 'max_depth': max_depth}

# 3-fold cross-validated grid search scored by ROC AUC
grid = GridSearchCV(estimator=model, param_grid=params, scoring='roc_auc', n_jobs=10, cv=3)
grid_result = grid.fit(train_vectors, y_train)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# making the final model
from xgboost.sklearn import XGBClassifier

# max_depth and n_estimators are hard-coded here rather than read from
# grid_result.best_params_.
# learning_rate=0.01 is passed explicitly so the final model is trained under
# the same configuration the grid search evaluated; without it the estimator
# falls back to the library default learning rate.
tuned_xg = XGBClassifier(learning_rate=0.01, max_depth=8, n_estimators=300,
                    min_child_weight=3, gamma=0.3, subsample=0.6, colsample_bytree=1.0,
                    objective='binary:logistic', eval_metric='auc', nthread=4, scale_pos_weight=1, seed=27, n_jobs=4)
tuned_xg.fit(train_vectors, y_train)


In [None]:
# evaluation of the tuned XGBoost classifier
# Recompute the hard predictions with tuned_xg. Without this line `predicted`
# still holds the Random Forest's labels from the earlier cell, so the
# confusion matrix and classification report would describe the wrong model.
predicted = tuned_xg.predict(test_vectors)
predicted_prob = tuned_xg.predict_proba(test_vectors)
evaluate_model(y_test, predicted, predicted_prob[:, [1]], 'XGBoost Classifier')

**Evaluation of the Final Model**

Looking at the confusion matrix, we can say, that the data is carefully normalized. This is because of the default working of the Ensemble learning algorithms.

In [None]:
# predicting the final results for testing part
# Missing descriptions are replaced by the placeholder 'X' before vectorizing.
X_valid = author_data_test['Description'].fillna('X')
valid_vectors = vectorizer.transform(X_valid)

# Column 1 of predict_proba is stored as the predicted HCP_flag
# (presumably the probability of HCP_flag == 1, given 0/1 labels).
author_data_test['HCP_flag'] = tuned_xg.predict_proba(valid_vectors)[:, 1]

# Output file name corrected from the misspelled 'submisstion.csv'.
author_data_test.to_csv('submission.csv', index=False)