In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt

# string functions
import string

# wordcloud
from wordcloud import WordCloud

# NLTK
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# configurations
# never truncate long text columns when a data frame is displayed
pd.options.display.max_colwidth = None

In [None]:
# stop words
# English stop-word list from NLTK; read by extract_tokens() and used as the
# base of the word-cloud stop words further down.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed
# in the environment - confirm.
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# other preparations
# single Porter stemmer instance, reused by extract_tokens() for every token
porter = PorterStemmer()

In [None]:
# load data
# location of the Excel workbook, relative to the notebook's working directory
DATA_PATH = '../input/students-anxiety-and-depression-dataset/dataset.xlsx'
df = pd.read_excel(DATA_PATH)
df.head()

In [None]:
# data frame overview
# column names, dtypes and non-null counts of the freshly loaded frame
df.info()

In [None]:
# remove rows with missing values
# drop every incomplete row and renumber the index from 0 in one chained step
df = df.dropna().reset_index(drop=True)
df.info()

In [None]:
# plot target
# bar chart of the number of rows per label value
label_counts = df['label'].value_counts()
label_counts.plot.bar()
plt.grid()
plt.show()

# Preprocessing of Text

In [None]:
# cleaning text
def clean_text(i_text):
    """Lower-case a string and remove all punctuation characters from it.

    Parameters
    ----------
    i_text : str
        Text to clean.

    Returns
    -------
    str
        The lower-cased text without any character listed in
        ``string.punctuation``; all other characters (including whitespace)
        are kept as they are.
    """
    # translation table that maps every punctuation character to "delete"
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return i_text.lower().translate(drop_punctuation)

In [None]:
# extract words
def extract_tokens(i_text):
    """Tokenize a text, drop stop words and stem what is left.

    Parameters
    ----------
    i_text : str
        Text to split into tokens.

    Returns
    -------
    list of str
        Porter stems of all tokens that are not contained in the module-level
        ``stop_words`` list, in their original order.
    """
    # NOTE(review): in this notebook the input has already been through
    # clean_text(), which strips apostrophes, so stop words that contain one
    # (e.g. "don't") can no longer match here - confirm whether that is intended.
    stems = []
    for token in word_tokenize(i_text):
        if token in stop_words:
            continue  # stop words are dropped before stemming
        stems.append(porter.stem(token))
    return stems

In [None]:
# clean version of texts
# lower-cased, punctuation-free copy of every raw text
df['text_clean'] = df['text'].map(clean_text)

In [None]:
# tokenize texts
# one list of stemmed, stop-word-free tokens per cleaned text
df['tokens'] = df['text_clean'].map(extract_tokens)

In [None]:
# convert token list into text
# glue each token list back together with single spaces
df['text_tokens'] = df['tokens'].map(' '.join)

In [None]:
# show results so far
# first rows, now including the text_clean, tokens and text_tokens columns
df.head()

# Wordclouds by Label

In [None]:
# combine all texts into one, split by target
# one long space-separated string of the raw texts for each label value
text_0 = " ".join(df.loc[df['label'] == 0, 'text'])
text_1 = " ".join(df.loc[df['label'] == 1, 'text'])

In [None]:
# refine stopwords for wordcloud
# Concatenate into a NEW list instead of appending to stop_words itself:
# the stop words read by extract_tokens() stay untouched, and re-running this
# cell always yields the same list rather than accumulating duplicates.
extra_stop_words_cloud = [
    "i'm", "i'll", "i've", "can't",  # contractions that would dominate the cloud
    'ðÿ', 'â',  # look like encoding residue in the raw texts - TODO confirm
]
stop_words_cloud = stop_words + extra_stop_words_cloud

In [None]:
# wordcloud for label=0
wordcloud = WordCloud(
    stopwords=stop_words_cloud,
    background_color='black',
    width=600,
    height=400,
    max_words=250,
    max_font_size=50,
)
wordcloud.generate(text_0)  # lays out the words of all label-0 texts in place

# render the cloud as an image without axis decorations
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# wordcloud for label=1
wordcloud = WordCloud(
    stopwords=stop_words_cloud,
    background_color='black',
    width=600,
    height=400,
    max_words=250,
    max_font_size=50,
)
wordcloud.generate(text_1)  # lays out the words of all label-1 texts in place

# render the cloud as an image without axis decorations
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

# Vectorization

In [None]:
# run TFIDF analysis
maxfeat = 100  # upper bound on the number of vocabulary terms kept
# Keep the fitted vectorizer under its own name: its learned vocabulary is
# needed to map feature columns back to terms or to transform further texts.
tfidf_vectorizer = TfidfVectorizer(max_features=maxfeat)
# sparse document-term matrix, one row per entry of df.text_tokens
tfidf = tfidf_vectorizer.fit_transform(df.text_tokens)
# dense copy of the same matrix
tfidf_matrix = tfidf.toarray()

In [None]:
# wrap the TF-IDF matrix in a data frame with generic column names f0, f1, ...
# max_features is only an upper bound, so take the real number of columns from
# the matrix; a vocabulary smaller than maxfeat would otherwise raise a shape
# mismatch here.
n_features = tfidf_matrix.shape[1]
column_names = ['f'+str(i) for i in range(n_features)]
# reuse df's index so the rows line up when the two frames are concatenated
tfidf_matrix_df = pd.DataFrame(tfidf_matrix, columns=column_names, index=df.index)

In [None]:
# add vectorization results to data frame
# Drop feature columns left over from an earlier run of this cell first, so
# re-running it replaces them instead of appending duplicate f* columns.
df = pd.concat(
    [df.drop(columns=tfidf_matrix_df.columns, errors='ignore'), tfidf_matrix_df],
    axis=1,
)
df.head()

### work in progress...