In [None]:
# access google drive folder
# NOTE(review): assumes Google Drive is already mounted at /content/drive (Colab) - confirm
%cd /content/drive/MyDrive/Yoga_Classes

# running the setup file containing basic libraries and functions
# NOTE(review): the functions defined below use `pd`, `re` and `plt` without importing
# them in this notebook, so they are presumably provided by this setup notebook - verify
%run 'notebooks/scripts/setup.ipynb'

# import natural language processing toolkit
import nltk
# fetch the NLTK data packages named "stopwords" and "wordnet"
nltk.download("stopwords")
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# import modules to enable text analysis
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS

# import modules to plot distributions
from plotly.offline import plot
import plotly.graph_objects as go
import plotly.express as px

## Functions for text-*preprocessing*


In order to prepare textual data for any further quantitative manipulation it has to be cleaned and transformed properly. The text preprocessing steps in this case are the following:
* Cleaning: keep only alphanumeric characters (no punctuation, question marks, tabs, etc.)
* Lowercase all words
* Remove stop words from the texts. We used the basic NLTK set of English stop words (very common English words) and defined a domain-specific set of words (yoga/fitness-class-related words that add no value to the analysis, no matter how often they appear)
* Lemmatize the words in the text. Lemmatization means reducing words to their base form (so that only semantic differences are taken into account, not morphological ones)


In [None]:
def text_clean(corpus):
    """Lowercase every document and blank out non-alphanumeric characters.

    Each document is split on whitespace; inside every token each character
    outside ``[a-zA-Z0-9]`` is replaced by a space, the token is lowercased and
    the tokens are re-joined with single spaces. Because punctuation becomes a
    space, the result may contain doubled or trailing spaces (later stages that
    call ``str.split()`` are unaffected by this).

    Relies on ``pd`` and ``re`` already being in scope (presumably supplied by
    the setup notebook).

    Args:
        corpus: iterable of strings (e.g. a list or a pandas Series).

    Returns:
        pandas.Series of cleaned strings with dtype ``object``. When ``corpus``
        is a Series its index is preserved; any other iterable gets the default
        0..n-1 index.
    """
    non_alnum = re.compile('[^a-zA-Z0-9]')
    # keep row alignment with the caller's data when a Series is passed in
    source_index = corpus.index if isinstance(corpus, pd.Series) else None
    # collect in a plain list and build the Series once (no concat inside the loop)
    cleaned_rows = [
        ' '.join(non_alnum.sub(' ', word).lower() for word in row.split())
        for row in corpus
    ]
    return pd.Series(cleaned_rows, index=source_index, dtype='object')

In [None]:
# English stop words from the NLTK stopwords corpus, held in a set
stop_words = set(stopwords.words('english'))

def stopwords_removal(corpus, stop_set=None):
    """Drop stop words from every document of the corpus.

    Documents are split on whitespace, tokens found in ``stop_set`` are
    discarded and the remaining tokens are re-joined with single spaces.
    Matching is exact and case-sensitive.

    Args:
        corpus: iterable of strings.
        stop_set: iterable of words to remove. When omitted (``None``) the
            module-level ``stop_words`` set is used.

    Returns:
        list of strings, one per input document, in the input order.
    """
    if stop_set is None:
        # previously `x not in None` raised TypeError when no set was supplied
        stop_set = stop_words
    # hash once so each membership test is O(1) even if a list was passed
    excluded = frozenset(stop_set)
    return [
        ' '.join(token for token in document.split() if token not in excluded)
        for document in corpus
    ]


In [None]:
def lemmatize(corpus):
    """Lemmatize every whitespace-separated token of every document.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with ``pos='v'``
    (i.e. it is treated as a verb), and the results are re-joined with single
    spaces.

    Args:
        corpus: iterable of strings.

    Returns:
        list of strings, one per input document, in the input order.
    """
    wordnet = WordNetLemmatizer()
    lemmatized_docs = []
    for document in corpus:
        lemmas = [wordnet.lemmatize(token, pos = 'v') for token in document.split()]
        lemmatized_docs.append(' '.join(lemmas))
    return lemmatized_docs

In [None]:
def preprocess_text(corpus, cleaning = True, lemmatization = True, remove_stopwords = True, **kwargs):
    """Run the enabled preprocessing stages over the corpus, in a fixed order.

    Stage order is: ``text_clean`` -> ``lemmatize`` -> ``stopwords_removal``.
    A stage runs only when its flag compares equal to ``True``.

    Args:
        corpus: iterable of strings.
        cleaning: run ``text_clean``.
        lemmatization: run ``lemmatize``.
        remove_stopwords: run ``stopwords_removal``.
        **kwargs: forwarded to ``stopwords_removal`` only (e.g. ``stop_set``).

    Returns:
        The output of the last stage that ran (``text_clean`` returns a pandas
        Series, the other two stages return lists), or ``corpus`` itself when
        every stage is disabled.
    """
    stages = (
        (cleaning, lambda docs: text_clean(docs)),
        (lemmatization, lambda docs: lemmatize(docs)),
        (remove_stopwords, lambda docs: stopwords_removal(docs, **kwargs)),
    )

    result = corpus
    for enabled, run_stage in stages:
        # equality (not truthiness) is kept on purpose to match existing behaviour
        if enabled == True:
            result = run_stage(result)

    return result

## Functions for basic text-*exploration*

In [None]:
def get_top_n_words(corpus, n=None):
    """Return the most frequent single words of the corpus.

    Words are counted with ``CountVectorizer(stop_words='english')``, so the
    vectorizer's built-in English stop-word list is applied on top of any
    earlier preprocessing.

    Args:
        corpus: iterable of strings.
        n: number of rows to keep; ``None`` (default) keeps every word.

    Returns:
        pandas.DataFrame with columns ``unigram`` and ``count``, sorted by
        ``count`` in descending order.
    """
    vectorizer = CountVectorizer(stop_words='english')
    # one pass learns the vocabulary and the counts; a separate fit() + transform()
    # tokenizes twice and yields all-zero counts for a one-shot iterator
    bag_of_words = vectorizer.fit_transform(corpus)
    totals = bag_of_words.sum(axis=0)
    words_freq = sorted(
        ((word, totals[0, idx]) for word, idx in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(words_freq[:n], columns = ['unigram' , 'count'])

In [None]:
def get_top_n_bigram(corpus, n=None):
    """Return the most frequent two-word sequences (bigrams) of the corpus.

    Bigrams are counted with
    ``CountVectorizer(ngram_range=(2, 2), stop_words='english')``, so the
    vectorizer's built-in English stop-word list is applied on top of any
    earlier preprocessing.

    Args:
        corpus: iterable of strings.
        n: number of rows to keep; ``None`` (default) keeps every bigram.

    Returns:
        pandas.DataFrame with columns ``bigram`` and ``count``, sorted by
        ``count`` in descending order.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    # one pass learns the vocabulary and the counts; a separate fit() + transform()
    # tokenizes twice and yields all-zero counts for a one-shot iterator
    bag_of_words = vectorizer.fit_transform(corpus)
    totals = bag_of_words.sum(axis=0)
    bigram_freq = sorted(
        ((bigram, totals[0, idx]) for bigram, idx in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(bigram_freq[:n], columns = ['bigram' , 'count'])

In [None]:
def plot_ngrams_distribution(df_ngram, ngram):
  """Show a plotly bar chart of n-gram counts.

  Args:
    df_ngram: DataFrame holding the n-grams in column ``ngram`` and their
      frequencies in column ``count`` (the shape produced by
      ``get_top_n_words`` / ``get_top_n_bigram``).
    ngram: name of the n-gram column, e.g. ``'unigram'`` or ``'bigram'``;
      also used in the chart title.

  Returns:
    None. The figure is displayed with ``fig.show()``.
  """
  # the title reports the number of rows actually plotted rather than a fixed 15
  title = f"Top {len(df_ngram)} {ngram} in the corpus after pre-processing"

  fig = go.Figure([go.Bar(x=df_ngram[ngram], y=df_ngram['count'])])
  fig.update_layout(title=go.layout.Title(text=title))
  fig.show()

In [None]:
def create_word_cloud(text):
  """Generate a word cloud from ``text`` and display it with matplotlib.

  The cloud is built with a fixed 3000x2000 canvas, ``random_state=1``,
  collocations disabled and the ``STOPWORDS`` set imported from ``wordcloud``.

  Args:
    text: a single string containing all words to visualise.

  Returns:
    None. The image is drawn with ``plt.imshow`` and shown immediately.
  """
  cloud_options = {
      'width': 3000,
      'height': 2000,
      'random_state': 1,
      'collocations': False,
      'stopwords': STOPWORDS,
  }
  cloud_image = WordCloud(**cloud_options).generate(text)

  # draw the cloud with the axes turned off, then show the figure
  plt.imshow(cloud_image)
  plt.axis("off")
  plt.show()


In [None]:
def get_style_word_clouds(df, class_styles_list):
  """Print a heading and draw one word cloud per class style.

  Args:
    df: DataFrame with a ``class_style`` column and a
      ``cleaned_text_with_styles`` column of strings.
    class_styles_list: iterable of ``class_style`` values to visualise.

  Returns:
    None. Each cloud is rendered through ``create_word_cloud``.
  """

  def get_style_descriptions_variable(df, class_style):
    # concatenate every cleaned description that belongs to one class style
    texts = df.loc[df['class_style'] == class_style, 'cleaned_text_with_styles']
    return " ".join(texts)

  for style in class_styles_list:
    class_text = get_style_descriptions_variable(df, style)
    if not class_text.strip():
      # WordCloud.generate raises ValueError when the text contains no words
      print(f'No descriptions found for {style} class_style')
      continue
    print(f'Word Cloud for {style} class_style')
    create_word_cloud(class_text)
    # TODO: save figure

In [None]:
def word_count(series, title, xlabel, ylabel, var_name, mean_line=True, median_line=True, bins=20):
  """Plot a histogram of text lengths and print summary statistics.

  Args:
    series: pandas Series of word counts (one value per text).
    title: plot title.
    xlabel: x-axis label.
    ylabel: y-axis label.
    var_name: name of the measured text used in the printed summary
      (e.g. ``'description'``).
    mean_line: draw a dashed vertical line at the rounded mean.
    median_line: draw a solid red vertical line at the rounded median.
    bins: number of histogram bins.

  Returns:
    None. The histogram is drawn on the current matplotlib figure and the
    min / max / mean / median are printed.
  """
  series.hist(bins=bins)

  plt.title(title)
  plt.xlabel(xlabel)
  plt.ylabel(ylabel)

  # mean and median are rounded to whole words for both the printout and the lines
  mean_len = round(series.mean())
  median_len = round(series.median())
  # distinct local names so the builtins max()/min() are not shadowed
  longest = series.max()
  shortest = series.min()

  print(f'The shortest {var_name} has {shortest} words in total')
  print(f'The longest {var_name} has {longest} words in total')
  print(f'The mean {var_name} length is {mean_len} words')
  print(f'The median {var_name} length is {median_len} words')

  if mean_line: #show mean line
    plt.axvline(mean_len, color='k', linestyle='dashed', linewidth=1, label = f'mean: {mean_len}')

  if median_line: #show median line
    plt.axvline(median_len, color='r', linestyle='-', linewidth=1, label = f'median: {median_len}')

  # only the mean/median lines carry labels, so without them a legend would be
  # empty and matplotlib would warn about missing labelled artists
  if mean_line or median_line:
    plt.legend(loc="upper right")