In [None]:
# Uncomment the lines below to install the required packages
#! pip install nltk==3.5
#! pip install pandas==2.0.3

In [None]:
import os

# Starting point: the directory the kernel is currently running in.
notebook_dir = os.getcwd()

# Resolve '../..' against the starting directory, then take the parent of the
# result.
# NOTE(review): abspath normalises the path (no trailing separator), so the
# extra dirname strips one more component -- the net effect is THREE levels
# above the starting directory, not two. Confirm this matches the repo layout.
root_dir = os.path.dirname(
    os.path.abspath(os.path.join(notebook_dir, '..', '..'))
)

# Make the resolved directory the working directory so the relative data
# paths used below resolve against it.
# NOTE(review): the starting point is the *current* working directory, so
# re-running this cell without restarting the kernel climbs further up.
os.chdir(root_dir)

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.util import ngrams

In [None]:
# Relative path of the questions CSV; it is resolved against the working
# directory set by the os.chdir() call earlier in the notebook.
# NOTE(review): despite its name, data_dir holds a file path, not a directory.
data_dir = 'data/question_answer/questions.csv'
# Load the whole CSV into a DataFrame.
df_all = pd.read_csv(data_dir)

In [None]:
# Preview the first rows to sanity-check the load.
df_all.head()

In [None]:
def detect_ngarms_beginning(sentence, n):
    """Return the n-grams of ``sentence`` that start with a capitalised token.

    The input is converted with ``str()`` and tokenised with
    ``nltk.word_tokenize``. Every n-gram of length ``n`` over those tokens is
    kept when the first character of its first token is uppercase.

    Note: the function name keeps its original spelling ("ngarms") because
    other cells call it by that name.

    Args:
        sentence: Text to analyse; any object is accepted since it is passed
            through ``str()`` first.
        n: Number of tokens per n-gram.

    Returns:
        list: The matching n-grams, in the order ``nltk.ngrams`` yields them.
    """
    words = nltk.word_tokenize(str(sentence))

    # Filter while iterating; no intermediate list of all n-grams is needed.
    return [
        candidate
        for candidate in nltk.ngrams(words, n)
        if candidate[0][0].isupper()
    ]

In [None]:
# Select the 'Question' column of df_all.
questions = df_all['Question']

In [None]:
# Select the 'Answer' column of df_all.
answers = df_all['Answer']

## Common n-grams in Questions and Answers

### Get n-grams

In [None]:
ngrams_lengths = [3, 4, 5, 6, 7]

# One bucket per n-gram length, in the same order as ngrams_lengths. Each
# bucket receives one list of n-grams per (answer, question) pair.
a_ngrams = [[] for _ in ngrams_lengths]
q_ngrams = [[] for _ in ngrams_lengths]

# Convenience aliases: these names are bound to the very same list objects as
# the buckets above, so they fill up as the loop below appends to the buckets.
a_3grams, a_4grams, a_5grams, a_6grams, a_7grams = a_ngrams
q_3grams, q_4grams, q_5grams, q_6grams, q_7grams = q_ngrams

# Fill one bucket (one n-gram length) at a time.
for i, n in enumerate(ngrams_lengths):
    for a, q in zip(answers, questions):
        a_ngrams[i].append(detect_ngarms_beginning(str(a), n))
        q_ngrams[i].append(detect_ngarms_beginning(q, n))


### Questions

In [None]:
# NOTE(review): this rebinds ngrams_lengths -- which holds the n-gram lengths
# in the collection cell -- to the bucket positions inside q_ngrams.
ngrams_lengths = [0, 1, 2, 3, 4]  # Positions of the buckets in q_ngrams
ngrams_range = range(3, 8)  # n-gram length paired with each position
top_n = 12

for i, n in zip(ngrams_lengths, ngrams_range):
    # Flatten the per-question lists in bucket i into one list for counting.
    single_list = []
    for sublist in q_ngrams[i]:
        single_list.extend(sublist)

    freq_dist = nltk.FreqDist(single_list)
    top_ngrams = freq_dist.most_common(top_n)

    # Emit the heading, one "<ngram>: <count>" line per entry, then a blank
    # separator line.
    report = [f"Top {top_n} {n}-grams:"]
    report.extend(f"{ngram}: {frequency}" for ngram, frequency in top_ngrams)
    report.append("")
    print("\n".join(report))


### Answers

In [None]:
# NOTE(review): this rebinds ngrams_lengths -- which holds the n-gram lengths
# in the collection cell -- to the bucket positions inside a_ngrams.
ngrams_lengths = [0, 1, 2, 3, 4]  # Positions of the buckets in a_ngrams
ngrams_range = range(3, 8)  # n-gram length paired with each position
top_n = 12

for i, n in zip(ngrams_lengths, ngrams_range):
    # Flatten the per-answer lists in bucket i into one list for counting.
    single_list = []
    for sublist in a_ngrams[i]:
        single_list.extend(sublist)

    freq_dist = nltk.FreqDist(single_list)
    top_ngrams = freq_dist.most_common(top_n)

    # Emit the heading, one "<ngram>: <count>" line per entry, then a blank
    # separator line.
    report = [f"Top {top_n} {n}-grams:"]
    report.extend(f"{ngram}: {frequency}" for ngram, frequency in top_ngrams)
    report.append("")
    print("\n".join(report))
