In [1]:
import pandas as pd
import numpy as np
import nltk

## Quick overview of the basic properties of the dataframes (train_context and _question)

In [2]:
df = pd.read_pickle('qt_train_context_long.pickle')
df_labels = pd.read_pickle("qt_train_questions.pickle")

In [3]:
df.head()

Unnamed: 0,uid,score,context
0,s3q8053,5.811907,Backgammon FAQ : Different Ways of Playing Bac...
1,s3q8053,5.488704,Backgammon Rules - How to Play Backgammon Navi...
2,s3q8053,5.472953,Backgammon Rules Backgammon Backgammon Home Ba...
3,s3q8053,5.097101,Backgammon FAQ Home Backgammon Articles Backga...
4,s3q8053,4.011511,The Rules of How to Play Backgammon Back to 1o...


In [4]:
df_labels.head()

Unnamed: 0,answer,question,uid,tags
0,24,How many points does a backgammon board have,s3q8053,"[1tok, yes-answer-long, yes-answer-short]"
1,sherlock holmes,Whose cases were Empty House Copper Beeches Bl...,s3q33199,"[yes-answer-long, yes-answer-short]"
2,sam torrance,Which Scottish Golfer Was Captain Of Europes 2...,s3q33198,[]
3,first quarter,What is a two-bit moon,s3q33194,"[yes-answer-long, yes-answer-short]"
4,nissan,The `` Maxima '' was a model of which car,s3q33197,"[1tok, yes-answer-long, yes-answer-short]"


In [11]:
df.shape
df_labels.shape

(37012, 4)

## Merge dataframes and drop the non-text columns to keep only the context, question, and answer

In [5]:
# Merge dataframes with respect to the unique UIDs
condition = (df_labels[df_labels.uid.isin(df.uid)])
df = df.merge(condition)

df.head()

Unnamed: 0,uid,score,context,answer,question,tags
0,s3q8053,5.811907,Backgammon FAQ : Different Ways of Playing Bac...,24,How many points does a backgammon board have,"[1tok, yes-answer-long, yes-answer-short]"
1,s3q8053,5.488704,Backgammon Rules - How to Play Backgammon Navi...,24,How many points does a backgammon board have,"[1tok, yes-answer-long, yes-answer-short]"
2,s3q8053,5.472953,Backgammon Rules Backgammon Backgammon Home Ba...,24,How many points does a backgammon board have,"[1tok, yes-answer-long, yes-answer-short]"
3,s3q8053,5.097101,Backgammon FAQ Home Backgammon Articles Backga...,24,How many points does a backgammon board have,"[1tok, yes-answer-long, yes-answer-short]"
4,s3q8053,4.011511,The Rules of How to Play Backgammon Back to 1o...,24,How many points does a backgammon board have,"[1tok, yes-answer-long, yes-answer-short]"


In [6]:
df.drop(columns=['uid','score','tags'], inplace=True) #drop non-text columns

df.head()

Unnamed: 0,context,answer,question
0,Backgammon FAQ : Different Ways of Playing Bac...,24,How many points does a backgammon board have
1,Backgammon Rules - How to Play Backgammon Navi...,24,How many points does a backgammon board have
2,Backgammon Rules Backgammon Backgammon Home Ba...,24,How many points does a backgammon board have
3,Backgammon FAQ Home Backgammon Articles Backga...,24,How many points does a backgammon board have
4,The Rules of How to Play Backgammon Back to 1o...,24,How many points does a backgammon board have


## Remove the stopwords with NLTK

In [7]:
from nltk.corpus import stopwords

print(stopwords.words('english')) #view the stopwords to be removed

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
# Stopwords removal with NLTK
stop = stopwords.words('english')
columns = ['context', 'answer', 'question']

for col in columns:
    df[col] = df[col].apply(lambda x: ' '.join([word
       for word in x.split() if word not in (stop)
    ]))

df.head()

Unnamed: 0,context,answer,question
0,Backgammon FAQ : Different Ways Playing Backga...,24,How many points backgammon board
1,Backgammon Rules - How Play Backgammon Navigat...,24,How many points backgammon board
2,Backgammon Rules Backgammon Backgammon Home Ba...,24,How many points backgammon board
3,Backgammon FAQ Home Backgammon Articles Backga...,24,How many points backgammon board
4,The Rules How Play Backgammon Back 1on1backgam...,24,How many points backgammon board


Running the stopwords removal took my computer 10 min. I really wonder if we can eventually get better devices for this large dataset.

In [9]:
# Check the text content after stopwords removal
print(df.iloc[0][0])
print(df.iloc[0][1])
print(df.iloc[0][2])

Backgammon FAQ : Different Ways Playing Backgammon FAQ Different Ways Playing Tables Backgammon What tables ? How backgammon different games tables ? Does backgammon official rules ? Backgammon Variants What Nackgammon ? What hyper-backgammon ? What long-gammon ? What roll-over ? What backgammon-to-lose ? Acey-Deucey What acey-deucey ? How play American acey-deucey ? How play European acey-deucey ? Greek Backgammon What tavli ? How play portes ? How play plakoto ? How play fevga ? Other Games What trictrac ? What Russian backgammon ? What French backgammon ? What Dutch backgammon ? What snake ? Forms Competition What money play ? What match play ? What freeze-out match ? What duplicate backgammon ? Table Stakes What table stakes betting ? Why table stakes used ? How strategy table stakes differ unlimited money play ? Chouette What chouette ? What multiple-cube chouette ? When consulting allowed ? What extras ? Tables Backgammon Q : What tables ? Tables general term game played backgamm

## Perform lemmatization with NLTK

I chose lemmatization over stemming since the former is considered more powerful in terms of "chopping" words with respect to the context which better avoids misspelling and incorrect meaning. 

In [10]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define the lemmatization function
def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)
# Lemmatization with NLTK
for col in columns:
    df[col] = df[col].apply(lemmatize_words)

df.head()

Unnamed: 0,context,answer,question
0,Backgammon FAQ : Different Ways Playing Backga...,24,How many point backgammon board
1,Backgammon Rules - How Play Backgammon Navigat...,24,How many point backgammon board
2,Backgammon Rules Backgammon Backgammon Home Ba...,24,How many point backgammon board
3,Backgammon FAQ Home Backgammon Articles Backga...,24,How many point backgammon board
4,The Rules How Play Backgammon Back 1on1backgam...,24,How many point backgammon board


EVEN SLOWER. Lemmatization took like forever. Is there any quicker method that does the same thing or any changes I could make to improve it? 

In [12]:
# Check the lemmatized text content
print(df.iloc[0][0])
print(df.iloc[0][1])
print(df.iloc[0][2])

Backgammon FAQ : Different Ways Playing Backgammon FAQ Different Ways Playing Tables Backgammon What table ? How backgammon different game table ? Does backgammon official rule ? Backgammon Variants What Nackgammon ? What hyper-backgammon ? What long-gammon ? What roll-over ? What backgammon-to-lose ? Acey-Deucey What acey-deucey ? How play American acey-deucey ? How play European acey-deucey ? Greek Backgammon What tavli ? How play port ? How play plakoto ? How play fevga ? Other Games What trictrac ? What Russian backgammon ? What French backgammon ? What Dutch backgammon ? What snake ? Forms Competition What money play ? What match play ? What freeze-out match ? What duplicate backgammon ? Table Stakes What table stake bet ? Why table stake use ? How strategy table stake differ unlimited money play ? Chouette What chouette ? What multiple-cube chouette ? When consult allow ? What extras ? Tables Backgammon Q : What table ? Tables general term game play backgammon board . Actually , 

So far everything looks good! But since my approach and Rachel's appear very similar (we share the same first two steps, though I understand they are the essential starters in lots of NLP preprocessing paths), should we find a way that adds to the variety of our preprocessing approaches?

In [None]:
# Code for stemming that should not be implemented; saved here just for reference

#from nltk.stem.snowball import SnowballStemmer
#stemmer = SnowballStemmer('english')

#for col in columns:
    #df[col] = df[col].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

#df.head()

## Remove punctuations (not sure if needed)

I think this cleans up and removes more unrelevant pieces from the data but not sure if it's necessary. 

In [13]:
import string #library that contains punctuation
string.punctuation #view the punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
# Define the function to remove punctuation
def remove_punctuation(text):
    if(type(text)==float):
        return text
    ans=""  
    for i in text:     
        if i not in string.punctuation:
            ans+=i    
    return ans

# Remove punctuations
for col in columns:
    df[col]= df[col].apply(lambda x:remove_punctuation(x))

df.head()

Unnamed: 0,context,answer,question
0,Backgammon FAQ Different Ways Playing Backgam...,24,How many point backgammon board
1,Backgammon Rules How Play Backgammon Navigati...,24,How many point backgammon board
2,Backgammon Rules Backgammon Backgammon Home Ba...,24,How many point backgammon board
3,Backgammon FAQ Home Backgammon Articles Backga...,24,How many point backgammon board
4,The Rules How Play Backgammon Back 1on1backgam...,24,How many point backgammon board


In [15]:
# Check the punctuation-free text content
print(df.iloc[0][0])
print(df.iloc[0][1])
print(df.iloc[0][2])

Backgammon FAQ  Different Ways Playing Backgammon FAQ Different Ways Playing Tables Backgammon What table  How backgammon different game table  Does backgammon official rule  Backgammon Variants What Nackgammon  What hyperbackgammon  What longgammon  What rollover  What backgammontolose  AceyDeucey What aceydeucey  How play American aceydeucey  How play European aceydeucey  Greek Backgammon What tavli  How play port  How play plakoto  How play fevga  Other Games What trictrac  What Russian backgammon  What French backgammon  What Dutch backgammon  What snake  Forms Competition What money play  What match play  What freezeout match  What duplicate backgammon  Table Stakes What table stake bet  Why table stake use  How strategy table stake differ unlimited money play  Chouette What chouette  What multiplecube chouette  When consult allow  What extras  Tables Backgammon Q  What table  Tables general term game play backgammon board  Actually  bite presumptuous us even call backgammon board

## Vectorize dataframe with n-gram based analysis using scikit-learn CountVectorizer

A bit of overview/review: n-grams are continuous sequences of words/symbols/tokens in a document. They can be defined as the neighbouring sequences of items. In the N-Gram method, a document term matrix is generated and each cell represents the count. The count represents the combination of adjacent words of length n in the title. In n-gram ranking, we simply rank the n-grams according to how many times they appear in a body of text. 

CountVectorizer: a scikit-learn package that uses count vectorization to convert a collection of text documents to a matrix of token counts. Given a corpus of text documents, such as web pages or product descriptions, CountVectorizer can return a matrix outlining the number of occurrences of each word or phrase to help you identify common text patterns in the documents.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Define a function to get n-grams using CountVectorizer
def get_ngrams(text, ngram_from=2, ngram_to=2, n=None, max_features=20000):
    
    vec = CountVectorizer(ngram_range = (ngram_from, ngram_to), 
                          max_features = max_features, 
                          stop_words='english').fit(text)
    bag_of_words = vec.transform(text)
    sum_words = bag_of_words.sum(axis = 0) 
    words_freq = [(word, sum_words[0, i]) for word, i in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
   
    return words_freq[:n]

### Experiment with different hyperparameter values

In [17]:
# 3-grams
trigrams = get_ngrams(df['question'], ngram_from=3, ngram_to=3, n=15)
df_trigrams = pd.DataFrame(trigrams)
df_trigrams.columns=["trigram", "frequency"]

df_trigrams.head()

Unnamed: 0,trigram,frequency
0,basic unit currency,3160
1,released 70 album,1040
2,70 album entitled,1040
3,eurovision song contest,560
4,greek equivalent roman,540


In [19]:
# 5-grams
quadgrams = get_ngrams(df['question'], ngram_from=5, ngram_to=5, n=15)
df_quadgrams = pd.DataFrame(quadgrams)
df_quadgrams.columns=["quadgram", "frequency"]

df_quadgrams.head()

Unnamed: 0,quadgram,frequency
0,state include telephone area code,500
1,international radio code word letter,440
2,la la la la la,140
3,international car registration letter country,100
4,monty python parody search holy,100


Should we experiment with different values for the hyperparameter to find the best performing n-gram? What criteria are we relying on to assess which one is the "best"?

### Apply the ideal n-gram vectorization to the whole dataframe

In [None]:
for col in columns:
    

### Modeling