# Implement t-Test and Chi-Square test
## To check whether a given sequence of words is acollocation or not.
 

#### Importing Modules and gathering data

In [1]:
# import the required module
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg, stopwords
import string
import math
from scipy.stats import t,chi2

In [2]:
# Loading the data
data = gutenberg.raw('austen-emma.txt')
data

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.\n\nShe was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister\'s marriage,\nbeen mistress of his house from a very early period.  Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.\n\nSixteen years had Miss Taylor been in Mr. Woodhouse\'s family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.  Between _them_ it was more the intimacy\nof sisters.  Even before Miss Taylor had ceased to hold the nominal\noffice o

#### Pre-Processing of data

In [3]:
#Tokenization
sent_tokens = sent_tokenize(data)
word_tokens = []
for sentence in sent_tokens :
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    word_tokens += word_tokenize(sentence)

In [4]:
#Stopwords removal    
stops = set(stopwords.words('english'))
word_tokens = [word for word in word_tokens if word.lower() not in stops]

In [5]:
#Frequency, Propability
unique_words = set(word_tokens)
print(f"TOTAL WORDS IN THE CORPUS : {len(word_tokens)}")
print(f"UNIQUE WORDS : {len(unique_words)}")

frequency = {word : word_tokens.count(word) for word in unique_words}
propability = {word : frequency[word]/len(word_tokens) for word in unique_words}

TOTAL WORDS IN THE CORPUS : 72767
UNIQUE WORDS : 9509


#### Generating Bigrams

In [6]:
#Generating Bigrams, frequency and propability of bigrams
bigrams = zip(word_tokens[:-1], word_tokens[1:])
bigram_freq = {}
bigram_count = 0
for bigram in bigrams :
    bigram_count += 1
    if bigram in bigram_freq :
        bigram_freq[bigram] += 1
    else :
        bigram_freq[bigram] = 1
bigram_prop = {}
for bigram, freq in bigram_freq.items() : 
    bigram_prop[bigram] = freq/bigram_count
print("TOTAL UNIQUE BIGRAMS :", len(bigram_freq))

TOTAL UNIQUE BIGRAMS : 60090


In [7]:
bigram_freq

{('Emma', 'Jane'): 2,
 ('Jane', 'Austen'): 1,
 ('Austen', '1816'): 1,
 ('1816', 'VOLUME'): 1,
 ('VOLUME', 'CHAPTER'): 1,
 ('CHAPTER', 'Emma'): 2,
 ('Emma', 'Woodhouse'): 4,
 ('Woodhouse', 'handsome'): 1,
 ('handsome', 'clever'): 1,
 ('clever', 'rich'): 1,
 ('rich', 'comfortable'): 1,
 ('comfortable', 'home'): 2,
 ('home', 'happy'): 1,
 ('happy', 'disposition'): 1,
 ('disposition', 'seemed'): 1,
 ('seemed', 'unite'): 1,
 ('unite', 'best'): 1,
 ('best', 'blessings'): 2,
 ('blessings', 'existence'): 2,
 ('existence', 'lived'): 1,
 ('lived', 'nearly'): 1,
 ('nearly', 'twentyone'): 1,
 ('twentyone', 'years'): 1,
 ('years', 'world'): 1,
 ('world', 'little'): 1,
 ('little', 'distress'): 2,
 ('distress', 'vex'): 1,
 ('vex', 'youngest'): 1,
 ('youngest', 'two'): 2,
 ('two', 'daughters'): 2,
 ('daughters', 'affectionate'): 1,
 ('affectionate', 'indulgent'): 1,
 ('indulgent', 'father'): 1,
 ('father', 'consequence'): 1,
 ('consequence', 'sisters'): 1,
 ('sisters', 'marriage'): 1,
 ('marriage', 'm

### t-Test Demonstratoin

#### t = (X – μ) / (σ / √n)


t = t-value 

X = sample mean 

μ = true/population mean 

σ = standard deviation 

n = sample size

In [13]:
t_colloc = []
n = len(word_tokens)
t_critical = t.ppf(1-0.05,n-1)
for bigram,prop in bigram_prop.items():
    w1,w2 = bigram
    mu = propability[w1] * propability[w2]
    X_mean = prop
    t_stat = (X_mean -  mu)/(math.sqrt(((X_mean)*(1-X_mean))/n))
    if t_stat > t_critical:
        t_colloc.append((bigram,t_stat))
print(f"{len(t_colloc)} COLLOCATIONS IN THE CORPUS DETERMINED FROM T-TEST : \n")
print(t_colloc)

1055 COLLOCATIONS IN THE CORPUS DETERMINED FROM T-TEST : 

[(('Miss', 'Taylor'), 5.869030158224675), (('Mr', 'Woodhouses'), 4.978772140427534), (('passed', 'away'), 2.4100588573096036), (('Miss', 'Taylors'), 2.8060565784423463), (('think', 'little'), 1.9431999041419368), (('Mr', 'Weston'), 11.25712746129459), (('always', 'wished'), 1.6474231259668621), (('every', 'day'), 3.3911339516521535), (('soon', 'followed'), 2.196770464922868), (('could', 'speak'), 2.9830901550165714), (('could', 'never'), 2.8874698598952104), (('find', 'fault'), 1.7205142768173731), (('half', 'mile'), 2.443559404229196), (('Mrs', 'Weston'), 14.680761543435226), (('Mr', 'Woodhouse'), 9.803735390819975), (('sixteen', 'miles'), 1.999299120525023), (('much', 'beyond'), 2.519486125084382), (('society', 'Highbury'), 2.2009190140513373), (('half', 'day'), 1.9037730813158955), (('Emma', 'could'), 6.726924576795626), (('every', 'body'), 8.386116526101656), (('never', 'able'), 2.5282646413934455), (('could', 'feel'), 2.24

### Chi^2 TEST demonstration

#### X2 = Sum(i,j=1,2) ((Oij - Eij)2 / Eij )

O - Observed Frequencies

E - Excepcted frequencies

In [14]:
chi_colloc = []
n = len(word_tokens)
chi_critical = chi2.ppf(1-0.05, 1)
for bigram, prop in bigram_prop.items() :
    w1, w2 = bigram
    f1 = frequency[w1]
    f2 = frequency[w2]
    #Observed Frequencies
    o_1_2 = bigram_freq[bigram]
    o_n1_2 = f2 - o_1_2
    o_1_n2 = f1 - o_1_2
    o_n1_n2 = n - (o_1_2 + o_n1_2 + o_1_n2)
    obs = [o_1_2, o_n1_2, o_1_n2, o_n1_n2]
    #Excepcted frequencies
    e_1_2 = (f1 * f2)/n
    e_n1_2 = ((n - f1) * f2)/n
    e_1_n2 = (f1 * (n - f2))/n
    e_n1_n2 = ((n - f1)*(n - f2))/n
    exp = [e_1_2, e_n1_2, e_1_n2, e_n1_n2]
    chi_stat = sum( ((obs[i] - exp[i])**2)/exp[i] for i in range(4))
    if chi_stat > chi_critical :
        chi_colloc.append(bigram)
print(f"{len(chi_colloc)} COLLOCATIONS IN THE CORPUS DETERMINED FROM CHI^2-TEST : \n")
print(chi_colloc)

53796 COLLOCATIONS IN THE CORPUS DETERMINED FROM CHI^2-TEST : 

[('Jane', 'Austen'), ('Austen', '1816'), ('1816', 'VOLUME'), ('VOLUME', 'CHAPTER'), ('Woodhouse', 'handsome'), ('handsome', 'clever'), ('clever', 'rich'), ('rich', 'comfortable'), ('comfortable', 'home'), ('happy', 'disposition'), ('disposition', 'seemed'), ('seemed', 'unite'), ('unite', 'best'), ('best', 'blessings'), ('blessings', 'existence'), ('existence', 'lived'), ('lived', 'nearly'), ('nearly', 'twentyone'), ('twentyone', 'years'), ('years', 'world'), ('little', 'distress'), ('distress', 'vex'), ('vex', 'youngest'), ('youngest', 'two'), ('two', 'daughters'), ('daughters', 'affectionate'), ('affectionate', 'indulgent'), ('indulgent', 'father'), ('father', 'consequence'), ('consequence', 'sisters'), ('sisters', 'marriage'), ('marriage', 'mistress'), ('mistress', 'house'), ('house', 'early'), ('early', 'period'), ('period', 'mother'), ('mother', 'died'), ('died', 'long'), ('long', 'ago'), ('ago', 'indistinct'), ('indis