# Implement t-Test and Chi-Square test
## To check whether a given sequence of words is a collocation or not.
 

#### Importing Modules and gathering data

In [1]:
# import the required module
import math
import string
from collections import Counter

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg, stopwords
from scipy.stats import t,chi2

In [2]:
# Load the raw text of the 'austen-emma.txt' file from the NLTK Gutenberg corpus
emma_fileid = 'austen-emma.txt'
data = gutenberg.raw(emma_fileid)

#### Pre-Processing of data

In [3]:
#Tokenization
# Split the text into sentences, delete every character listed in
# string.punctuation from each sentence, then split the sentence into words.
strip_punctuation = str.maketrans('', '', string.punctuation)
sent_tokens = sent_tokenize(data)
word_tokens = []
for raw_sentence in sent_tokens:
    word_tokens.extend(word_tokenize(raw_sentence.translate(strip_punctuation)))

In [4]:
#Stopwords removal
# The stopword list is held in a set for fast membership checks; tokens are
# lower-cased only for the comparison, so surviving tokens keep their casing.
stops = set(stopwords.words('english'))
word_tokens = [
    token
    for token in word_tokens
    if token.lower() not in stops
]

In [5]:
#Frequency, Probability
unique_words = set(word_tokens)
total_words = len(word_tokens)
print(f"TOTAL WORDS IN THE CORPUS : {total_words}")
print(f"UNIQUE WORDS : {len(unique_words)}")

# Count all tokens in one pass. Calling word_tokens.count(word) once per unique
# word rescans the whole token list every time, which is quadratic in practice.
token_counts = Counter(word_tokens)

# frequency: word -> number of occurrences in word_tokens
frequency = {word : token_counts[word] for word in unique_words}
# propability: word -> relative frequency (count / total number of tokens).
# The misspelt name is kept on purpose because later cells look it up.
propability = {word : frequency[word]/total_words for word in unique_words}

TOTAL WORDS IN THE CORPUS : 72767
UNIQUE WORDS : 9509


#### Generating Bigrams

In [6]:
#Generating Bigrams, frequency and probability of bigrams
# A bigram is every pair of adjacent entries in word_tokens; zip stops at the
# shorter sequence, so the last token is never used as a first element.
bigrams = zip(word_tokens, word_tokens[1:])
bigram_freq = {}    # (w1, w2) -> number of occurrences
bigram_count = 0    # total number of bigram occurrences (not unique bigrams)
for pair in bigrams:
    bigram_freq[pair] = bigram_freq.get(pair, 0) + 1
    bigram_count += 1
# Relative frequency of each bigram among all bigram occurrences
bigram_prop = {pair : count/bigram_count for pair, count in bigram_freq.items()}
print("TOTAL UNIQUE BIGRAMS :", len(bigram_freq))

TOTAL UNIQUE BIGRAMS : 60090


In [7]:
# bigram_freq

### t-Test Demonstration

#### t = (X – μ) / (σ / √n)


t = t-value 

X = sample mean 

μ = true/population mean 

σ = standard deviation 

n = sample size

In [8]:
# One-sided t-test for every bigram at the 5% level.
# Null hypothesis: the two words occur independently, so the expected bigram
# probability is the product of the two unigram probabilities.
t_colloc = []    # collects (bigram, t statistic) for bigrams that pass the test
n = len(word_tokens)
t_critical = t.ppf(1-0.05,n-1)    # critical value with n-1 degrees of freedom
for bigram, X_mean in bigram_prop.items():
    first_word, second_word = bigram
    mu = propability[first_word] * propability[second_word]
    # Each bigram position is treated as a Bernoulli trial with success
    # probability X_mean, hence variance X_mean*(1-X_mean).
    std_error = math.sqrt((X_mean*(1-X_mean))/n)
    t_stat = (X_mean - mu)/std_error
    if t_stat > t_critical:
        t_colloc.append((bigram, t_stat))
print(f"{len(t_colloc)} COLLOCATIONS IN THE CORPUS DETERMINED FROM T-TEST : \n")

1055 COLLOCATIONS IN THE CORPUS DETERMINED FROM T-TEST : 



### Chi^2 TEST demonstration

#### $\chi^2 = \sum_{i,j \in \{1,2\}} \dfrac{(O_{ij} - E_{ij})^2}{E_{ij}}$

O - Observed Frequencies

E - Expected frequencies

In [9]:
# Chi-square test of independence on a 2x2 contingency table for every bigram,
# at the 5% level with 1 degree of freedom.
# NOTE(review): the chi-square approximation is generally considered unreliable
# when expected cell counts are very small, which is the case for bigrams made
# of rare words -- confirm whether a minimum-count filter is wanted here.
chi_colloc = []    # bigrams whose statistic exceeds the critical value
n = len(word_tokens)
chi_critical = chi2.ppf(1-0.05, 1)
for bigram in bigram_prop:
    w1, w2 = bigram
    # Unigram counts serve as the row / column totals of the table
    f1, f2 = frequency[w1], frequency[w2]

    #Observed Frequencies
    both = bigram_freq[bigram]                   # w1 followed by w2
    only_w2 = f2 - both                          # w2 without w1 in front
    only_w1 = f1 - both                          # w1 without w2 behind
    neither = n - (both + only_w2 + only_w1)     # remaining cell of the table
    observed = (both, only_w2, only_w1, neither)

    #Expected frequencies under independence: (row total * column total) / n
    expected = (
        (f1 * f2)/n,
        ((n - f1) * f2)/n,
        (f1 * (n - f2))/n,
        ((n - f1)*(n - f2))/n,
    )

    chi_stat = sum(((o - e)**2)/e for o, e in zip(observed, expected))
    if chi_stat > chi_critical:
        chi_colloc.append(bigram)
print(f"{len(chi_colloc)} COLLOCATIONS IN THE CORPUS DETERMINED FROM CHI^2-TEST : \n")

53796 COLLOCATIONS IN THE CORPUS DETERMINED FROM CHI^2-TEST : 

