<a href="https://colab.research.google.com/github/Buse-cetin/Topic_Modelling/blob/main/bigram_trigram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#load all libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string



In [15]:
#load reviews data
reviews = pd.read_csv('/content/drive/MyDrive/unwanted_clean.csv')

In [16]:
reviews.head(2)

Unnamed: 0.1,Unnamed: 0,tweet
0,0,dakikadan fazladır taksi dksı görünen otobüsü ...
1,1,olimpiyat cadde üzeri pendik buraya araç park ...


In [17]:
comments = reviews['tweet']

Preprocessing

In [18]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

In [19]:
comments = comments.astype('str')

In [20]:
#remove non-ascii characters
comments = comments.map(lambda x: _removeNonAscii(x))

In [22]:
import nltk
nltk.download('stopwords')
#get stop words of all languages
STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [23]:
#function to detect language based on # of stop words for particular language
def get_language(text):
    words = set(nltk.wordpunct_tokenize(text.lower()))
    lang = max(((lang, len(words & stopwords)) for lang, stopwords in STOPWORDS_DICT.items()), key = lambda x: x[1])[0]
    if lang == 'turkish':
        return True
    else:
        return False

In [None]:
#filter for only english comments
eng_comments=comments[comments.apply(get_language)]

In [27]:
eng_comments.head()

185     bykehir belediyelerin hakl karyor ile belediye...
446     kru evet ile merkezi yaptn tespiti iaret damla...
696     anne bebek paketini ak partili ile belediyeler...
780                abi abonman yaptrcam sabah bildiin ile
1028    maltepe yaknmzda kartal pendik belediyesi ile ...
Name: tweet, dtype: object

In [28]:
#drop duplicates
eng_comments.drop_duplicates(inplace=True)

In [32]:
#load spacy
nlp = spacy.load("en_core_web_sm")

In [33]:
#function to clean and lemmatize comments
def clean_comments(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub(" ", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

In [34]:
#apply function to clean and lemmatize comments
lemmatized = eng_comments.map(clean_comments)

In [35]:
#make sure to lowercase everything
lemmatized = lemmatized.map(lambda x: [word.lower() for word in x])

In [36]:
lemmatized.head()

185     [bykehir, belediyelerin, hakl, karyor, ile, be...
446     [kru, evet, ile, merkezi, yaptn, tespiti, iare...
696     [anne, bebek, paketini, ak, partili, ile, bele...
780         [abi, abonman, yaptrcam, sabah, bildiin, ile]
1028    [maltepe, yaknmzda, kartal, pendik, belediyesi...
Name: tweet, dtype: object

In [37]:
#turn all comments' tokens into one single list
unlist_comments = [item for items in lemmatized for item in items]

Initialize NLTK's Bigrams/Trigrams Finder

In [52]:
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [53]:
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_comments)

1. Counting Frequencies of Adjacent Words

In [40]:
bigram_freq = bigramFinder.ngram_fd.items()

In [41]:
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [43]:
bigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"(istanbul, trafik)",38
1,"(istanbulbld, ibbtrafik)",29
2,"(izgi, almas)",27
3,"(almas, istanbulbld)",27
4,"(trafik, ile)",24


In [44]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
2407,"(istanbul, trafik)",38
809,"(istanbulbld, ibbtrafik)",29
807,"(izgi, almas)",27
808,"(almas, istanbulbld)",27
2416,"(trafik, ile)",24
810,"(ibbtrafik, ile)",24
812,"(milyoniinalyoruz, ile)",23
811,"(ile, milyoniinalyoruz)",17
291,"(ile, belediyesi)",14
817,"(caddesi, izgi)",13


In [45]:
#get english stopwords
en_stopwords = set(stopwords.words('turkish'))

In [46]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

In [49]:
#filter bigrams
import nltk
nltk.download('averaged_perceptron_tagger')
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [50]:
filtered_bi[:10]

Unnamed: 0,bigram,freq
2407,"(istanbul, trafik)",38
809,"(istanbulbld, ibbtrafik)",29
807,"(izgi, almas)",27
808,"(almas, istanbulbld)",27
817,"(caddesi, izgi)",13
124,"(servis, taksi)",9
125,"(taksi, sahiplerine)",8
126,"(sahiplerine, tllik)",8
127,"(tllik, sosyal)",8
128,"(sosyal, yardm)",8


In [55]:
trigram_freq = trigramFinder.ngram_fd.items()

In [56]:
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

In [57]:
trigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,trigram,freq
0,"(izgi, almas, istanbulbld)",27
1,"(almas, istanbulbld, ibbtrafik)",27
2,"(istanbulbld, ibbtrafik, ile)",24
3,"(istanbul, trafik, ile)",24
4,"(ibbtrafik, ile, milyoniinalyoruz)",17


In [58]:
trigramFreqTable[:10]

Unnamed: 0,trigram,freq
846,"(izgi, almas, istanbulbld)",27
847,"(almas, istanbulbld, ibbtrafik)",27
848,"(istanbulbld, ibbtrafik, ile)",24
2596,"(istanbul, trafik, ile)",24
849,"(ibbtrafik, ile, milyoniinalyoruz)",17
850,"(ile, milyoniinalyoruz, ile)",16
856,"(caddesi, izgi, almas)",13
137,"(srekli, dt, halde)",8
128,"(taksi, sahiplerine, tllik)",8
129,"(sahiplerine, tllik, sosyal)",8


In [59]:
def rightTypesTri(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or '  ' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[2][1] in third_type:
        return True
    else:
        return False

In [60]:
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypesTri(x))]

In [61]:
filtered_tri[:10]

Unnamed: 0,trigram,freq
846,"(izgi, almas, istanbulbld)",27
847,"(almas, istanbulbld, ibbtrafik)",27
856,"(caddesi, izgi, almas)",13
137,"(srekli, dt, halde)",8
128,"(taksi, sahiplerine, tllik)",8
129,"(sahiplerine, tllik, sosyal)",8
130,"(tllik, sosyal, yardm)",8
138,"(dt, halde, servis)",8
139,"(halde, servis, taksi)",7
845,"(yolu, izgi, almas)",6


In [62]:
freq_bi = filtered_bi[:20].bigram.values

In [63]:
freq_tri = filtered_tri[:20].trigram.values

2. PMI

In [64]:
bigramFinder.apply_freq_filter(20)

In [65]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [66]:
bigramPMITable[:10]

Unnamed: 0,bigram,PMI
0,"(izgi, almas)",6.828488
1,"(istanbulbld, ibbtrafik)",6.78474
2,"(almas, istanbulbld)",6.777862
3,"(istanbul, trafik)",5.383702
4,"(milyoniinalyoruz, ile)",3.655168
5,"(ibbtrafik, ile)",3.462812
6,"(trafik, ile)",2.744583


In [67]:
trigramFinder.apply_freq_filter(20)

In [68]:
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [69]:
trigramPMITable[:10]

Unnamed: 0,trigram,PMI
0,"(izgi, almas, istanbulbld)",13.709444
1,"(almas, istanbulbld, ibbtrafik)",13.562603
2,"(istanbulbld, ibbtrafik, ile)",10.343768
3,"(istanbul, trafik, ile)",8.552783


In [70]:
pmi_bi = bigramPMITable[:20].bigram.values

In [71]:
pmi_tri = trigramPMITable[:20].trigram.values

3. t-test

In [72]:
bigramTtable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)

In [73]:
bigramTtable.head()

Unnamed: 0,bigram,t
0,"(istanbul, trafik)",6.016763
1,"(istanbulbld, ibbtrafik)",5.336323
2,"(izgi, almas)",5.150433
3,"(almas, istanbulbld)",5.1488
4,"(ibbtrafik, ile)",4.45466


In [74]:
filteredT_bi = bigramTtable[bigramTtable.bigram.map(lambda x: rightTypes(x))]

In [75]:
filteredT_bi[:10]

Unnamed: 0,bigram,t
0,"(istanbul, trafik)",6.016763
1,"(istanbulbld, ibbtrafik)",5.336323
2,"(izgi, almas)",5.150433
3,"(almas, istanbulbld)",5.1488


In [76]:
trigramTtable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.student_t)), columns=['trigram','t']).sort_values(by='t', ascending=False)

In [77]:
trigramTtable.head()

Unnamed: 0,trigram,t
0,"(izgi, almas, istanbulbld)",5.195765
1,"(almas, istanbulbld, ibbtrafik)",5.195723
2,"(istanbulbld, ibbtrafik, ile)",4.89521
3,"(istanbul, trafik, ile)",4.885934


In [78]:
filteredT_tri = trigramTtable[trigramTtable.trigram.map(lambda x: rightTypesTri(x))]

In [79]:

filteredT_tri.head(10)

Unnamed: 0,trigram,t
0,"(izgi, almas, istanbulbld)",5.195765
1,"(almas, istanbulbld, ibbtrafik)",5.195723


In [80]:
t_bi = filteredT_bi[:20].bigram.values

In [81]:
t_tri = filteredT_tri[:20].trigram.values

4. Chi-Square

In [83]:
bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [84]:
bigramChiTable.head(20)

Unnamed: 0,bigram,chi-sq
0,"(istanbulbld, ibbtrafik)",3195.596893
1,"(izgi, almas)",3065.773762
2,"(almas, istanbulbld)",2959.053746
3,"(istanbul, trafik)",1562.180186
4,"(milyoniinalyoruz, ile)",266.160059
5,"(ibbtrafik, ile)",237.473163
6,"(trafik, ile)",127.117376


In [85]:
trigramChiTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.chi_sq)), columns=['trigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [86]:
trigramChiTable.head(20)

Unnamed: 0,trigram,chi-sq
0,"(izgi, almas, istanbulbld)",361702.859278
1,"(almas, istanbulbld, ibbtrafik)",326712.293506
2,"(istanbulbld, ibbtrafik, ile)",31291.107883
3,"(istanbul, trafik, ile)",9222.164141


In [87]:
chi_bi = bigramChiTable[:20].bigram.values

In [88]:
chi_tri = trigramChiTable[:20].trigram.values

5. Likelihood

In [89]:
bigramLikTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [90]:
bigramLikTable.head()

Unnamed: 0,bigram,likelihood ratio
0,"(istanbulbld, ibbtrafik)",319.553663
1,"(izgi, almas)",292.016289
2,"(almas, istanbulbld)",286.090353
3,"(istanbul, trafik)",278.956593
4,"(milyoniinalyoruz, ile)",106.10683


In [91]:
filteredLik_bi = bigramLikTable[bigramLikTable.bigram.map(lambda x: rightTypes(x))]

In [92]:
filteredLik_bi.head(10)

Unnamed: 0,bigram,likelihood ratio
0,"(istanbulbld, ibbtrafik)",319.553663
1,"(izgi, almas)",292.016289
2,"(almas, istanbulbld)",286.090353
3,"(istanbul, trafik)",278.956593


In [93]:
trigramLikTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [94]:
trigramLikTable.head()

Unnamed: 0,trigram,likelihood ratio
0,"(izgi, almas, istanbulbld)",608.145717
1,"(almas, istanbulbld, ibbtrafik)",605.646378
2,"(istanbulbld, ibbtrafik, ile)",423.624645
3,"(istanbul, trafik, ile)",362.501949


In [95]:
filteredLik_tri = trigramLikTable[trigramLikTable.trigram.map(lambda x: rightTypesTri(x))]

In [96]:
filteredLik_tri.head(20)

Unnamed: 0,trigram,likelihood ratio
0,"(izgi, almas, istanbulbld)",608.145717
1,"(almas, istanbulbld, ibbtrafik)",605.646378


In [97]:
lik_bi = filteredLik_bi[:20].bigram.values

In [98]:
lik_tri = filteredLik_tri[:20].trigram.values

Bigram Comparison

In [99]:
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, t_bi, chi_bi, lik_bi]).T

In [100]:
bigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [101]:
bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(istanbul, trafik)","(izgi, almas)","(istanbul, trafik)","(istanbulbld, ibbtrafik)","(istanbulbld, ibbtrafik)"
1,"(istanbulbld, ibbtrafik)","(istanbulbld, ibbtrafik)","(istanbulbld, ibbtrafik)","(izgi, almas)","(izgi, almas)"
2,"(izgi, almas)","(almas, istanbulbld)","(izgi, almas)","(almas, istanbulbld)","(almas, istanbulbld)"
3,"(almas, istanbulbld)","(istanbul, trafik)","(almas, istanbulbld)","(istanbul, trafik)","(istanbul, trafik)"
4,"(caddesi, izgi)","(milyoniinalyoruz, ile)",,"(milyoniinalyoruz, ile)",
5,"(servis, taksi)","(ibbtrafik, ile)",,"(ibbtrafik, ile)",
6,"(taksi, sahiplerine)","(trafik, ile)",,"(trafik, ile)",
7,"(sahiplerine, tllik)",,,,
8,"(tllik, sosyal)",,,,
9,"(sosyal, yardm)",,,,


Trigram Comparison

In [102]:
trigramsCompare = pd.DataFrame([freq_tri, pmi_tri, t_tri, chi_tri, lik_tri]).T

In [103]:
trigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [104]:
trigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(izgi, almas, istanbulbld)","(izgi, almas, istanbulbld)","(izgi, almas, istanbulbld)","(izgi, almas, istanbulbld)","(izgi, almas, istanbulbld)"
1,"(almas, istanbulbld, ibbtrafik)","(almas, istanbulbld, ibbtrafik)","(almas, istanbulbld, ibbtrafik)","(almas, istanbulbld, ibbtrafik)","(almas, istanbulbld, ibbtrafik)"
2,"(caddesi, izgi, almas)","(istanbulbld, ibbtrafik, ile)",,"(istanbulbld, ibbtrafik, ile)",
3,"(srekli, dt, halde)","(istanbul, trafik, ile)",,"(istanbul, trafik, ile)",
4,"(taksi, sahiplerine, tllik)",,,,
5,"(sahiplerine, tllik, sosyal)",,,,
6,"(tllik, sosyal, yardm)",,,,
7,"(dt, halde, servis)",,,,
8,"(halde, servis, taksi)",,,,
9,"(yolu, izgi, almas)",,,,
