In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

In [2]:
def read_stop_words(file):
    with open(file) as f:
        stop_words = f.read().split('\n')

    return stop_words

In [5]:
!ls dataset/booking

analyze.ipynb
booking-rating-for-one-hot-test.csv
booking-rating-for-one-hot-train.csv
booking-rating-for-one-hot-val.csv
booking-rating-test.csv
booking-rating-train.csv
booking-rating-val.csv
booking-sentences-test.csv
booking-sentences-train.csv
booking-sentences-val.csv
booking-test.csv
booking-train.csv
booking-val.csv
create-dataset-for-rating-classification.ipynb
create-sentence-classification-dataset.ipynb
dnipro-reviews.csv
ivano-frankivsk-reviews.csv
kharkiv-reviews.csv
kyiv-reviews.csv
lviv-reviews.csv
odesa-reviews.csv
[34mtranslated[m[m
uzhgorod-reviews.csv


In [6]:
def read_all_frames(files):
    frames= []
    for file in files:
        df = pd.read_csv(file)
        frames.append(df)
    
    return pd.concat(frames)

In [38]:
def is_review_valid(review):
    if type(review) is not str:
        return False
    try:
        if len(review) == 0:
            return False
        if review == 'Nan':
            return False
        return True
    except:
        return False

In [27]:
reviews

['–õ–∏—à–µ –¥—ñ–≤—á–∞—Ç–∞ –Ω–∞ —Ä–µ—Ü–µ–ø—Ü—ñ—ó - —Ç—Ä–∏ —Ä–∞–∑–∏ –º–µ–Ω—ñ –º—ñ–Ω—è–ª–∏ –∫—ñ–º–Ω–∞—Ç—É. –î—è–∫—É—é —ó–º.',
 '–í—Å–µ. –û–¥–Ω–æ–≥–æ –¥–æ—Å–≤—ñ–¥—É –≤–∏—Å—Ç–∞—Ä—á–∏–ª–æ, —â–æ–± –±—ñ–ª—å—à–µ —Å—é–¥–∏ –Ω–µ –ø–æ–≤–µ—Ä—Ç–∞—Ç–∏—Å—è. –ì–æ—Ç–µ–ª—å –ª–∏—à–µ –¥–ª—è –≥–æ—Å—Ç–µ–π, —è–∫–∏–º –±–∞–π–¥—É–∂–µ —É–º–æ–≤–∏ , —á–∏—Å—Ç–æ—Ç–∞ —ñ –∫–æ–º—Ñ–æ—Ä—Ç.',
 '–û—Ñ–æ—Ä–º–ª–µ–Ω–Ω—è –∫—ñ–º–Ω–∞—Ç–∏ —Ö–æ—Ä–æ—à–µ, –¥–æ—Å–∏—Ç—å –ø—Ä–∏—î–º–Ω–µ, –Ω–∞ –æ–¥–Ω—É-–¥–≤—ñ –Ω–æ—á—ñ - —á—É–¥–æ–≤–∏–π –≤–∞—Ä—ñ–∞–Ω—Ç, –æ—Å–æ–±–ª–∏–≤–æ —è–∫—â–æ –≤—Ä–∞—Ö—É–≤–∞—Ç–∏ —Ü—ñ–Ω—É. –¢–∞–∫—ñ –Ω–æ–º–µ—Ä–∏ —É –∫–æ–Ω–∫—É—Ä–µ–Ω—Ç—ñ–≤ –≤ –¥–≤–∞ —Ä–∞–∑–∏ –¥–æ—Ä–æ–∂—á—ñ.',
 'Nan',
 "–£—Å–µ –≤—ñ–¥–º—ñ–Ω–Ω–æ, –∑–∞–≤–¥—è–∫–∏ —è–∫—ñ—Å–Ω–æ–º—É —Å–µ—Ä–≤—ñ—Å—É –º–∏ –∑–∞–≤–∂–¥–∏ —Ç—É—Ç –∑—É–ø–∏–Ω—è—î–º–æ—Å—è. –¶—å–æ–≥–æ —Ä–∞–∑—É —Ä–µ—Å—Ç–æ—Ä–∞–Ω –±—É–ª–æ –∑–∞–±—Ä–æ–Ω—å–æ–≤–∞–Ω–æ, –Ω–∞–º –ª—é–±'—è–∑–Ω–æ –∑–∞–ø—Ä–æ–ø–æ–Ω—É–≤–∞–ª–∏ –≤–µ—á–µ—Ä—é –≤ –Ω–æ–º–µ—Ä, —Ü–µ –¥—É–∂–µ –∑—Ä—É—á–Ω–æ. –ù–∞–¥–∑–≤–∏—á–∞–π–Ω–æ –ª—é–±'—è–∑–Ω–∏–π –ø–µ—Ä—Å–

In [8]:
full_df = read_all_frames(files)

In [16]:
len(full_df['pos_text'].values)

134083

In [39]:
def get_pos_neg_review(df):
    pos_texts = df['pos_text'].values
    neg_texts = df['neg_text'].values
    
    gt_score, reviews = [], []
    for i in range(0, len(df)):
        if is_review_valid(pos_texts[i]):
            reviews.append(pos_texts[i])
            gt_score.append(1)
            
        if is_review_valid(neg_texts[i]):
            reviews.append(neg_texts[i])
            gt_score.append(0)
            
    return reviews, gt_score

In [40]:
reviews, gt_score = get_pos_neg_review(full_df)

In [42]:
uk_stop_words = read_stop_words('./data/ukrainian-stopwords.txt')

### 1. Word frequency

In [43]:
def get_topk_ngram(df, ngram_range=(1,1), k=None, stopwords=True, with_count=False):
    '''
    Extract the most frequently occurred words in countvector
    '''
    if stopwords:
        temp = []
        for name in hotelDf.hotelName.unique():
            for token in name.split():
                if len(token) > 1:
                    temp.append(token)
        my_stop_words = ENGLISH_STOP_WORDS.union(temp)
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words=my_stop_words, max_features=500)
        
    else:
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words=None, max_features=k)
        
    countvector = vectorizer.fit_transform(df['review'])

    # Get topk occurred ngrams
    topk_words = []
    sortedindices = countvector.toarray().sum(axis=0).argsort()[::-1][:k]
    counts = countvector.toarray().sum(axis=0)
    
    for i in sortedindices:
        word = vectorizer.get_feature_names()[i]
        
        if with_count:
            count = counts[i]
            topk_words.append((word, count))
        else:
            topk_words.append(word)
            
    return topk_words

In [None]:
topkTotal = get_topk_ngram(hotelDf, k=500)

### 2. Mutual information

**Mutual information tells you how much you learn about X from knowing the value of Y (on average over the choice of Y).** 



Since we found the word frequency is not a good indicator for the sentiment analysis, we will examine *mutual information*  for an alternative metric.

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mutual_info_score.html

In [25]:
# let's calculate Mutual Information for unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1,1), stop_words=uk_stop_words, max_features=500)
countvector = vectorizer.fit_transform(reviews)
densevector = np.array(countvector.todense())
    
# miScore_unigram = pd.DataFrame(data = {'word': vectorizer.get_feature_names(),
#              'MI Score': [mutual_info_score(gtScore, densevector[:,i].squeeze()) for i in range(500)]})

miScore_unigram = pd.DataFrame(data =
                               {'MI Score': [mutual_info_score(gt_score, densevector[:,i].squeeze()) for i in range(500)]}
                              , index = vectorizer.get_feature_names())



  'stop_words.' % sorted(inconsistent))


In [44]:
# Bigram version
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words=uk_stop_words, max_features=500)
countvector = vectorizer.fit_transform(reviews)
densevector = np.array(countvector.todense())
miScore_bigram = pd.DataFrame(data =
                    {'MI Score': [mutual_info_score(gt_score, densevector[:,i].squeeze()) for i in range(500)]},
                    index = vectorizer.get_feature_names())

  'stop_words.' % sorted(inconsistent))


In [26]:
miScore_unigram.sort_values('MI Score', inplace=True, ascending=False)
print('Mutual Information - Unigram')
miScore_unigram.head(10)

Mutual Information - Unigram


Unnamed: 0,MI Score
–ø–µ—Ä—Å–æ–Ω–∞–ª,0.067876
—Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è,0.062621
—á–∏—Å—Ç–æ,0.040511
–ø—Ä–∏–≤—ñ—Ç–Ω–∏–π,0.031284
,0.027092
–∑—Ä—É—á–Ω–µ,0.019928
—Ö–æ—Ä–æ—à–∏–π,0.018283
–º—ñ—Å—Ü–µ,0.018195
—á–∏—Å—Ç–∏–π,0.017932
–∑–∞—Ç–∏—à–Ω–æ,0.016047


In [30]:
miScore_bigram.sort_values('MI Score', inplace=True, ascending=False)
print('Mutual Information - Bigram')
miScore_bigram.head(10)

Mutual Information - Bigram


Unnamed: 0,MI Score
–ø—Ä–∏–≤—ñ—Ç–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª,0.033663
–∑—Ä—É—á–Ω–µ —Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è,0.017697
–º—ñ—Å—Ü–µ —Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è,0.009666
–ø—Ä–∏—î–º–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª,0.009136
–ø–æ–≥–∞–Ω–∞ –∑–≤—É–∫–æ—ñ–∑–æ–ª—è—Ü—ñ—è,0.00707
—á—É–¥–æ–≤–µ —Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è,0.006828
—ñ–Ω—Ç–µ—Ä —î—Ä,0.006487
—Ü–µ–Ω—Ç—Ä—ñ –º—ñ—Å—Ç–∞,0.00643
–ø–µ—Ä—Å–æ–Ω–∞–ª –ø—Ä–∏–≤—ñ—Ç–Ω–∏–π,0.006034
–≤–∞–Ω–Ω—ñ–π –∫—ñ–º–Ω–∞—Ç—ñ,0.005576


###  3. Pointwise Mutual Information

In [69]:
def getPMI_ngram(reviews, gt_score, gt, ngram_range=(1,1), max_features=500):
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words=uk_stop_words, max_features=max_features)
    countvector = vectorizer.fit_transform(reviews)
    densevector = np.array(countvector.todense())
    
    px = sum(gt_score == gt) / len(reviews)
    pmis = []
    
    for i in range(max_features):
        py = sum(densevector[:,i] == 1) / len(reviews)
        pxy = sum((gt_score== gt) & (densevector[:,i] == 1)) / len(reviews)
        
        if pxy == 0:
            pmi = math.log10((pxy + 0.0001) / (px * py))
        else:
            pmi = math.log10(pxy / (px * py))
            
        pmis.append(pmi)
        
    gt_name = 'positive' if gt == 1 else 'negative'
    pmis = pd.DataFrame(data = {'pmi' + gt_name: pmis}, index = vectorizer.get_feature_names())
    return pmis.sort_values('pmi' + gt_name, ascending=False)

In [70]:
gt_score = np.array(gt_score)

In [72]:
pmiPos_unigram = getPMI_ngram(reviews, gt_score,  1, max_features=2000)
pmiNeg_unigram = getPMI_ngram(reviews, gt_score,  0, max_features=2000)
pmiPos_bigram = getPMI_ngram(reviews, gt_score,  1, ngram_range=(2,2), max_features=2000)
pmiNeg_bigram = getPMI_ngram(reviews, gt_score,  0, ngram_range=(2,2), max_features=2000)

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


### Let's see what PMI values tell us about the reviews

In [73]:
print('PMI for positive reviews - Unigram')
pmiPos_unigram.head(10)

PMI for positive reviews - Unigram


Unnamed: 0,pmipositive
—Ç–æ—Ä–≥–æ–≤–∏–π,0.251974
—Å—Ç–∏–ª—å–Ω–∏–π,0.251648
–ø—Ä–æ—Å—Ç–æ—Ä–æ,0.250339
—à–∏–∫–∞—Ä–Ω–µ,0.249222
–≤—ñ–¥–º—ñ–Ω–Ω–µ,0.249179
–ø–æ—Ö–≤–∞–ª,0.248875
–Ω–µ–æ–±—Ö—ñ–¥–Ω–∏–º,0.247499
—á—É–¥–æ–≤–µ,0.247456
–ø—Ä–æ—Å—Ç–æ—Ä–∏–π,0.246833
—à–≤–∏–¥–∫–µ,0.246613


In [79]:
pmiPos_trigram[900:1000]

Unnamed: 0_level_0,score
ngram,Unnamed: 1_level_1


In [49]:
print('PMI for positive reviews - Bigram')
pmiPos_bigram.head(10)

PMI for positive reviews - Bigram


Unnamed: 0,pmipositive
—á—É–¥–æ–≤–∏–π —Å–Ω—ñ–¥–∞–Ω–æ–∫,0.255065
—Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è —Ö–æ—Ä–æ—à–∏–π,0.255065
—á–∏—Å—Ç–æ—Ç–∞ –∑–∞—Ç–∏—à–æ–∫,0.255065
—Ç–µ–ø–ª–æ —á–∏—Å—Ç–æ,0.255065
—É—Å—ñ–º –Ω–µ–æ–±—Ö—ñ–¥–Ω–∏–º,0.255065
–ø–µ—Ä—Å–æ–Ω–∞–ª —Å–º–∞—á–Ω–∏–π,0.254037
–Ω–æ–º–µ—Ä—ñ –Ω–µ–æ–±—Ö—ñ–¥–Ω–µ,0.253878
–ø—Ä–æ—Å—Ç–æ—Ä–∏–π —á–∏—Å—Ç–∏–π,0.253786
—Ç–æ—Ä–≥–æ–≤–∏–π —Ü–µ–Ω—Ç—Ä,0.253465
—Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è –ø—Ä–∏—î–º–Ω–∏–π,0.253423


In [50]:
print('PMI for negative reviews - Unigram')
pmiNeg_unigram.head(10)

PMI for negative reviews - Unigram


Unnamed: 0,pminegative
–ø–æ–≥–∞–Ω–∞,0.342608
–≤—ñ–¥—Å—É—Ç–Ω—è,0.339392
–Ω–µ–ø—Ä–∏—î–º–Ω–∏–π,0.339246
–±—Ä—É–¥–Ω—ñ,0.338132
–±—Ä—É–¥–Ω–æ,0.337215
—Ç–æ–Ω–∫—ñ,0.334506
–±—Ä—É–¥–Ω–∏–π,0.334448
–Ω–µ–º–æ–∂–ª–∏–≤–æ,0.332829
–Ω–µ–∑—Ä—É—á–Ω–æ,0.331634
–∂–∞—Ö–ª–∏–≤–∏–π,0.330902


In [55]:
print('PMI for negative reviews - Bigram')
pmiNeg_bigram.head(10)

PMI for negative reviews - Bigram


Unnamed: 0,pminegative
–ø–æ–≥–∞–Ω–∞ –∑–≤—É–∫–æ—ñ–∑–æ–ª—è—Ü—ñ—è,0.349638
–ø–æ–≥–∞–Ω–∞ —à—É–º–æ—ñ–∑–æ–ª—è—Ü—ñ—è,0.347431
–≤—ñ–¥—Å—É—Ç–Ω—ñ—Å—Ç—å –∫–æ–Ω–¥–∏—Ü—ñ–æ–Ω–µ—Ä–∞,0.346193
–≤—ñ–¥–±—É–≤–∞—î—Ç—å—Å—è –∫–æ—Ä–∏–¥–æ—Ä—ñ,0.343876
–∑–∞–ø–∞—Ö –∫–∞–Ω–∞–ª—ñ–∑–∞—Ü—ñ—ó,0.343157
—Å–ø–∞—Ç–∏ –Ω–µ–º–æ–∂–ª–∏–≤–æ,0.342714
–Ω–µ–ø—Ä–∏—î–º–Ω–∏–π –∑–∞–ø–∞—Ö,0.342251
–ø–æ–≥–∞–Ω–æ –ø—Ä–∞—Ü—é–≤–∞–≤,0.342223
—Ç–æ–Ω–∫—ñ —Å—Ç—ñ–Ω–∏,0.341177
—á—É—Ç–∏ –≤—ñ–¥–±—É–≤–∞—î—Ç—å—Å—è,0.340378


In [80]:
pmiPos_trigram = getPMI_ngram(reviews, gt_score, 1, ngram_range=(3,3), max_features=2000)
pmiNeg_trigram = getPMI_ngram(reviews, gt_score, 0, ngram_range=(3,3), max_features=2000)

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


In [81]:
pmiPos_trigram

Unnamed: 0,pmipositive
–ø–µ—Ä—Å–æ–Ω–∞–ª –≤–≤—ñ—á–ª–∏–≤–∏–π –≥–æ—Ç–æ–≤–∏–π,0.255065
–ø—Ä–∏—î–º–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª —Å–º–∞—á–Ω—ñ,0.255065
–ø—Ä–∏—î–º–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª —Ö–æ—Ä–æ—à–∏–π,0.255065
–ø—Ä–∏—î–º–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª —á–∏—Å—Ç–∏–π,0.255065
–ø–µ—Ä—Å–æ–Ω–∞–ª —Å–º–∞—á–Ω—ñ —Å–∏—Ç–Ω—ñ,0.255065
—á–∏—Å—Ç–æ —Ç–µ–ø–ª–æ –∫–æ–º—Ñ–æ—Ä—Ç–Ω–æ,0.255065
–∫—É—Ö–Ω—è —É—Å—ñ–º –Ω–µ–æ–±—Ö—ñ–¥–Ω–∏–º,0.255065
—á–∏—Å—Ç–æ —Ç–∏—Ö–æ –∫–æ–º—Ñ–æ—Ä—Ç–Ω–æ,0.255065
–ø—Ä–∏—î–º–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª —á–∏—Å—Ç–æ—Ç–∞,0.255065
–∫—ñ–º–Ω–∞—Ç–∞ –∑—Ä—É—á–Ω–µ –ª—ñ–∂–∫–æ,0.255065


In [54]:
pmiNeg_trigram

Unnamed: 0,pminegative
–∑–∞–ø–∞—Ö –≤–∞–Ω–Ω—ñ–π –∫—ñ–º–Ω–∞—Ç—ñ,0.352442
–Ω–µ–ø—Ä–∏—î–º–Ω–∏–π –∑–∞–ø–∞—Ö –≤–∞–Ω–Ω—ñ–π,0.352442
—Å–ª–∞–±–∫–∏–π –Ω–∞—Ç–∏—Å–∫ –≤–æ–¥–∏,0.352442
–ø–æ–≥–∞–Ω–∞ –∑–≤—É–∫–æ—ñ–∑–æ–ª—è—Ü—ñ—è —á—É—Ç–∏,0.352442
–∑–∞–ø–∞—Ö –∫–∞–Ω–∞–ª—ñ–∑–∞—Ü—ñ—ó –Ω–æ–º–µ—Ä—ñ,0.352442
–ø–æ–≥–∞–Ω–∞ —à—É–º–æ—ñ–∑–æ–ª—è—Ü—ñ—è —á—É—Ç–∏,0.352442
—á—É—Ç–Ω–æ –≤—ñ–¥–±—É–≤–∞—î—Ç—å—Å—è –∫–æ—Ä–∏–¥–æ—Ä—ñ,0.348246
–ø–æ–≥–∞–Ω–∞ –∑–≤—É–∫–æ—ñ–∑–æ–ª—è—Ü—ñ—è —á—É—Ç–Ω–æ,0.347303
—á—É—Ç–∏ –≤—ñ–¥–±—É–≤–∞—î—Ç—å—Å—è –∫–æ—Ä–∏–¥–æ—Ä—ñ,0.347241
—Å–ª–∞–±–∫–∏–π wi fi,0.345493


In [40]:
df['pos_text'][137]

'–í—Å—ñ —Ñ–æ—Ç–æ–≥—Ä–∞—Ñ—ñ—ó –≤—ñ–¥–ø–æ–≤—ñ–¥–∞—é—Ç—å —Ä–µ–∞–ª—å–Ω–æ—Å—Ç—ñ. –®—Ç–æ—Ä–∫–∞, —Ä–æ–±–æ—á–∞ —Ä–æ–∑–µ—Ç–∫–∞, –ª–∞–º–ø–∞. –ó—Ä—É—á–Ω–µ –ª—ñ–∂–∫–æ, –Ω–æ–≤–∏–π –ø–æ—Å—É–¥ —ñ —á–∏—Å—Ç—ñ –≤–∞–Ω–Ω—ñ –∫—ñ–º–Ω–∞—Ç–∏.–†–µ–º–æ–Ω—Ç –Ω–æ–≤–∏–π. –ê–¥–º—ñ–Ω—ñ—Å—Ç—Ä–∞—Ç–æ—Ä–∏ –¥—É–∂–µ –ø—Ä–∏—î–º–Ω—ñ —ñ –≤–∏–∫–æ–Ω—É—é—Ç—å —Å–≤–æ—é —Ä–æ–±–æ—Ç—É —à–≤–∏–¥–∫–æ —Ç–∞ —è–∫—ñ—Å–Ω–æ. –í—Ä–∞–∂–µ–Ω–Ω—è –Ω–∞–π-–Ω–∞–π–∫—Ä–∞—â—ñ.'

## Save pmi results to file

In [82]:
pmiPos_bigram.index.name = 'ngram'
pmiPos_bigram = pmiPos_bigram.rename(columns={"pmipositive" : "score"})


In [83]:
pmiPos_bigram.to_csv('./data/bigram-pmi-positive-scores.csv')

In [84]:
pmiPos_unigram.index.name = 'ngram'
pmiPos_unigram = pmiPos_unigram.rename(columns={"pmipositive" : "score"})

In [85]:
pmiPos_unigram.to_csv('./data/unigram-pmi-positive-scores.csv')

In [86]:
pmiNeg_unigram.index.name = 'ngram'
pmiNeg_unigram = pmiNeg_unigram.rename(columns={"pminegative" : "score"})

In [87]:
pmiNeg_unigram.to_csv('./data/unigram-pmi-negative-scores.csv')

In [88]:
pmiNeg_bigram.index.name = 'ngram'
pmiNeg_bigram = pmiNeg_bigram.rename(columns={"pminegative" : "score"})

In [89]:
pmiNeg_bigram.to_csv('./data/bigram-pmi-negative-scores.csv')

In [92]:
pmiPos_trigram.index.name = 'ngram'
pmiPos_trigram = pmiPos_trigram.rename(columns={"pmipositive" : "score"})

In [93]:
pmiPos_trigram.to_csv('./data/trigram-pmi-positive-scores.csv')

In [94]:
pmiNeg_trigram.index.name = 'ngram'
pmiNeg_trigram = pmiNeg_trigram.rename(columns={"pminegative" : "score"})

In [95]:
pmiNeg_trigram.to_csv('./data/trigram-pmi-negative-scores.csv')

In [96]:
pmiNeg_trigram

Unnamed: 0_level_0,score
ngram,Unnamed: 1_level_1
–≤—ñ–¥—Å—É—Ç–Ω—ñ—Å—Ç—å –ø–∏—Ç–Ω–æ—ó –≤–æ–¥–∏,0.352442
–∑–∞–ø–∞—Ö –≤–∞–Ω–Ω—ñ–π –∫—ñ–º–Ω–∞—Ç—ñ,0.352442
–∑–≤—É–∫–æ—ñ–∑–æ–ª—è—Ü—ñ—è –∑–∞–ª–∏—à–∞—î –±–∞–∂–∞—Ç–∏,0.352442
–ø–æ—Ä–æ–∂–Ω—ñ–π –º—ñ–Ω—ñ –±–∞—Ä,0.352442
—Å–ª–∞–±–∫–∏–π —Å–∏–≥–Ω–∞–ª wi,0.352442
–ø–æ–≥–∞–Ω–∞ –∑–≤—É–∫–æ—ñ–∑–æ–ª—è—Ü—ñ—è –Ω–æ–º–µ—Ä—ñ–≤,0.352442
—á—É—Ç–∏ –≤—ñ–¥–±—É–≤–∞—î—Ç—å—Å—è —Å—É—Å—ñ–¥–Ω—ñ—Ö,0.352442
–≤—ñ–¥—Å—É—Ç–Ω—ñ—Å—Ç—å –æ–¥–Ω–æ—Ä–∞–∑–æ–≤–∏—Ö –∫–∞–ø—Ü—ñ–≤,0.352442
–≤—ñ–¥—Å—É—Ç–Ω—ñ—Å—Ç—å –º—ñ–Ω—ñ –±–∞—Ä—É,0.352442
–∑–≤—É–∫–æ—ñ–∑–æ–ª—è—Ü—ñ—è —á—É—Ç–Ω–æ —Å—É—Å—ñ–¥—ñ–≤,0.352442


In [101]:
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [125]:
display(pmiPos_unigram.head(5))
display(pmiPos_bigram.head(5))
display(pmiPos_trigram.head(5))

Unnamed: 0_level_0,score
ngram,Unnamed: 1_level_1
—Ç–æ—Ä–≥–æ–≤–∏–π,0.251974
—Å—Ç–∏–ª—å–Ω–∏–π,0.251648
–ø—Ä–æ—Å—Ç–æ—Ä–æ,0.250339
—à–∏–∫–∞—Ä–Ω–µ,0.249222
–≤—ñ–¥–º—ñ–Ω–Ω–µ,0.249179


Unnamed: 0_level_0,score
ngram,Unnamed: 1_level_1
–º—ñ—Å—Ü–µ–∑–Ω–∞—Ö–æ–¥–∂–µ–Ω–Ω—è —Å—É–ø–µ—Ä,0.255065
–ø—Ä–∏–≤—ñ—Ç–Ω–∞ –≥–æ—Å–ø–æ–¥–∏–Ω—è,0.255065
—á—É–¥–æ–≤–µ —Å–ø—ñ–≤–≤—ñ–¥–Ω–æ—à–µ–Ω–Ω—è,0.255065
–ø—Ä–æ—Å—Ç–æ—Ä—ñ –∫—ñ–º–Ω–∞—Ç–∏,0.255065
—Ö–æ—Ä–æ—à—ñ –Ω–æ–º–µ—Ä–∏,0.255065


Unnamed: 0_level_0,score
ngram,Unnamed: 1_level_1
–ø–µ—Ä—Å–æ–Ω–∞–ª –≤–≤—ñ—á–ª–∏–≤–∏–π –≥–æ—Ç–æ–≤–∏–π,0.255065
–ø—Ä–∏—î–º–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª —Å–º–∞—á–Ω—ñ,0.255065
–ø—Ä–∏—î–º–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª —Ö–æ—Ä–æ—à–∏–π,0.255065
–ø—Ä–∏—î–º–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª —á–∏—Å—Ç–∏–π,0.255065
–ø–µ—Ä—Å–æ–Ω–∞–ª —Å–º–∞—á–Ω—ñ —Å–∏—Ç–Ω—ñ,0.255065


In [120]:
pmiPos_bigram = pmiPos_bigram.drop(pmiPos_bigram.index[2])

In [105]:
pmiPos_unigram.columns

Index(['score'], dtype='object')

In [104]:
pmiPos_unigram.rename(columns={'ngram': 'unigram'})

Unnamed: 0_level_0,score
ngram,Unnamed: 1_level_1
—Ç–æ—Ä–≥–æ–≤–∏–π,0.251974
—Å—Ç–∏–ª—å–Ω–∏–π,0.251648
–ø—Ä–æ—Å—Ç–æ—Ä–æ,0.250339
—à–∏–∫–∞—Ä–Ω–µ,0.249222
–≤—ñ–¥–º—ñ–Ω–Ω–µ,0.249179
–ø–æ—Ö–≤–∞–ª,0.248875
–Ω–µ–æ–±—Ö—ñ–¥–Ω–∏–º,0.247499
—á—É–¥–æ–≤–µ,0.247456
–ø—Ä–æ—Å—Ç–æ—Ä–∏–π,0.246833
—à–≤–∏–¥–∫–µ,0.246613
