In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib notebook
import matplotlib.pyplot as plt



In [2]:
# PMI score tables for the positive class, one CSV per n-gram order
# (unigram, bigram, trigram), loaded in that order.
pos_unigram_scores, pos_bigram_scores, pos_trigram_scores = map(
    pd.read_csv,
    (
        './data/unigram-pmi-positive-scores.csv',
        './data/bigram-pmi-positive-scores.csv',
        './data/trigram-pmi-positive-scores.csv',
    ),
)

In [3]:
# PMI score tables for the negative class, one CSV per n-gram order
# (unigram, bigram, trigram), loaded in that order.
neg_unigram_scores, neg_bigram_scores, neg_trigram_scores = map(
    pd.read_csv,
    (
        './data/unigram-pmi-negative-scores.csv',
        './data/bigram-pmi-negative-scores.csv',
        './data/trigram-pmi-negative-scores.csv',
    ),
)

#### The class for acquiring the top trigrams, bigrams and unigrams with a specific threshold

In [4]:
class NgramManager:
    """Find the scored n-grams that occur in a piece of text.

    Each score table is a DataFrame with an ``ngram`` column (the n-gram
    text) and a ``score`` column. A text is split into n-grams with
    ``CountVectorizer``; every n-gram that is present in the matching score
    table is returned together with its score, best score first.

    Parameters
    ----------
    unigram_scores, bigram_scores, trigram_scores : DataFrame
        Score tables for 1-, 2- and 3-grams.
    stop_words : optional
        Passed unchanged to ``CountVectorizer(stop_words=...)``.
    threshold : number or None
        When set, only n-grams with ``score >= threshold`` are returned.
        ``None`` (the default) keeps every n-gram found in the score table.
    """

    # Sentinel returned by ``find_ngram_score`` for an n-gram that is not in
    # the score table. A real score equal to this value is treated as missing.
    INVALID_SCORE = -100

    def __init__(self, unigram_scores,
                 bigram_scores,
                 trigram_scores,
                 stop_words=None,
                 threshold=None):

        self.trigram_scores = trigram_scores
        self.bigram_scores = bigram_scores
        self.unigram_scores = unigram_scores

        self.__threshold = threshold
        self.__ignore_score = False
        self.__stop_words = stop_words

    def ignore_score(self, enabled):
        """Enable/disable threshold filtering (``True`` bypasses the threshold)."""
        self.__ignore_score = enabled

    def find_important_unigrams(self, text: str):
        """Return ``(unigram, score)`` pairs found in ``text``, best first."""
        return self.find_importnat_ngrams_for_text(text, self.unigram_scores, (1, 1))

    def find_important_bigrams(self, text: str):
        """Return ``(bigram, score)`` pairs found in ``text``, best first."""
        return self.find_importnat_ngrams_for_text(text, self.bigram_scores, (2, 2))

    def find_important_trigrams(self, text: str):
        """Return ``(trigram, score)`` pairs found in ``text``, best first."""
        return self.find_importnat_ngrams_for_text(text, self.trigram_scores, (3, 3))

    def find_importnat_ngrams_for_text(self, text: str, ngram_scores, ngram_range):
        """Return the scored n-grams of ``text``, sorted by score descending.

        The misspelled name is kept because callers use it; see
        ``find_important_ngrams_for_text`` for the correctly spelled alias.

        Parameters
        ----------
        text : str
            Text to split into n-grams.
        ngram_scores : DataFrame
            Score table with ``ngram`` and ``score`` columns.
        ngram_range : tuple
            ``(min_n, max_n)`` forwarded to ``CountVectorizer``.

        Returns
        -------
        list of (str, score) tuples
            N-grams missing from ``ngram_scores`` are dropped. The threshold is
            applied unless it is ``None`` or ``ignore_score(True)`` was called.
        """
        ngrams = NgramManager.get_all_ngrams_for_text(text, ngram_range, self.__stop_words)

        # Both switches mean "keep every n-gram that has a score".
        keep_all = self.__ignore_score or self.__threshold is None

        important_ngrams = []
        for ngram in ngrams:
            score = self.find_ngram_score(ngram_scores, ngram)
            if score == self.INVALID_SCORE:
                continue
            if keep_all or score >= self.__threshold:
                important_ngrams.append((ngram, score))

        important_ngrams.sort(key=lambda pair: pair[1], reverse=True)
        return important_ngrams

    def find_important_ngrams_for_text(self, text: str, ngram_scores, ngram_range):
        """Correctly spelled alias of ``find_importnat_ngrams_for_text``."""
        return self.find_importnat_ngrams_for_text(text, ngram_scores, ngram_range)

    @staticmethod
    def find_ngram_score(ngram_scores, ngram: str):
        """Return the score of ``ngram`` or ``INVALID_SCORE`` when it is absent.

        When the table lists the n-gram more than once, the first row wins.
        """
        matches = ngram_scores.loc[ngram_scores['ngram'] == ngram, 'score']
        if len(matches) == 0:
            return NgramManager.INVALID_SCORE

        return matches.values[0]

    @staticmethod
    def get_all_ngrams_for_text(text: str, ngram_range, stop_words):
        """Return the list of distinct n-grams that ``CountVectorizer`` extracts.

        Returns ``[]`` for a non-string or blank ``text`` and when the
        vectorizer rejects the input with ``ValueError`` (which is what it
        raises when nothing is left after tokenisation / stop-word removal).
        Other exceptions propagate instead of being silently swallowed.
        """
        # Missing values read from CSV arrive as float NaN, not str.
        if not isinstance(text, str) or not text.strip():
            return []

        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
        try:
            vectorizer.fit([text])
        except ValueError:
            return []

        # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
        # get_feature_names(); support both so the lookup does not silently
        # yield nothing on newer versions.
        if hasattr(vectorizer, 'get_feature_names_out'):
            return vectorizer.get_feature_names_out().tolist()
        return list(vectorizer.get_feature_names())

### Load dataset

In [5]:
!ls dataset/booking

analyze.ipynb
booking-rating-for-one-hot-test.csv
booking-rating-for-one-hot-train.csv
booking-rating-for-one-hot-val.csv
booking-rating-test.csv
booking-rating-train.csv
booking-rating-val.csv
booking-sentences-test.csv
booking-sentences-train.csv
booking-sentences-val.csv
booking-test.csv
booking-train.csv
booking-val.csv
create-dataset-for-rating-classification.ipynb
create-detect-ngram-dataset.ipynb
create-sentence-classification-dataset.ipynb
dnipro-reviews.csv
ivano-frankivsk-reviews.csv
kharkiv-reviews.csv
kyiv-reviews.csv
lviv-reviews.csv
odesa-reviews.csv
[34mtranslated[m[m
uzhgorod-reviews.csv


In [6]:
def read_stop_words(file):
    """Read a stop-word list from a text file with one word per line.

    The file is decoded as UTF-8 (a leading BOM is tolerated) so the result
    does not depend on the platform's default encoding. Surrounding
    whitespace is stripped and blank lines are skipped, so a trailing
    newline no longer yields an empty-string "stop word".

    Parameters
    ----------
    file : str or path-like
        Path of the stop-word file.

    Returns
    -------
    list of str
        The stop words in file order.
    """
    with open(file, encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]

In [7]:
# Stop-word list that is later handed to NgramManager as `stop_words`.
uk_stop_words = read_stop_words('./data/ukrainian-stopwords.txt')

In [8]:
# Sentence-level CSV files (train, test and validation); all three are read
# and concatenated into a single frame below.
senteces_files = [
    'dataset/booking/booking-sentences-train.csv',
    'dataset/booking/booking-sentences-test.csv',
    'dataset/booking/booking-sentences-val.csv',
]

In [9]:
def read_all_frames(files):
    """Read several CSV files and stack them into one DataFrame.

    Parameters
    ----------
    files : iterable of str or path-like
        CSV paths, read with ``pd.read_csv`` in the given order.

    Returns
    -------
    DataFrame
        All rows in file order with a fresh 0..n-1 index, so row labels stay
        unique instead of restarting at 0 for every file.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``files`` is empty.
    """
    frames = [pd.read_csv(file) for file in files]
    return pd.concat(frames, ignore_index=True)

In [10]:
full_df = read_all_frames(senteces_files)

In [11]:
full_df

Unnamed: 0,sentence,label
0,Хороший готель.,pos_2
1,Перебувала в готелі одну добу.,pos_2
2,Повернули полную терасовані футболку.,neg_1
3,"Комфортні номери, хороша кухня)",pos_2
4,"Прогулянка до пляжу займала лише 20 хвилин, че...",pos_2
5,У головному корпусі до 23ч галаслива музика і ...,neg_1
6,Ціна для такого номера (6 м.кв.) дорога.,neg_1
7,"Добре місцезнаходження, своя приватна автостоя...",pos_2
8,"Смачний сніданок, хороший номер, гостинний пер...",pos_2
9,І відвертий плінтус на 3 поверсі в коридорі . ...,neg_1


In [12]:
# Peek at the label of the first row to see what label values look like.
full_df.iloc[0]['label']

'pos_2'

In [13]:
def get_pos_neg_text(df):
    """Split the sentences of ``df`` into positive and negative lists.

    Parameters
    ----------
    df : DataFrame
        Must have a ``sentence`` column and a ``label`` column.

    Returns
    -------
    (list, list)
        Sentences labelled ``'pos_2'`` and sentences labelled ``'neg_1'``,
        each in row order. Rows with any other label are ignored.
    """
    # Positional numpy masks keep this correct even when the index contains
    # duplicate labels (e.g. after concatenating several files), and avoid
    # building a row Series twice per row.
    sentences = df['sentence'].to_numpy()
    is_pos = (df['label'] == 'pos_2').to_numpy()
    is_neg = (df['label'] == 'neg_1').to_numpy()

    return sentences[is_pos].tolist(), sentences[is_neg].tolist()
        

In [14]:
pos_texts, neg_texts = get_pos_neg_text(full_df)

In [None]:
neg_texts

In [None]:
pos_texts[:5]

In [None]:
neg_texts[:5]

In [None]:
def calc_num_of_coincided_ngram(texts, find_ngram_fn):
    """Count the texts for which ``find_ngram_fn`` returns a non-empty result.

    ``find_ngram_fn`` is called once per text, in order; its result only
    needs to support ``len()``.
    """
    return sum(1 for text in texts if len(find_ngram_fn(text)) > 0)

In [None]:
def show_statistics(tops, procents_unigrams, title):
    """Draw a bar chart with one bar per entry of ``tops``.

    Parameters
    ----------
    tops : iterable
        Values used to build the tick labels ("top - <value>").
    procents_unigrams : object with ``.tolist()``
        Bar heights; the y axis is fixed to 0..100 and labelled '%'.
    title : str
        Chart title.
    """
    tick_labels = [f"top - {top}" for top in tops]
    bar_heights = procents_unigrams.tolist()
    positions = np.arange(len(tick_labels))

    plt.bar(positions, bar_heights, width=0.8)
    plt.xticks(positions, tick_labels)
    plt.grid()

    # Axis decoration: percentages always use the full 0-100 scale.
    plt.ylim(0, 100)
    plt.ylabel('%')
    plt.title(title)

    plt.show()

In [None]:
def create_n_grams_dataset(texts, unigram_scores, bigram_score, trigram_scores, stop_words):
    """Pick the best-scoring unigram, bigram and trigram for every text.

    An ``NgramManager`` without a threshold is built from the three score
    tables. For each text the first element of each finder's result (the
    results are sorted by score, best first) is kept, or ``None`` when the
    finder returns nothing.

    Returns
    -------
    (list, list, list)
        Unigrams, bigrams and trigrams; each list has one entry per text.
    """
    manager = NgramManager(
        unigram_scores=unigram_scores,
        bigram_scores=bigram_score,
        trigram_scores=trigram_scores,
        stop_words=stop_words,
    )

    def first_ngram_or_none(scored_ngrams):
        # scored_ngrams is a list of (ngram, score) tuples.
        return scored_ngrams[0][0] if len(scored_ngrams) > 0 else None

    unigrams, bigrams, trigrams = [], [], []

    for text in texts:
        unigrams.append(first_ngram_or_none(manager.find_important_unigrams(text)))
        bigrams.append(first_ngram_or_none(manager.find_important_bigrams(text)))
        trigrams.append(first_ngram_or_none(manager.find_important_trigrams(text)))

    return unigrams, bigrams, trigrams

### Positive data

In [None]:
pos_unigrams, pos_bigrams, pos_trigrams = create_n_grams_dataset(pos_texts, 
                                                                pos_unigram_scores,
                                                                pos_bigram_scores,
                                                                pos_trigram_scores,
                                                                uk_stop_words)

In [None]:
len(pos_unigrams)

In [None]:
len(pos_bigrams)

In [None]:
len(pos_trigrams)

In [None]:
positive_data = pd.DataFrame.from_dict({"text": pos_texts, 
                                        'unigram': pos_unigrams,
                                        'bigram': pos_bigrams,
                                        'trigram': pos_trigrams})

In [None]:
print(f"There were found {100*sum(np.array(pos_unigrams) != None) / len(pos_texts)}% positive unigrams")

In [None]:
print(f"There were found {100*sum(np.array(pos_bigrams) != None) / len(pos_texts)}% positive bigrams")

In [None]:
print(f"There were found {100*sum(np.array(pos_trigrams) != None) / len(pos_texts)}% positive trigrams")

In [None]:
positive_data

In [None]:
t = type(positive_data['trigram'].values[0])

In [None]:
t

In [None]:
positive_data['unigram'] = positive_data['unigram'].apply(lambda x: 'None' if type(x) is not str else x)

In [None]:
positive_data['unigram'].values[0]

In [None]:
len(positive_data.loc[positive_data['trigram'] != 'None'])

In [None]:
len(positive_data.loc[positive_data['bigram'] != 'None'])

In [None]:
len(positive_data.loc[positive_data['unigram'] != 'None'])

In [None]:
positive_data.loc[positive_data['trigram'] != 'None']

In [None]:
positive_data.to_csv("./data/pos-sentence-ngram.csv")

In [None]:
negative_data.to_csv("./data/pos-sentence-ngram.csv")

### Negative data

In [None]:
neg_unigrams, neg_bigrams, neg_trigrams = create_n_grams_dataset(neg_texts, 
                                                                neg_unigram_scores,
                                                                neg_bigram_scores,
                                                                neg_trigram_scores,
                                                                uk_stop_words)

In [None]:
negative_data = pd.DataFrame.from_dict({"text": neg_texts, 
                                        'unigram': neg_unigrams,
                                        'bigram': neg_bigrams,
                                        'trigram': neg_trigrams})

In [None]:
print(f"There were found {100*sum(np.array(neg_unigrams) != None) / len(neg_texts)}% negative unigrams")

In [None]:
print(f"There were found {100*sum(np.array(neg_bigrams) != None) / len(neg_texts)}% negative bigrams")

In [None]:
print(f"There were found {100*sum(np.array(neg_trigrams) != None) / len(neg_texts)}% negative trigrams")

In [None]:
negative_data

In [None]:
negative_data['unigram'] = negative_data['unigram'].apply(lambda x: 'None' if type(x) is not str else x)

In [None]:
negative_data['unigram'].values[0]

In [None]:
len(negative_data['unigram'])

In [None]:
len(negative_data.loc[negative_data['bigram'] != 'None'])

In [None]:
len(negative_data.loc[negative_data['trigram'] != 'None'])

## Example for several pos and neg texts using default threshold

In [None]:
def get_all_important_phrases(text: str, ngram_mng):
    """Run the unigram, bigram and trigram finders of ``ngram_mng`` on ``text``.

    Returns a 3-tuple ``(unigrams, bigrams, trigrams)`` holding whatever the
    corresponding ``find_important_*`` method returned, called in that order.
    """
    return (
        ngram_mng.find_important_unigrams(text),
        ngram_mng.find_important_bigrams(text),
        ngram_mng.find_important_trigrams(text),
    )

## positive text

In [None]:
ngram_mng = NgramManager(pos_unigram_scores, pos_bigram_scores, pos_trigram_scores, uk_stop_words, threshold=None)

In [None]:
pos_text = df['text'][127]
pos_text

In [None]:
pos_unigrams, pos_bigrams, pos_trigrams = get_all_important_phrases(pos_text, ngram_mng)

In [None]:
pos_unigrams

In [None]:
pos_bigrams

In [None]:
pos_trigrams

## negative text

In [None]:
neg_ngram_mng = NgramManager(neg_unigram_scores, neg_bigram_scores, neg_trigram_scores, stop_words=uk_stop_words)

In [None]:
neg_unigrams, neg_bigrams, neg_trigrams = get_all_important_phrases(neg_text, neg_ngram_mng)

In [None]:
neg_unigrams

In [None]:
neg_bigrams

In [None]:
neg_trigrams