In [341]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn import metrics
from catboost import CatBoostRegressor

In [300]:
data_train = pd.read_csv("train.csv", encoding = 'latin-1')
data_train = data_train.dropna()

y_train = data_train.Rating

data_train.Review_Title

0                                       Refuge in Chennai
1                                          Hilton Chennai
2       No worth the rating shown in websites. Pricing...
3                                               Good stay
4                                       Needs improvement
                              ...                        
2345                                    Five star service
2347                                         Homely villa
2348                    Nice accommodation and facilities
2349           The Residency Good Centrally located Hotel
2350                  Homely stay with comfort and luxury
Name: Review_Title, Length: 2136, dtype: object

In [301]:
data_test = pd.read_csv("test.csv", encoding = 'latin-1')
data_test.fillna('', inplace=True)

data_test.head(3)

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text
0,2351,ITC Grand Chola,Mr Neeraj,On the night of my arrival from NY I had a min...
1,2352,Hotel Pandian,,Not so great. But it is still acceptable. Bit...
2,2353,Oyo Rooms Guindy Olympia Tech Park,Nice stay for corporate people,Been a good place to stay for people who visit...


In [302]:
y_test = pd.read_csv("sample submission.csv", encoding = 'latin-1').Rating
y_test.head(3)

0    73.0
1    73.0
2    73.0
Name: Rating, dtype: float64

In [322]:
stop_words = set(open("negative-words.txt", "r").read().split())
good_words = set(open("positive-words.txt", "r").read().split())

In [323]:
import nltk
from nltk.corpus import stopwords

sw_eng = set(stopwords.words('english'))

In [324]:
from nltk import wordnet, pos_tag
def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN

In [325]:
from nltk import WordNetLemmatizer
def my_lemmatizer(sent):
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])

In [326]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Александр\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [327]:
def clean_sentence(sent):
    sent = sent.lower()
    expr = r'[0-9][$&!.?,]'
    parser = re.compile(expr)
    sent = sent.lower()
    tmp_sent = parser.sub(r'', sent)
    
    return my_lemmatizer(tmp_sent)

In [328]:
def count_pos_words(sent):
    pos_ = 0
    pos_ = len([word for word in set(sent.split()) if word in good_words])
    return pos_

def count_neg_words(sent):
    neg_ = 0
    neg_ = len([word for word in set(sent.split()) if word in stop_words])
    return neg_

In [344]:
def prepare_data(df):
    df['Review_Text'] = df['Review_Text'].apply(clean_sentence)


    #df['Good_words'] = df.Review_Text.transform(lambda x: count_pos_words(x))
    #df['Bad_words'] = df.Review_Text.transform(lambda x: count_neg_words(x))

    #df['Good_Review_Title'] = df.Review_Title.transform(lambda x: count_pos_words(x))
    #df['Bad_Review_Title'] = df.Review_Title.transform(lambda x: count_neg_words(x))

    #df['Good_Hotel_name'] = df.Hotel_name.transform(lambda x: count_pos_words(x))
    #df['Bad_Hotel_name'] = df.Hotel_name.transform(lambda x: count_neg_words(x))

    #df = df.drop(columns=["Id", "Review_Title", "Hotel_name"])

    return df

In [345]:
X_train = prepare_data(data_train)
X_train = X_train.drop(columns=["Rating"])

X_test = prepare_data(data_test)

In [346]:
Counter(X_train.Bad_words)

Counter({0: 1422, 1: 487, 2: 138, 3: 63, 4: 18, 7: 1, 5: 6, 6: 1})

In [347]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

corpus = [sent]
X = vectorizer.fit_transform(corpus)
print(vectorizer.vocabulary_)

{'excellent': 7, 'room': 15, 'exercise': 8, 'facility': 9, 'around': 0, 'atmosphere': 1, 'calm': 3, 'comfortable': 4, 'main': 12, 'dining': 5, 'offer': 13, 'food': 11, 'service': 16, 'avoid': 2, 'fly': 10, 'elephant': 6, 'restaurant': 14, 'stick': 17}


In [348]:
X.todense()

matrix([[1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1]],
       dtype=int64)

In [349]:
from sklearn.feature_extraction.text import TfidfVectorizer
idf_vectorizer = TfidfVectorizer()

corpus = [sent]
Y = idf_vectorizer.fit_transform(corpus)
print(idf_vectorizer.vocabulary_)

{'excellent': 7, 'room': 15, 'exercise': 8, 'facility': 9, 'around': 0, 'atmosphere': 1, 'calm': 3, 'comfortable': 4, 'main': 12, 'dining': 5, 'offer': 13, 'food': 11, 'service': 16, 'avoid': 2, 'fly': 10, 'elephant': 6, 'restaurant': 14, 'stick': 17}


In [350]:
Y.todense()

matrix([[0.16903085, 0.16903085, 0.16903085, 0.16903085, 0.16903085,
         0.3380617 , 0.16903085, 0.3380617 , 0.16903085, 0.16903085,
         0.16903085, 0.16903085, 0.3380617 , 0.16903085, 0.16903085,
         0.50709255, 0.16903085, 0.16903085]])

In [356]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train.Review_Text)
X_train_counts.shape

(2136, 4094)

In [357]:
X_test_counts = count_vect.transform(X_test.Review_Text)

#X_test_counts['Good_words'] = X_test.Review_Text.transform(lambda x: count_pos_words(x))
#X_test_counts['Bad_words'] = X_test.Review_Text.transform(lambda x: count_neg_words(x))

X_test_counts.shape

(2352, 4094)

In [358]:
from sklearn.linear_model import SGDClassifier
svm = SGDClassifier()

svm.fit(X_train_counts, y_train)
y_pred = svm.predict(X_test_counts)

metrics.mean_squared_error(y_pred, y_test) ** 0.5

20.15741621764626

In [359]:
cbr = CatBoostRegressor(max_depth=5, random_state=42)

cbr.fit(X_train_counts, y_train)

y_pred = cbr.predict(X_test_counts)

metrics.mean_squared_error(y_pred, y_test) ** 0.5

Learning rate set to 0.044676
0:	learn: 20.7033227	total: 15.1ms	remaining: 15.1s
1:	learn: 20.5246223	total: 30.9ms	remaining: 15.4s
2:	learn: 20.3614545	total: 44.1ms	remaining: 14.7s
3:	learn: 20.2127411	total: 56.9ms	remaining: 14.2s
4:	learn: 20.0606373	total: 70.5ms	remaining: 14s
5:	learn: 19.9041482	total: 85.8ms	remaining: 14.2s
6:	learn: 19.7568142	total: 101ms	remaining: 14.3s
7:	learn: 19.6299762	total: 114ms	remaining: 14.1s
8:	learn: 19.5095125	total: 131ms	remaining: 14.4s
9:	learn: 19.3876530	total: 147ms	remaining: 14.5s
10:	learn: 19.2875863	total: 163ms	remaining: 14.6s
11:	learn: 19.1784496	total: 190ms	remaining: 15.6s
12:	learn: 19.0978716	total: 206ms	remaining: 15.6s
13:	learn: 19.0042450	total: 219ms	remaining: 15.4s
14:	learn: 18.9011928	total: 232ms	remaining: 15.2s
15:	learn: 18.8157132	total: 245ms	remaining: 15.1s
16:	learn: 18.7189076	total: 258ms	remaining: 14.9s
17:	learn: 18.6409311	total: 270ms	remaining: 14.8s
18:	learn: 18.5650091	total: 284ms	remai

161:	learn: 15.1172630	total: 2.36s	remaining: 12.2s
162:	learn: 15.1010082	total: 2.39s	remaining: 12.3s
163:	learn: 15.0892168	total: 2.41s	remaining: 12.3s
164:	learn: 15.0849261	total: 2.43s	remaining: 12.3s
165:	learn: 15.0742370	total: 2.44s	remaining: 12.3s
166:	learn: 15.0529599	total: 2.45s	remaining: 12.2s
167:	learn: 15.0413504	total: 2.47s	remaining: 12.2s
168:	learn: 15.0327337	total: 2.48s	remaining: 12.2s
169:	learn: 15.0163780	total: 2.49s	remaining: 12.2s
170:	learn: 15.0061056	total: 2.5s	remaining: 12.1s
171:	learn: 14.9961904	total: 2.52s	remaining: 12.1s
172:	learn: 14.9862102	total: 2.53s	remaining: 12.1s
173:	learn: 14.9731043	total: 2.55s	remaining: 12.1s
174:	learn: 14.9612786	total: 2.56s	remaining: 12.1s
175:	learn: 14.9536119	total: 2.58s	remaining: 12.1s
176:	learn: 14.9388076	total: 2.59s	remaining: 12s
177:	learn: 14.9142520	total: 2.6s	remaining: 12s
178:	learn: 14.9024994	total: 2.62s	remaining: 12s
179:	learn: 14.8963502	total: 2.63s	remaining: 12s
180

325:	learn: 13.4784264	total: 4.74s	remaining: 9.79s
326:	learn: 13.4661915	total: 4.75s	remaining: 9.78s
327:	learn: 13.4539359	total: 4.77s	remaining: 9.77s
328:	learn: 13.4498896	total: 4.78s	remaining: 9.76s
329:	learn: 13.4474649	total: 4.8s	remaining: 9.75s
330:	learn: 13.4381333	total: 4.82s	remaining: 9.73s
331:	learn: 13.4292829	total: 4.83s	remaining: 9.72s
332:	learn: 13.4123101	total: 4.84s	remaining: 9.7s
333:	learn: 13.4001596	total: 4.86s	remaining: 9.69s
334:	learn: 13.3881673	total: 4.87s	remaining: 9.67s
335:	learn: 13.3857681	total: 4.89s	remaining: 9.66s
336:	learn: 13.3775815	total: 4.9s	remaining: 9.64s
337:	learn: 13.3715890	total: 4.91s	remaining: 9.62s
338:	learn: 13.3665103	total: 4.93s	remaining: 9.61s
339:	learn: 13.3592383	total: 4.95s	remaining: 9.6s
340:	learn: 13.3421614	total: 4.96s	remaining: 9.59s
341:	learn: 13.3307815	total: 4.97s	remaining: 9.57s
342:	learn: 13.3268391	total: 4.99s	remaining: 9.55s
343:	learn: 13.3162117	total: 5.02s	remaining: 9.5

484:	learn: 12.4629470	total: 7.33s	remaining: 7.78s
485:	learn: 12.4502161	total: 7.35s	remaining: 7.77s
486:	learn: 12.4463568	total: 7.36s	remaining: 7.75s
487:	learn: 12.4444945	total: 7.38s	remaining: 7.74s
488:	learn: 12.4310222	total: 7.39s	remaining: 7.72s
489:	learn: 12.4230572	total: 7.4s	remaining: 7.71s
490:	learn: 12.4200177	total: 7.42s	remaining: 7.69s
491:	learn: 12.4181702	total: 7.43s	remaining: 7.67s
492:	learn: 12.4131438	total: 7.44s	remaining: 7.65s
493:	learn: 12.4060087	total: 7.46s	remaining: 7.64s
494:	learn: 12.4030296	total: 7.47s	remaining: 7.62s
495:	learn: 12.3995866	total: 7.48s	remaining: 7.6s
496:	learn: 12.3917811	total: 7.5s	remaining: 7.59s
497:	learn: 12.3820761	total: 7.52s	remaining: 7.58s
498:	learn: 12.3793106	total: 7.53s	remaining: 7.56s
499:	learn: 12.3772968	total: 7.56s	remaining: 7.56s
500:	learn: 12.3712206	total: 7.58s	remaining: 7.55s
501:	learn: 12.3670223	total: 7.59s	remaining: 7.53s
502:	learn: 12.3643268	total: 7.6s	remaining: 7.5

650:	learn: 11.7222326	total: 9.83s	remaining: 5.27s
651:	learn: 11.7196103	total: 9.86s	remaining: 5.26s
652:	learn: 11.7102245	total: 9.89s	remaining: 5.26s
653:	learn: 11.7072197	total: 9.92s	remaining: 5.25s
654:	learn: 11.7047138	total: 9.94s	remaining: 5.24s
655:	learn: 11.7012036	total: 9.96s	remaining: 5.22s
656:	learn: 11.6984469	total: 9.98s	remaining: 5.21s
657:	learn: 11.6958453	total: 10s	remaining: 5.2s
658:	learn: 11.6933920	total: 10s	remaining: 5.19s
659:	learn: 11.6911015	total: 10s	remaining: 5.17s
660:	learn: 11.6894093	total: 10.1s	remaining: 5.16s
661:	learn: 11.6877293	total: 10.1s	remaining: 5.14s
662:	learn: 11.6807588	total: 10.1s	remaining: 5.13s
663:	learn: 11.6719827	total: 10.1s	remaining: 5.11s
664:	learn: 11.6694928	total: 10.1s	remaining: 5.09s
665:	learn: 11.6680100	total: 10.1s	remaining: 5.08s
666:	learn: 11.6629183	total: 10.1s	remaining: 5.06s
667:	learn: 11.6581446	total: 10.2s	remaining: 5.05s
668:	learn: 11.6564954	total: 10.2s	remaining: 5.03s


816:	learn: 11.1233687	total: 12.6s	remaining: 2.82s
817:	learn: 11.1212862	total: 12.6s	remaining: 2.8s
818:	learn: 11.1168532	total: 12.6s	remaining: 2.79s
819:	learn: 11.1155616	total: 12.6s	remaining: 2.77s
820:	learn: 11.1140813	total: 12.6s	remaining: 2.76s
821:	learn: 11.1131044	total: 12.7s	remaining: 2.74s
822:	learn: 11.1096438	total: 12.7s	remaining: 2.73s
823:	learn: 11.1070059	total: 12.7s	remaining: 2.71s
824:	learn: 11.1044316	total: 12.7s	remaining: 2.69s
825:	learn: 11.0977213	total: 12.7s	remaining: 2.68s
826:	learn: 11.0966511	total: 12.7s	remaining: 2.66s
827:	learn: 11.0886567	total: 12.8s	remaining: 2.65s
828:	learn: 11.0860453	total: 12.8s	remaining: 2.63s
829:	learn: 11.0839772	total: 12.8s	remaining: 2.62s
830:	learn: 11.0828498	total: 12.8s	remaining: 2.61s
831:	learn: 11.0802911	total: 12.8s	remaining: 2.59s
832:	learn: 11.0789387	total: 12.8s	remaining: 2.57s
833:	learn: 11.0769609	total: 12.9s	remaining: 2.56s
834:	learn: 11.0755195	total: 12.9s	remaining: 

983:	learn: 10.5346040	total: 15.1s	remaining: 246ms
984:	learn: 10.5320844	total: 15.2s	remaining: 231ms
985:	learn: 10.5302588	total: 15.2s	remaining: 216ms
986:	learn: 10.5248977	total: 15.2s	remaining: 200ms
987:	learn: 10.5226791	total: 15.2s	remaining: 185ms
988:	learn: 10.5208712	total: 15.2s	remaining: 169ms
989:	learn: 10.5147722	total: 15.2s	remaining: 154ms
990:	learn: 10.5130113	total: 15.3s	remaining: 139ms
991:	learn: 10.5081184	total: 15.3s	remaining: 123ms
992:	learn: 10.5069758	total: 15.3s	remaining: 108ms
993:	learn: 10.5051071	total: 15.3s	remaining: 92.4ms
994:	learn: 10.5038337	total: 15.3s	remaining: 77ms
995:	learn: 10.4995427	total: 15.3s	remaining: 61.6ms
996:	learn: 10.4966162	total: 15.3s	remaining: 46.2ms
997:	learn: 10.4912812	total: 15.4s	remaining: 30.8ms
998:	learn: 10.4832583	total: 15.4s	remaining: 15.4ms
999:	learn: 10.4799132	total: 15.4s	remaining: 0us


13.625399701324