# Домашнее задание №6 Чебыкина Артёма

In [1]:
%config IPCompleter.greedy=True

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from nltk.stem.porter import *
from stop_words import get_stop_words
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator

In [3]:
pd.options.mode.chained_assignment = None 

### №1

In [4]:
DataSet = pd.read_csv("data/singapore_airlines_reviews.csv")

In [5]:
DataSet.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [6]:
ImportantDataSet = DataSet[["text","title","rating"]]

In [7]:
ImportantDataSet.head()

Unnamed: 0,text,title,rating
0,We used this airline to go from Singapore to L...,Ok,3
1,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,5
2,"Booked, paid and received email confirmation f...",Don’t give them your money,1
3,"Best airline in the world, seats, food, servic...",Best Airline in the World,5
4,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,2


In [8]:
ImportantDataSet['full_review'] = ImportantDataSet['title'] + " " + ImportantDataSet['text']

In [9]:
ImportantDataSet = ImportantDataSet[["full_review","rating"]]

In [10]:
UnprocessedDataSet = ImportantDataSet

In [11]:
ImportantDataSet.head()

Unnamed: 0,full_review,rating
0,Ok We used this airline to go from Singapore t...,3
1,The service in Suites Class makes one feel lik...,5
2,"Don’t give them your money Booked, paid and re...",1
3,Best Airline in the World Best airline in the ...,5
4,Premium Economy Seating on Singapore Airlines ...,2


In [12]:
def preprocess_text(x):
    res = []
    for word in x.lower().split():
        for sign in string.punctuation:
            word = word.replace(sign, '')
        res.append(word)
    return res

In [13]:
ImportantDataSet['full_review'].apply(preprocess_text)

0       [ok, we, used, this, airline, to, go, from, si...
1       [the, service, in, suites, class, makes, one, ...
2       [don’t, give, them, your, money, booked, paid,...
3       [best, airline, in, the, world, best, airline,...
4       [premium, economy, seating, on, singapore, air...
                              ...                        
9995    [flew, to, nz, 1st, half, singapore, airlines,...
9996    [best, airline, and, again, a, great, flight, ...
9997    [superb, service, on, singapore, airlines, we,...
9998    [a, comfortable, fiight, spoiled, by, lack, of...
9999    [delivered, as, expected, , as, always, singap...
Name: full_review, Length: 10000, dtype: object

In [14]:
train, test = train_test_split(ImportantDataSet, random_state=0)

Разделим выборку, и применим метод Bag of Words и TF-IDF.

In [15]:
bow = CountVectorizer()
x_train = bow.fit_transform(train['full_review'])
x_test = bow.transform(test['full_review'])
y_train = train['rating']
y_test = test['rating']

Запустим логистическую регрессию, проверив ее с помощью F1 score, подсчитанного для нескольких классов(оценки 1-5) с помощью Micro averaged F1 Score, чтобы учесть разные метрики.

In [16]:
model = LogisticRegression(max_iter=1000)

In [17]:
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
f1_score(y_pred, y_test,average='micro')

0.6748

In [18]:
tf_idf = TfidfVectorizer()
x_train = tf_idf.fit_transform(train['full_review'])
x_test = tf_idf.transform(test['full_review'])
y_train = train['rating']
y_test = test['rating']


In [19]:
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
f1_score(y_pred, y_test, average='micro')

0.7056

Модель TF-IDF отработала немного лучше

### №2

Создадим Пайплайн по похожей на практику схеме, чтобы протестировать разные комбинации моделей для получения наилучшего результата

In [20]:
class BasePreprocessor(BaseEstimator):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        return np.array(list(map(lambda x: ' '.join(preprocess_text(x)), x)))

In [21]:
stemmer = PorterStemmer()

In [22]:
stop_words = list(get_stop_words('en'))

In [23]:
def better_preprocess_text(x,mode):
    res = []
    for word in x.lower().split():
        if word not in stop_words:
            for sign in string.punctuation:
                word = word.replace(sign, '')
            res.append(word)
    if mode:
        return ' '.join(map(stemmer.stem, res))
    else: return res

In [24]:
class StemPreprocessor(BaseEstimator):
    
    def fit(self, x, y=None):
        return self
    
    def _stem(self, word):
        return stemmer.stem(word)
    
    def _transform_text(self, text):
        return better_preprocess_text(text,1)

    def transform(self, x):
        return list(map(self._transform_text, x))

Не забудем разделить выборку на необработанном датасете

In [25]:
train, test = train_test_split(UnprocessedDataSet, random_state=0)

In [26]:
fit_results = []

x_train = train['full_review']
x_test = test['full_review']
y_train = train['rating']
y_test = test['rating']

for vect in [CountVectorizer(), TfidfVectorizer()]:
    for model in [LogisticRegression(max_iter=1000), RandomForestClassifier()]:
        pipeline = Pipeline(
            [
                ("base", BasePreprocessor()),
                ("stem", StemPreprocessor()),
                ("vect", vect),
                ("model", model),
            ]
        )
        pipeline.fit(x_train, y_train)
        y_pred = pipeline.predict(x_test)
        metric = f1_score(y_pred, y_test, average='micro')
        fit_results.append(
            {
                'vect': vect.__class__.__name__,
                'model': model.__class__.__name__,
                'f1': metric,
            }
        )

fit_results = pd.DataFrame(fit_results)

In [27]:
print(fit_results)

              vect                   model      f1
0  CountVectorizer      LogisticRegression  0.6664
1  CountVectorizer  RandomForestClassifier  0.6288
2  TfidfVectorizer      LogisticRegression  0.7000
3  TfidfVectorizer  RandomForestClassifier  0.6264


Модель TF-IDF с логистической регрессией отработала лучше всех, а с Лесом наоборот хуже(хотя по сути почти также как BOW). Можно увидеть что на таких жанных логистическая регрессия даёт более хороший результат.

Напишем один коментарий на оценку 2 и один на оценку 5:

2 : "Meh" - "The only good thing about this flight was the fact, that we have reached the destination. The food, we were served was stale, and none of the crew seemed to care about passengers or anything for that matter."

5: "Fantastic" - "The flight was great. I appreciated the fact, that the crew was friendly. Both descending and landing were very smooth, which was a very pleasant thing as well."

In [28]:
Bad_comment = "Meh The only good thing about this flight was the fact, that we have reached the destination. The food, we were served was stale, and none of the crew seemed to care about passengers or anything for that matter."
Good_comment = "Fantastic The flight was great. I appreciated the fact, that the crew was friendly. Both descending and landing were very smooth, which was a very pleasant thing as well."

In [29]:
BadProcessed = tf_idf.transform(better_preprocess_text(Bad_comment,0))
GoodProcessed = tf_idf.transform(better_preprocess_text(Good_comment,0))


In [30]:
print("Bad comment:" + str(model.predict(BadProcessed)))

print("Bad comment:" + str(model.predict(GoodProcessed)))

ValueError: X has 17353 features, but RandomForestClassifier is expecting 15291 features as input.

Простая модель оба комментария оценила хорошо, скорее всего из-за того, что не является слишком продвуинутой, а в плохом коментарии я использовал слова, которые часто имеют и положительное значение