# 1. Osnovne biblioteke i skup podataka za rad

In [2]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
data = pd.read_csv('FinalAirlineReviews.csv')

In [5]:
data.sample(5)

Unnamed: 0,OriginalReview,FilteredReview,StemmedReview,LemmatizedReview,Sentiment2Cat,Sentiment3Cat,Sentiment4Cat
74365,Flight from MUC to ORD Economy Class on Airbus...,flight muc ord economy class airbus windows se...,flight muc ord economi class airbu window seat...,flight muc ord economy class airbus windows se...,Negative,Negative,Extremely Negative
69903,Overall the flight was modest. The check-in wa...,overall flight modest check done using web che...,overal flight modest check done use web check ...,overall flight modest check do use web check s...,Positive,Positive,Extremely Positive
116429,Istanbul to Bucharest. We make our check in in...,istanbul bucharest make check airport take lug...,istanbul bucharest make check airport take lug...,istanbul bucharest make check airport take lug...,Negative,Negative,Extremely Negative
28650,I had nine flights with Avianca this summer to...,nine flights avianca summer around latin ameri...,nine flight avianca summer around latin americ...,nine flight avianca summer around latin americ...,Positive,Positive,Extremely Positive
13468,AI 864 from BOM-IXC via DEL. A321 with decent ...,ai bom ixc via del decent leg space slightly l...,ai bom ixc via del decent leg space slightli l...,ai bom ixc via del decent leg space slightly l...,Positive,Neutral,Mildly Positive


# 2. Bag of Words Models

In [6]:
from enum import Enum

In [7]:
class ReviewType(Enum):
    Filtered = 1,
    Stemmed = 2,
    Lemmatized = 3

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

def bagOfWordsDataFrame(reviewType: ReviewType, numOfSentiments: int, maxFeatures: int):
    
    reviewAttribute = ""
    sentimentAttribute = ""
    
    if reviewType == ReviewType.Filtered:
        reviewAttribute = "FilteredReview"
    elif reviewType == ReviewType.Stemmed:
        reviewAttribute = "StemmedReview"
    elif reviewType == ReviewType.Lemmatized:
        reviewAttribute = "LemmatizedReview"
    
    if numOfSentiments == 2:
        sentimentAttribute = "Sentiment2Cat"
    elif numOfSentiments == 3:
        sentimentAttribute = "Sentiment3Cat"
    elif numOfSentiments == 4:
        sentimentAttribute = "Sentiment4Cat"
    
    reviewList = data[reviewAttribute].to_list()
    
    countVector = CountVectorizer(max_features=maxFeatures)
    
    wordVectors = countVector.fit_transform(reviewList)
    featureNames = countVector.get_feature_names_out()
    
    bowDataFrame = pd.DataFrame(wordVectors.toarray(), columns=featureNames)
    
    sentimentColumn = data[sentimentAttribute]
    
    bowDataFrame = pd.concat([bowDataFrame, sentimentColumn], axis=1)
    
    return bowDataFrame

In [9]:
bow = bagOfWordsDataFrame(ReviewType.Filtered, 2, 100)

In [10]:
bow.sample(5)

Unnamed: 0,air,aircraft,airline,airlines,airport,also,another,arrived,asked,back,bag,baggage,better,board,boarding,booked,business,cabin,cancelled,check,class,comfortable,could,crew,customer,day,delay,delayed,drinks,due,economy,entertainment,even,experience,extra,first,flew,flight,flights,fly,flying,food,friendly,gate,get,go,good,got,great,hour,hours,however,last,late,left,leg,like,long,luggage,made,make,meal,minutes,money,much,never,new,next,nice,no,not,one,paid,passengers,pay,people,plane,really,return,said,seat,seats,service,staff,still,take,ticket,time,times,told,took,trip,two,us,use,via,way,well,worst,would,Sentiment2Cat
48754,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,Negative
26434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Positive
77835,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Positive
36811,0,0,0,1,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,1,1,0,0,2,0,0,0,0,0,0,Positive
87945,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,2,0,0,0,0,0,3,1,0,0,0,0,0,1,0,0,0,0,0,Negative


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def getEvaluationMetrics(model, reviewType: ReviewType, numOfSentiments: int, maxFeatures: int):
    
    bowData = bagOfWordsDataFrame(reviewType, numOfSentiments, maxFeatures)
    x, y = bowData.iloc[:, :-1], bowData.iloc[:, -1]
    #x.drop(columns=['no', 'not'], inplace=True)
    
    xtr, xtest, ytr, ytest = train_test_split(x, y, test_size=0.25, random_state=18)
    
    model.fit(xtr, ytr)
    
    predictionNB = model.predict(xtest)
    
    acc = accuracy_score(predictionNB, ytest)
    prec = precision_score(predictionNB, ytest, average='weighted')
    recall = recall_score(predictionNB, ytest, average='weighted')
    f1 = f1_score(predictionNB, ytest, average='weighted')
    
    return acc, prec, recall, f1

In [12]:
from sklearn.naive_bayes import GaussianNB

def metricLists(start: int, stop: int, step: int):
    
    accList, precList, recallList, f1List = list(), list(), list(), list()

    for numOfFeatures in range(start, stop, step):

        naiveBayes = GaussianNB()

        acc, prec, recall, f1 = getEvaluationMetrics(naiveBayes, ReviewType.Stemmed, 2, numOfFeatures)

        accList.append(acc)
        precList.append(prec)
        recallList.append(recall)
        f1List.append(f1)
        
    return accList, precList, recallList, f1List

# 3. Classification Models

In [13]:
#ucitavamo neophodne biblioteke evaluaciju modela klasifikacije

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [25]:
bowData = bagOfWordsDataFrame(ReviewType.Filtered, 4, 1000)

In [26]:
x, y = bowData.iloc[:, :-1], bowData.iloc[:, -1]

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [28]:
xtr, xtest, ytr, ytest = train_test_split(x, y, test_size=0.25, random_state=18)

## 3.1 Naive Bayes Model

In [62]:
#ucitavamo biblioteke za rad sa Naivnim Bajesom

from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB

In [67]:
naiveBayes = GaussianNB()

In [68]:
naiveBayes.fit(xtr, ytr)

GaussianNB()

In [69]:
predictionNB = naiveBayes.predict(xtest)
predictionNB

array(['Positive', 'Positive', 'Negative', ..., 'Positive', 'Negative',
       'Positive'], dtype='<U8')

In [70]:
print(f"Model's accuracy is: {round(accuracy_score(predictionNB, ytest), 4)*100}%")
print(f"Model's precision is: {round(precision_score(predictionNB, ytest, average='weighted'), 4)*100}%")
print(f"Model's recall is: {round(recall_score(predictionNB, ytest, average='weighted'), 4)*100}%")
print(f"Model's f1 is: {round(f1_score(predictionNB, ytest, average='weighted'), 4)*100}%")

Model's accuracy is: 79.44%
Model's precision is: 81.63%
Model's recall is: 79.44%
Model's f1 is: 79.29%


In [71]:
print(classification_report(predictionNB, ytest))

              precision    recall  f1-score   support

    Negative       0.72      0.91      0.81     14556
    Positive       0.90      0.69      0.78     16519

    accuracy                           0.79     31075
   macro avg       0.81      0.80      0.79     31075
weighted avg       0.82      0.79      0.79     31075



## 3.2 Decision Tree Model

In [72]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
dtModel = DecisionTreeClassifier()

In [74]:
dtModel.fit(xtr, ytr)

DecisionTreeClassifier()

In [76]:
predictionDT = dtModel.predict(xtest)
predictionDT

array(['Positive', 'Negative', 'Negative', ..., 'Positive', 'Negative',
       'Positive'], dtype=object)

In [78]:
print(f"Model's accuracy is: {round(accuracy_score(predictionDT, ytest), 4)*100}%")
print(f"Model's precision is: {round(precision_score(predictionDT, ytest, average='weighted'), 4)*100}%")
print(f"Model's recall is: {round(recall_score(predictionDT, ytest, average='weighted'), 4)*100}%")
print(f"Model's f1 is: {round(f1_score(predictionDT, ytest, average='weighted'), 4)*100}%")

Model's accuracy is: 82.14%
Model's precision is: 82.13000000000001%
Model's recall is: 82.14%
Model's f1 is: 82.14%


In [79]:
print(classification_report(predictionDT, ytest))

              precision    recall  f1-score   support

    Negative       0.85      0.85      0.85     18346
    Positive       0.78      0.78      0.78     12729

    accuracy                           0.82     31075
   macro avg       0.82      0.82      0.82     31075
weighted avg       0.82      0.82      0.82     31075



## 3.3 Logistic Regression Model

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
lrModel = LogisticRegression()

In [31]:
lrModel.fit(xtr, ytr)

LogisticRegression()

In [32]:
predictionLR = lrModel.predict(xtest)
predictionLR

array(['Extremely Positive', 'Extremely Positive', 'Extremely Negative',
       ..., 'Extremely Positive', 'Extremely Positive',
       'Extremely Positive'], dtype=object)

In [33]:
print(f"Model's accuracy is: {round(accuracy_score(predictionLR, ytest), 4)*100}%")
print(f"Model's precision is: {round(precision_score(predictionLR, ytest, average='weighted'), 4)*100}%")
print(f"Model's recall is: {round(recall_score(predictionLR, ytest, average='weighted'), 4)*100}%")
print(f"Model's f1 is: {round(f1_score(predictionLR, ytest, average='weighted'), 4)*100}%")

Model's accuracy is: 83.84%
Model's precision is: 91.16%
Model's recall is: 83.84%
Model's f1 is: 87.14%


In [34]:
print(classification_report(predictionLR, ytest))

                    precision    recall  f1-score   support

Extremely Negative       0.95      0.86      0.90     17436
Extremely Positive       0.93      0.84      0.88     12671
   Mildly Negative       0.10      0.33      0.16       781
   Mildly Positive       0.02      0.15      0.04       187

          accuracy                           0.84     31075
         macro avg       0.50      0.55      0.50     31075
      weighted avg       0.91      0.84      0.87     31075



## 3.4 Gradient Boosting Model

In [86]:
from sklearn.ensemble import GradientBoostingClassifier

In [87]:
gbModel = GradientBoostingClassifier()

In [88]:
gbModel.fit(xtr, ytr)

GradientBoostingClassifier()

In [90]:
predictionGB = gbModel.predict(xtest)
predictionGB

array(['Positive', 'Negative', 'Negative', ..., 'Negative', 'Positive',
       'Positive'], dtype=object)

In [91]:
print(f"Model's accuracy is: {round(accuracy_score(predictionGB, ytest), 4)*100}%")
print(f"Model's precision is: {round(precision_score(predictionGB, ytest, average='weighted'), 4)*100}%")
print(f"Model's recall is: {round(recall_score(predictionGB, ytest, average='weighted'), 4)*100}%")
print(f"Model's f1 is: {round(f1_score(predictionGB, ytest, average='weighted'), 4)*100}%")

Model's accuracy is: 88.37%
Model's precision is: 88.66000000000001%
Model's recall is: 88.37%
Model's f1 is: 88.44999999999999%


In [92]:
print(classification_report(predictionGB, ytest))

              precision    recall  f1-score   support

    Negative       0.93      0.88      0.90     19261
    Positive       0.82      0.88      0.85     11814

    accuracy                           0.88     31075
   macro avg       0.87      0.88      0.88     31075
weighted avg       0.89      0.88      0.88     31075

