# 1. Osnovne biblioteke i skup podataka za rad

In [1]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
data = pd.read_csv('FinalAirlineReviews.csv')

In [4]:
data.sample(5)

Unnamed: 0,OriginalReview,FilteredReview,StemmedReview,LemmatizedReview,Sentiment2Cat,Sentiment3Cat,Sentiment4Cat
119835,We flew economy from Bangkok to HCMC (Saigon)....,flew economy bangkok hcmc saigon cabin dirties...,flew economi bangkok hcmc saigon cabin dirties...,flew economy bangkok hcmc saigon cabin dirties...,Negative,Negative,Extremely Negative
13105,"The new Air France Business seat is full flat,...",new air france business seat full flat comfort...,new air franc busi seat full flat comfort prov...,new air france business seat full flat comfort...,Positive,Positive,Extremely Positive
38094,"✅ , | I flew the A380 of China Southern Airli...",flew china southern airlines lax guangzhou ret...,flew china southern airlin lax guangzhou retur...,flew china southern airline lax guangzhou retu...,Positive,Positive,Extremely Positive
64173,JFK via FLL for a business trip - my only pref...,jfk via fll business trip preferred airline do...,jfk via fll busi trip prefer airlin domest cit...,jfk via fll business trip preferred airline do...,Positive,Positive,Extremely Positive
50635,"✅ , | Was overall happy with flights to Melbo...",overall happy flights melbourne london heathro...,overal happi flight melbourn london heathrow v...,overall happy flight melbourne london heathrow...,Positive,Positive,Extremely Positive


# 2. Tf-Idf Model Creation

In [5]:
from enum import Enum

class ReviewType(Enum):
    Filtered = 1,
    Stemmed = 2,
    Lemmatized = 3

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

def bagOfWordsDataFrame(reviewType: ReviewType, numOfSentiments: int, maxFeatures: int):
    
    reviewAttribute = ""
    sentimentAttribute = ""
    
    if reviewType == ReviewType.Filtered:
        reviewAttribute = "FilteredReview"
    elif reviewType == ReviewType.Stemmed:
        reviewAttribute = "StemmedReview"
    elif reviewType == ReviewType.Lemmatized:
        reviewAttribute = "LemmatizedReview"
    
    if numOfSentiments == 2:
        sentimentAttribute = "Sentiment2Cat"
    elif numOfSentiments == 3:
        sentimentAttribute = "Sentiment3Cat"
    elif numOfSentiments == 4:
        sentimentAttribute = "Sentiment4Cat"
    
    reviewList = data[reviewAttribute].to_list()
    
    tfIdfVector = TfidfVectorizer(max_features=maxFeatures)
    
    wordVectors = tfIdfVector.fit_transform(reviewList)
    featureNames = tfIdfVector.get_feature_names_out()
    
    bowDataFrame = pd.DataFrame(wordVectors.toarray(), columns=featureNames)
    
    sentimentColumn = data[sentimentAttribute]
    
    bowDataFrame = pd.concat([bowDataFrame, sentimentColumn], axis=1)
    
    return bowDataFrame

In [33]:
bowData = bagOfWordsDataFrame(ReviewType.Filtered, 2, 1000)

In [34]:
x, y = bowData.iloc[:, :-1], bowData.iloc[:, -1]

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
xtr, xtest, ytr, ytest = train_test_split(x, y, test_size=0.25, random_state=18)

# 3. Classification Models

In [11]:
#ucitavamo neophodne biblioteke evaluaciju modela klasifikacije

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

## 3.1 Naive Bayes Model

In [12]:
#ucitavamo biblioteke za rad sa Naivnim Bajesom

from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB

In [13]:
naiveBayes = GaussianNB()

In [14]:
naiveBayes.fit(xtr, ytr)

GaussianNB()

In [15]:
predictionNB = naiveBayes.predict(xtest)
predictionNB

array(['Positive', 'Positive', 'Negative', ..., 'Positive', 'Positive',
       'Positive'], dtype='<U8')

In [16]:
print(f"Model's accuracy is: {round(accuracy_score(predictionNB, ytest), 4)*100}%")
print(f"Model's precision is: {round(precision_score(predictionNB, ytest, average='weighted'), 4)*100}%")
print(f"Model's recall is: {round(recall_score(predictionNB, ytest, average='weighted'), 4)*100}%")
print(f"Model's f1 is: {round(f1_score(predictionNB, ytest, average='weighted'), 4)*100}%")

Model's accuracy is: 84.71%
Model's precision is: 84.91%
Model's recall is: 84.71%
Model's f1 is: 84.61%


In [17]:
print(classification_report(predictionNB, ytest))

              precision    recall  f1-score   support

    Negative       0.83      0.90      0.87     16908
    Positive       0.87      0.78      0.82     14167

    accuracy                           0.85     31075
   macro avg       0.85      0.84      0.84     31075
weighted avg       0.85      0.85      0.85     31075



## 3.2 Decision Tree Model

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
dtModel = DecisionTreeClassifier()

In [20]:
dtModel.fit(xtr, ytr)

DecisionTreeClassifier()

In [21]:
predictionDT = dtModel.predict(xtest)
predictionDT

array(['Positive', 'Negative', 'Negative', ..., 'Negative', 'Negative',
       'Positive'], dtype=object)

In [22]:
print(f"Model's accuracy is: {round(accuracy_score(predictionDT, ytest), 4)*100}%")
print(f"Model's precision is: {round(precision_score(predictionDT, ytest, average='weighted'), 4)*100}%")
print(f"Model's recall is: {round(recall_score(predictionDT, ytest, average='weighted'), 4)*100}%")
print(f"Model's f1 is: {round(f1_score(predictionDT, ytest, average='weighted'), 4)*100}%")

Model's accuracy is: 82.28%
Model's precision is: 82.31%
Model's recall is: 82.28%
Model's f1 is: 82.3%


In [23]:
print(classification_report(predictionDT, ytest))

              precision    recall  f1-score   support

    Negative       0.85      0.85      0.85     18481
    Positive       0.78      0.79      0.78     12594

    accuracy                           0.82     31075
   macro avg       0.82      0.82      0.82     31075
weighted avg       0.82      0.82      0.82     31075



## 3.3 Logistic Regression Model

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
lrModel = LogisticRegression()

In [26]:
lrModel.fit(xtr, ytr)

LogisticRegression()

In [27]:
predictionLR = lrModel.predict(xtest)
predictionLR

array(['Positive', 'Positive', 'Negative', ..., 'Positive', 'Positive',
       'Positive'], dtype=object)

In [28]:
print(f"Model's accuracy is: {round(accuracy_score(predictionLR, ytest), 4)*100}%")
print(f"Model's precision is: {round(precision_score(predictionLR, ytest, average='weighted'), 4)*100}%")
print(f"Model's recall is: {round(recall_score(predictionLR, ytest, average='weighted'), 4)*100}%")
print(f"Model's f1 is: {round(f1_score(predictionLR, ytest, average='weighted'), 4)*100}%")

Model's accuracy is: 92.36%
Model's precision is: 92.4%
Model's recall is: 92.36%
Model's f1 is: 92.36999999999999%


In [29]:
print(classification_report(predictionLR, ytest))

              precision    recall  f1-score   support

    Negative       0.94      0.93      0.94     18617
    Positive       0.90      0.91      0.91     12458

    accuracy                           0.92     31075
   macro avg       0.92      0.92      0.92     31075
weighted avg       0.92      0.92      0.92     31075



## 3.4 Gradient Boosting Model

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
gbModel = GradientBoostingClassifier()

In [32]:
gbModel.fit(xtr, ytr)

KeyboardInterrupt: 

In [None]:
predictionGB = gbModel.predict(xtest)
predictionGB

In [None]:
print(f"Model's accuracy is: {round(accuracy_score(predictionGB, ytest), 4)*100}%")
print(f"Model's precision is: {round(precision_score(predictionGB, ytest, average='weighted'), 4)*100}%")
print(f"Model's recall is: {round(recall_score(predictionGB, ytest, average='weighted'), 4)*100}%")
print(f"Model's f1 is: {round(f1_score(predictionGB, ytest, average='weighted'), 4)*100}%")

In [None]:
print(classification_report(predictionGB, ytest))