Paketleri yükleme

In [None]:
!pip install requests beautifulsoup4 pandas



In [None]:
# imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

In [None]:
# Yorumları almak için fonksiyon
def get_reviews(appid, params={'json':1}):
    url = 'https://store.steampowered.com/appreviews/'
    response = requests.get(url=url+str(appid), params=params, headers={'User-Agent': 'Mozilla/5.0'})
    return response.json()

# Belirli bir oyunun app id'sini almak için fonksiyon
def get_app_id(game_name):
    response = requests.get(url=f'https://store.steampowered.com/search/?term={game_name}&category1=998', headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')
    app_id = soup.find(class_='search_result_row')['data-ds-appid']
    return app_id

# Yorumları toplamak için fonksiyon (toplam n yorum)
def get_n_reviews(appid, n=100, pbar=None):
    reviews = []
    cursor = '*'
    params = {
        'json': 1,
        'filter': 'all',
        'language': 'english',
        'day_range': 9223372036854775807,
        'review_type': 'all',
        'purchase_type': 'all'
    }

    while n > 0:
        params['cursor'] = cursor
        params['num_per_page'] = min(100, n)

        response = get_reviews(appid, params)
        if response["cursor"] is None:
            continue

        fetched_reviews = response['reviews']
        reviews += fetched_reviews
        cursor = response['cursor']

        num_fetched = len(fetched_reviews)
        n -= num_fetched

        if pbar:
            pbar.update(num_fetched)  # Genel ilerleme çubuğunu güncelle

        if n <= 0:
            break

    return reviews

# Yorumları bir oyun için alıp bir DataFrame'e kaydetme
def get_game_reviews(game_name, n_reviews=100):
    # Oyun ID'sini al
    appid = get_app_id(game_name)

    # Yorumları çek
    reviews = get_n_reviews(appid, n=n_reviews)

    # Yorum verilerini bir DataFrame'e dönüştür
    df = pd.DataFrame(reviews)

    # Yalnızca gerekli sütunları seç
    df = df[['review', 'voted_up']].rename(columns={'review': 'comment', 'voted_up': 'positive'})

    # Olumlu/Olumsuz yorumları 1 ve 0'a dönüştür
    # df['positive'] = df['positive'].astype(int)

    return df

In [None]:
def get_games_reviews(game_names, n_reviews=100):
    all_reviews = []
    total_reviews = len(game_names) * n_reviews  # Toplam yorum sayısını hesapla

    with tqdm(total=total_reviews, desc="Fetching all reviews", unit="review") as pbar:
        for game_name in game_names:
            # Oyun ID'sini al
            appid = get_app_id(game_name)

            # Yorumları çek (progress bar referansını geçiriyoruz)
            reviews = get_n_reviews(appid, n=n_reviews, pbar=pbar)

            # DataFrame oluştur
            df = pd.DataFrame(reviews)

            # Gerekli sütunları seç ve yeniden adlandır
            df = df[['review', 'voted_up']].rename(columns={'review': 'comment', 'voted_up': 'positive'})

            # Oyun adını ekle
            df['game'] = game_name

            # Listeye ekle
            all_reviews.append(df)

    # Tüm oyun yorumlarını tek bir DataFrame'de birleştir
    final_df = pd.concat(all_reviews, ignore_index=True)

    return final_df

In [None]:
game_names = ("Cyberpunk 2077", "Halo Infinite", "Counter-Strike 2", "Dead Cells", "The Elder Scrolls V: Skyrim Special Edition", "Forza Horizon 5","Cuphead","Undertale","Elden Ring","Baldurs Gate 3")
comments = get_games_reviews(game_names, 10000)

Fetching all reviews: 100%|██████████| 100000/100000 [04:33<00:00, 365.42review/s]


In [None]:
num_duplicates = comments.duplicated(keep=False).sum()
print(f"Tekrar eden yorum sayısı: {num_duplicates}")

duplicate_counts = comments[comments.duplicated(keep=False)].groupby(comments.columns.tolist()).size().reset_index(name='count')
print(duplicate_counts)

Tekrar eden yorum sayısı: 267
                                               comment  positive  \
0    "Games as a Service" model needs to die. Perma...     False   
1    ---{ Graphics }---\n☐ You forget what reality ...      True   
2    ---{ Graphics }---\n☐ You forget what reality ...      True   
3    ---{ Graphics }---\n☑ You forget what reality ...      True   
4    ---{ Graphics }---\r\n☐ You forget what realit...      True   
..                                                 ...       ...   
110  if I could, I'd remove this game from my mind ...      True   
111  terrible anti cheat people are getting false b...     False   
112  valarante child game.... look to cartoon grapf...      True   
113  valarante child game.... look to cartoon grapf...      True   
114                    𝘎𝘳𝘦𝘢𝘵 𝘨𝘢𝘮𝘦, 𝘳𝘶𝘪𝘯𝘦𝘥 𝘣𝘺 𝘤𝘩𝘦𝘢𝘵𝘦𝘳𝘴.      True   

                                            game  count  
0                                  Halo Infinite      2  
1    The Elder Scrolls V: Skyrim Spec

In [None]:
comments.drop_duplicates(keep="first", inplace=True)

In [None]:
comments.shape

(99848, 3)

In [None]:
comments

Unnamed: 0,comment,positive,game
0,[h1] For me this is the end of the Night City ...,True,Cyberpunk 2077
1,Cyberpunk 2077 is a science fiction role playi...,True,Cyberpunk 2077
2,"This game has done so much for me, but thats p...",True,Cyberpunk 2077
3,"Probably my favorite game ever, Cyberpunk is a...",True,Cyberpunk 2077
4,Patch 1.5 made the game what it should have be...,True,Cyberpunk 2077
...,...,...,...
99995,Best rpg in recent years. Exceptionally polish...,True,Baldurs Gate 3
99996,Its a great story game with impactful choices....,True,Baldurs Gate 3
99997,Already in Love with this title. Absolutely on...,True,Baldurs Gate 3
99998,"Incredible game, and very friendly to non-D&D ...",True,Baldurs Gate 3


In [None]:
comments["game"].value_counts()

Unnamed: 0_level_0,count
game,Unnamed: 1_level_1
Halo Infinite,9998
Dead Cells,9996
Baldurs Gate 3,9996
Cyberpunk 2077,9994
Undertale,9992
The Elder Scrolls V: Skyrim Special Edition,9991
Cuphead,9991
Elden Ring,9991
Forza Horizon 5,9987
Counter-Strike 2,9912


In [None]:
comments["positive"].value_counts()

Unnamed: 0_level_0,count
positive,Unnamed: 1_level_1
True,80417
False,19431


In [None]:
comments.to_csv("steam_reviews.csv", index=False, encoding="utf-8")

# NLP

In [None]:
import time
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, mean_absolute_error, mean_squared_error
from sklearn.decomposition import PCA

In [None]:
# Classifierlar
knn_classifier = KNeighborsClassifier(n_neighbors=3)
bayes_multi_classifier = MultinomialNB()
bayes_gauss_classifier = GaussianNB()
decision_tree_classifier = DecisionTreeClassifier(random_state=42)

In [None]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
data = comments
# Pre-processing
data['comment'] = data['comment'].str.lower() # Küçük harflere çevirme
data['comment'] = data['comment'].str.replace('[^\w\s]', '') # Özel karakterleri kaldırma
data['tokenized_comment'] = data['comment'].apply(word_tokenize) # Kelimeleri ayırma
# Stopwordleri kaldırma
stop_words = set(stopwords.words('english'))
data['filtered_comment'] = data['tokenized_comment'].apply(lambda tokens: [word for word in tokens if word not in stop_words])
# Stemming
stemmer = PorterStemmer()
data['processed_comment'] = data['filtered_comment'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])

data

Unnamed: 0,comment,positive,game,tokenized_comment,filtered_comment,processed_comment
0,[h1] for me this is the end of the night city ...,True,Cyberpunk 2077,"[[, h1, ], for, me, this, is, the, end, of, th...","[[, h1, ], end, night, city, story, ., [, /h1,...","[[, h1, ], end, night, citi, stori, ., [, /h1,..."
1,cyberpunk 2077 is a science fiction role playi...,True,Cyberpunk 2077,"[cyberpunk, 2077, is, a, science, fiction, rol...","[cyberpunk, 2077, science, fiction, role, play...","[cyberpunk, 2077, scienc, fiction, role, play,..."
2,"this game has done so much for me, but thats p...",True,Cyberpunk 2077,"[this, game, has, done, so, much, for, me, ,, ...","[game, done, much, ,, thats, personal, bias, ....","[game, done, much, ,, that, person, bia, ., cy..."
3,"probably my favorite game ever, cyberpunk is a...",True,Cyberpunk 2077,"[probably, my, favorite, game, ever, ,, cyberp...","[probably, favorite, game, ever, ,, cyberpunk,...","[probabl, favorit, game, ever, ,, cyberpunk, g..."
4,patch 1.5 made the game what it should have be...,True,Cyberpunk 2077,"[patch, 1.5, made, the, game, what, it, should...","[patch, 1.5, made, game, released, ., 've, don...","[patch, 1.5, made, game, releas, ., 've, done,..."
...,...,...,...,...,...,...
99995,best rpg in recent years. exceptionally polish...,True,Baldurs Gate 3,"[best, rpg, in, recent, years, ., exceptionall...","[best, rpg, recent, years, ., exceptionally, p...","[best, rpg, recent, year, ., except, polish, g..."
99996,its a great story game with impactful choices....,True,Baldurs Gate 3,"[its, a, great, story, game, with, impactful, ...","[great, story, game, impactful, choices, ., go...","[great, stori, game, impact, choic, ., good, g..."
99997,already in love with this title. absolutely on...,True,Baldurs Gate 3,"[already, in, love, with, this, title, ., abso...","[already, love, title, ., absolutely, one, gre...","[alreadi, love, titl, ., absolut, one, greates..."
99998,"incredible game, and very friendly to non-d&d ...",True,Baldurs Gate 3,"[incredible, game, ,, and, very, friendly, to,...","[incredible, game, ,, friendly, non-d, &, play...","[incred, game, ,, friendli, non-d, &, player, ..."


In [None]:
comments.to_csv("preprocessed_reviews.csv", index=False, encoding="utf-8")

In [None]:
# 5-fold cross validate
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)

# Knn Tf-Idf

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=500)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(knn_classifier, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8286695936036862
Average precision across folds: 0.8394497357008592
Average recall across folds: 0.9734509073277128
Average f1 across folds: 0.9014975099983253
Average Training Time across folds: 0.026793766021728515 seconds
Average Test Time across folds: 85.5180212020874 seconds


# Naive Bayes Tf-Idf

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=2500)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(bayes_multi_classifier, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8909342344931088
Average precision across folds: 0.906430448716937
Average recall across folds: 0.96411209257473
Average f1 across folds: 0.9343794685087875
Average Training Time across folds: 0.05533242225646973 seconds
Average Test Time across folds: 0.03376989364624024 seconds


# Decision Tree Tf-Idf

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=2500)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(decision_tree_classifier, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8306425986819379
Average precision across folds: 0.8924845298223408
Average recall across folds: 0.8978947264410808
Average f1 across folds: 0.8951785738166421
Average Training Time across folds: 115.7506058216095 seconds
Average Test Time across folds: 0.07370519638061523 seconds


# Knn Tf-Idf Feature Selection

In [None]:
# TF-IDF vektörleme
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Feature Selection
k_best_features = 500  # Öznitelik sayısı
X_selected = SelectKBest(chi2, k=k_best_features).fit_transform(X, y)

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(knn_classifier, X_selected, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8261157021726188
Average precision across folds: 0.8337655774382092
Average recall across folds: 0.9793700672353067
Average f1 across folds: 0.9007197764123183
Average Training Time across folds: 0.021852397918701173 seconds
Average Test Time across folds: 75.00743532180786 seconds


# Naive Bayes Tf-Idf Fs

In [None]:
# TF-IDF vektörleme
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Feature Selection
k_best_features = 2500
X_selected = SelectKBest(chi2, k=k_best_features).fit_transform(X, y)

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(bayes_multi_classifier, X_selected, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8935081454371451
Average precision across folds: 0.9091182623455815
Average recall across folds: 0.9641742653902305
Average f1 across folds: 0.9358338693281294
Average Training Time across folds: 0.04174189567565918 seconds
Average Test Time across folds: 0.02909255027770996 seconds


# Decision Tree Tf-Idf Fs

In [None]:
# TF-IDF vektörleme
tfidf_vectorizer = TfidfVectorizer(max_features=2500)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Feature Selection
k_best_features = 1500
X_selected = SelectKBest(chi2, k=k_best_features).fit_transform(X, y)

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(decision_tree_classifier, X_selected, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8320647865483439
Average precision across folds: 0.8940585393794247
Average recall across folds: 0.8978823102791672
Average f1 across folds: 0.8959647557138182
Average Training Time across folds: 80.38077049255371 seconds
Average Test Time across folds: 0.06387548446655274 seconds


# Knn Tf-Idf Pca

In [None]:
# TF-IDF vektörleme
tfidf_vectorizer = TfidfVectorizer(max_features=1500)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# PCA
pca = PCA(n_components=250)
X_pca = pca.fit_transform(X.toarray())

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(knn_classifier, X_pca, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_fit_time = cv_results['fit_time'].mean()
average_score_time = cv_results['score_time'].mean()
print(f"Average Fit Time across folds: {average_fit_time} seconds")
print(f"Average Score Time across folds: {average_score_time} seconds")
# 250 96,12

Average accuracy across folds: 0.8347688626936097
Average precision across folds: 0.8867727189377288
Average recall across folds: 0.9112128688991389
Average f1 across folds: 0.8988148032062669
Average Fit Time across folds: 0.1008988857269287 seconds
Average Score Time across folds: 33.807275247573855 seconds


# Naive Bayes Tf-Idf Pca

In [None]:
# TF-IDF vektörleme
tfidf_vectorizer = TfidfVectorizer(max_features=1500)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# PCA
pca = PCA(n_components=250)
X_pca = pca.fit_transform(X.toarray())

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(bayes_gauss_classifier, X_pca, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_fit_time = cv_results['fit_time'].mean()
average_score_time = cv_results['score_time'].mean()
print(f"Average Fit Time across folds: {average_fit_time} seconds")
print(f"Average Score Time across folds: {average_score_time} seconds")

Average accuracy across folds: 0.7876472749085569
Average precision across folds: 0.9282443590055376
Average recall across folds: 0.7980278056772238
Average f1 across folds: 0.8582231743398776
Average Fit Time across folds: 0.4209431171417236 seconds
Average Score Time across folds: 0.11250529289245606 seconds


# Decision Tree Tf-Idf Pca

In [None]:
# TF-IDF vektörleme
tfidf_vectorizer = TfidfVectorizer(max_features=2500)
X = tfidf_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# PCA
pca = PCA(n_components=500)
X_pca = pca.fit_transform(X.toarray())

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(decision_tree_classifier, X_pca, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_fit_time = cv_results['fit_time'].mean()
average_score_time = cv_results['score_time'].mean()
print(f"Average Fit Time across folds: {average_fit_time} seconds")
print(f"Average Score Time across folds: {average_score_time} seconds")

Average accuracy across folds: 0.8098309791293179
Average precision across folds: 0.8848785934264715
Average recall across folds: 0.8781352146397119
Average f1 across folds: 0.8814910343386538
Average Fit Time across folds: 277.66824288368224 seconds
Average Score Time across folds: 0.05255346298217774 seconds


# Knn n-gram

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=2500, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(knn_classifier, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8302319065257209
Average precision across folds: 0.8458933488997381
Average recall across folds: 0.9650322338531637
Average f1 across folds: 0.9015392448186841
Average Training Time across folds: 0.04028611183166504 seconds
Average Test Time across folds: 106.8772177696228 seconds


# Naive Bayes n-gram

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=5000, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(bayes_multi_classifier, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8657860499999336
Average precision across folds: 0.965673636203309
Average recall across folds: 0.8640710989063308
Average f1 across folds: 0.9120492099102497
Average Training Time across folds: 0.05099921226501465 seconds
Average Test Time across folds: 0.0324190616607666 seconds


# Decision Tree n-gram

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=5000, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(decision_tree_classifier, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8321548565524435
Average precision across folds: 0.894492042329329
Average recall across folds: 0.8974594734356683
Average f1 across folds: 0.8959707161163866
Average Training Time across folds: 89.55081295967102 seconds
Average Test Time across folds: 0.0871312141418457 seconds


# Knn n-gram Fs

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=2500, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Feature Selection
k_best_features = 1500
X_selected = SelectKBest(chi2, k=k_best_features).fit_transform(X, y)

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(knn_classifier, X_selected, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.829190338665392
Average precision across folds: 0.8396441326247818
Average recall across folds: 0.9739234064993887
Average f1 across folds: 0.9018100764481843
Average Training Time across folds: 0.027766990661621093 seconds
Average Test Time across folds: 97.20839161872864 seconds


# Naive Bayes n-gram Fs

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=5000, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Feature Selection
k_best_features = 2500
X_selected = SelectKBest(chi2, k=k_best_features).fit_transform(X, y)

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(bayes_multi_classifier, X_selected, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8623908776179443
Average precision across folds: 0.9656969266983847
Average recall across folds: 0.8596814564424665
Average f1 across folds: 0.9096074708947809
Average Training Time across folds: 0.04324350357055664 seconds
Average Test Time across folds: 0.031188821792602538 seconds


# Decision Tree n-gram Fs

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=2500, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# Feature Selection
k_best_features = 2000
X_selected = SelectKBest(chi2, k=k_best_features).fit_transform(X, y)

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(decision_tree_classifier, X_selected, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8317442361148013
Average precision across folds: 0.8932003992901526
Average recall across folds: 0.8985288862211809
Average f1 across folds: 0.8958537872842814
Average Training Time across folds: 63.21922769546509 seconds
Average Test Time across folds: 0.06894512176513672 seconds


# Knn n-gram Pca

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=2500, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# PCA
pca = PCA(n_components=800)
X_pca = pca.fit_transform(X.toarray())

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(knn_classifier, X_pca, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.8383242343108032
Average precision across folds: 0.8509596480113055
Average recall across folds: 0.968974213334975
Average f1 across folds: 0.906138272935217
Average Training Time across folds: 0.3317446708679199 seconds
Average Test Time across folds: 96.45693936347962 seconds


# Naive Bayes n-gram Pca

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=2500, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# PCA
pca = PCA(n_components=400)
X_pca = pca.fit_transform(X.toarray())

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(bayes_gauss_classifier, X_pca, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.7191531069452092
Average precision across folds: 0.8394592386517162
Average recall across folds: 0.8053644723777549
Average f1 across folds: 0.8220141111725798
Average Training Time across folds: 0.667252779006958 seconds
Average Test Time across folds: 0.1656874179840088 seconds


# Decision tree n-gram Pca

In [None]:
# Most popular n-gram yaklaşımı kullanarak kelime sayma (count) vektörleme
ngram_range = (1, 2)  # bigram (2-gram)
count_vectorizer = CountVectorizer(max_features=2500, ngram_range=ngram_range)
X = count_vectorizer.fit_transform(data['processed_comment'].astype('str'))
y = data['positive']

# PCA
pca = PCA(n_components=500)
X_pca = pca.fit_transform(X.toarray())

# Metrikler
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation ile metrikleri ve süreleri ölçme
cv_results = cross_validate(decision_tree_classifier, X_pca, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

# Sonuçları yazdır
for metric in scoring_metrics:
    average_metric = cv_results[f'test_{metric}'].mean()
    print(f"Average {metric} across folds: {average_metric}")

# Süreleri yazdır
average_training_time = cv_results['fit_time'].mean()
average_test_time = cv_results['score_time'].mean()
print(f"Average Training Time across folds: {average_training_time} seconds")
print(f"Average Test Time across folds: {average_test_time} seconds")

Average accuracy across folds: 0.7819886532688511
Average precision across folds: 0.8710480654544126
Average recall across folds: 0.8560628167333215
Average f1 across folds: 0.8634836335436757
Average Training Time across folds: 179.3046826839447 seconds
Average Test Time across folds: 0.05299849510192871 seconds
