In [2]:
# Imports
import os
import re
import warnings
import pandas as pd
import json
import logging
import numpy as np


In [3]:
# Loggings
warnings.filterwarnings(action='ignore', category=FutureWarning)
    
# Configure logging to write to a file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(filename='analysis.log'),
        logging.StreamHandler()
    ]
)

In [4]:
# Predifined functions
def extract_number(filename):
    # Regular expression to match a sequence of digits
    match = re.search(r'_(\d+)\.txt$', filename)
    return int(match.group(1)) if match else 0

In [5]:
# input_folder_path = '../../data/output/streamV2_tweetnet_2023-06_splitted/'
input_folder_path = '../../data/output/test_Folder/'
files = [f for f in os.listdir(input_folder_path) if f.endswith('.csv')]
sorted_files = sorted(files, key=extract_number)

In [None]:
start_from_file = 0
for index, file_name in enumerate(sorted_files, start=0):  # start enumeration from 0 for human-readable file numbers
        # Skip files before the 14th file
        if index < start_from_file:
            continue
        
        file_path = os.path.join(input_folder_path, file_name)
        logging.info(f'Processing file: {file_name}')
        df = pd.read_csv(file_path)
        # Here the preprocessing of the data can be done

In [16]:
df = pd.read_csv("../../data/output/test_Folder/output_text1.csv")
df = df.dropna(subset=['tweet_id'])
docs = df["text"].tolist()
docs = [doc for doc in docs if pd.notnull(doc) and doc != '']
docs

['İstanbul’da Kağıthae, Bağcılar ve Çekmeköy ilçelerinde hava saldırıları protesto edildi\n\nKağıthane’de 10 kişi gözaltına alındı\n\n https://t.co/CMKTSiNr7Q https://t.co/Nvd82D38Ga',
 '@meral_aksener Bizler diplomalarımızı ve kpss puanlarımızı aldık sıra sizde:\n#Secime15BinZiraatMüh',
 "Beyoğlu'nda akılalmaz soygun kamerada! Otomobile yol verip soyguna devam ettiler",
 'Beyaz Saray Ulusal Güvenlik Danışmanı Sözcüsü Kirby, Türkiye’nin güneyine yönelik meşru terör tehdidiyle karşı karşı kalmaya devam ettiğini ve kendini savunma hakkının olduğunu söyledi ancak ABD’nin sınır ötesi operasyonlar konusundaki endişesini yineledi\n\nhttps://t.co/uAlnJwt5tB',
 'Sanat şehri Düzce, Devlet Tiyatroları Turne Sahnesi oluyor 🎭\n\nAnkara Devlet Tiyatromuzun, “Siyahlı Kadın Oyunu”nu, Sayın @OzgulOzkanYAVUZ Bakanımız ile birlikte izledik.\n\nHemşehrilerimizin ilgisi her oyunun kapalı gişe oynayacağını gösteriyor. 👏🏻 https://t.co/EkOZhtU7it',
 '@meral_aksener KAMUYA 15 BİN ZİRAAT MÜHENDİSİ ATAMASI ŞART

In [11]:
from sentence_transformers import SentenceTransformer

# Create embeddings
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
embeddings = model.encode(docs, show_progress_bar=True)

2023-11-07 17:21:25,923 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
2023-11-07 17:21:30,577 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cuda
Batches: 100%|██████████| 3/3 [00:00<00:00,  4.06it/s]


In [13]:
import numpy as np
with open('./model/embeddings.npy', 'wb') as f:
    np.save(f, embeddings)

In [None]:
# embeddings = np.load('./model/embeddings.npy')

In [19]:
import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

# Extract vocab to be used in BERTopic
vocab = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for doc in tqdm(docs):
  vocab.update(tokenizer(doc))
vocab = [word for word, frequency in vocab.items() if frequency >= 2] #TODO: change frequency
len(vocab) 

100%|██████████| 69/69 [00:00<00:00, 27027.17it/s]


244

In [21]:
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')
stopwords.words('turkish')

[nltk_data] Downloading package stopwords to /home/esener/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['acaba',
 'ama',
 'aslında',
 'az',
 'bazı',
 'belki',
 'biri',
 'birkaç',
 'birşey',
 'biz',
 'bu',
 'çok',
 'çünkü',
 'da',
 'daha',
 'de',
 'defa',
 'diye',
 'eğer',
 'en',
 'gibi',
 'hem',
 'hep',
 'hepsi',
 'her',
 'hiç',
 'için',
 'ile',
 'ise',
 'kez',
 'ki',
 'kim',
 'mı',
 'mu',
 'mü',
 'nasıl',
 'ne',
 'neden',
 'nerde',
 'nerede',
 'nereye',
 'niçin',
 'niye',
 'o',
 'sanki',
 'şey',
 'siz',
 'şu',
 'tüm',
 've',
 'veya',
 'ya',
 'yani']

In [22]:
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from bertopic import BERTopic

# Prepare sub-models
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
umap_model = UMAP(n_components=5, n_neighbors=50, random_state=42, metric="cosine", verbose=True)
hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=False, min_cluster_size=20, verbose=True)
vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words=stopwords.words('turkish'))

# Fit BERTopic without actually performing any clustering
topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        verbose=True
).fit(docs, embeddings=embeddings)

ModuleNotFoundError: No module named 'cuml'