In [5]:
import sys
import os
sys.path.append("../src")
import llm_utils
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

classes = ["War/Terror", "Conspiracy Theory", "Education", "Election Campaign", "Environment", 
              "Government/Public", "Health", "Immigration/Integration", 
              "Justice/Crime", "Labor/Employment", 
              "Macroeconomics/Economic Regulation", "Media/Journalism", "Religion", "Science/Technology"]

oa_without_context_elaboration_first_v04_df = pd.read_csv("../data/openassistant_llama_30b_4bit/generic_prompt_without_context_elaboration_first_v04/generic_test_0.csv")

In [6]:
# Add the new columns filled initially with NaN
oa_without_context_elaboration_first_v04_df['elaboration'] = np.nan
oa_without_context_elaboration_first_v04_df['tested_on'] = np.nan

# Go through all columns and check for not NaN
for class_name in classes:
    pred_column = class_name + "_pred"
    
    # Mask where data is not NaN
    not_nan_mask = oa_without_context_elaboration_first_v04_df[pred_column].notna()

    # Update 'elaboration' and 'tested_on' where mask is True
    oa_without_context_elaboration_first_v04_df.loc[not_nan_mask, 'elaboration'] = oa_without_context_elaboration_first_v04_df.loc[not_nan_mask, pred_column]
    oa_without_context_elaboration_first_v04_df.loc[not_nan_mask, 'tested_on'] = class_name

# If you want to drop the rows still containing NaN in 'elaboration' and 'tested_on', uncomment the following line
# oa_without_context_elaboration_first_v04_df = oa_without_context_elaboration_first_v04_df.dropna(subset=['elaboration', 'tested_on'])

In [7]:
oa_without_context_elaboration_first_v04_df.iloc[0]

id                                                                        891103871484870657
campaign_name                                                             VENEZUELA_201901_2
text                                       How life may find a way on Saturn's moon https...
annotations                                                           ['Science/Technology']
normalized_tweet                           How life may find a way on Saturn's moon [url]...
War/Terror_pred                                                                          NaN
Conspiracy Theory_pred                                                                   NaN
Education_pred                                                                           NaN
Election Campaign_pred                                                                   NaN
Environment_pred                                                                         NaN
Government/Public_pred                                                

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

def get_most_frequent_terms(texts, top_n=10):
    if len(texts) == 0 or all(isinstance(text, (type(None), str)) and not text.strip() for text in texts):
        return []
    vectorizer = CountVectorizer().fit(texts)
    bag_of_words = vectorizer.transform(texts)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_n]

tested_on_categories = oa_without_context_elaboration_first_v04_df['tested_on'].unique()

for category in tested_on_categories:
    category_df = oa_without_context_elaboration_first_v04_df[oa_without_context_elaboration_first_v04_df['tested_on'] == category]
    texts = category_df['elaboration'].values
    most_common_terms = get_most_frequent_terms(texts)
    if most_common_terms:
        print("Category:", category)
        print("Most common terms:", most_common_terms)
    else:
        print("Category:", category)
        print("No valid text data")
    print("\n")


Category: Science/Technology
Most common terms: [('this', 158), ('to', 149), ('tweet', 147), ('explanation', 130), ('technology', 129), ('science', 121), ('class', 121), ('is', 116), ('related', 112), ('not', 110)]


Category: nan
No valid text data


Category: Media/Journalism
Most common terms: [('tweet', 149), ('this', 147), ('to', 145), ('is', 129), ('the', 127), ('media', 126), ('explanation', 122), ('class', 116), ('journalism', 105), ('related', 87)]


Category: Labor/Employment
Most common terms: [('to', 133), ('this', 131), ('tweet', 122), ('labor', 117), ('explanation', 115), ('class', 110), ('employment', 105), ('is', 104), ('related', 102), ('not', 94)]


Category: Environment
Most common terms: [('this', 132), ('tweet', 120), ('to', 112), ('explanation', 104), ('class', 99), ('environment', 98), ('is', 96), ('not', 87), ('related', 85), ('but', 60)]


Category: Macroeconomics/Economic Regulation
Most common terms: [('to', 142), ('this', 134), ('tweet', 128), ('economic', 1

In [13]:
!pip3 install gensim

/bin/bash: /media/bruno/0d2f61d2-2b9c-4043-9a46-8e4dfe74fc95/bruno/anaconda3/envs/my_env/bin/pip3: /home/bruno/anaconda3/envs/my_env/bin/python: bad interpreter: No such file or directory


In [17]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


False

In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint

tested_on_categories = oa_without_context_elaboration_first_v04_df['tested_on'].unique()
stop_words = set(stopwords.words('english'))

for category in tested_on_categories:
    category_df = oa_without_context_elaboration_first_v04_df[oa_without_context_elaboration_first_v04_df['tested_on'] == category]
    texts = category_df['elaboration'].dropna().values
    texts = [word_tokenize(text) for text in texts]
    texts = [[token.lower() for token in text if token.isalpha() and token.lower() not in stop_words] for text in texts]
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary)

    print(f"\nCategory: {category}\n")
    pprint(lda_model.print_topics(num_words=10))
