In [2]:
import pandas as pd
import spacy
import os
from tqdm import tqdm
from collections import Counter
import json
import random


In [3]:
# Load the spaCy English pipeline once; extract_pos_single / extract_pos_multi
# read this module-level `nlp` object when tagging sentences.
nlp = spacy.load('en_core_web_sm')


In [4]:
def extract_pos_single(sentences, nlp_model=None):
    """
    Counts single-token adjectives, nouns, and verbs in a collection of sentences.

    Every sentence is lower-cased before tagging, so the resulting counts are
    case-insensitive. Items that are not strings (for example NaN or None cells
    coming from a pandas column) are skipped rather than crashing on ``.lower()``.

    Args:
        sentences (iterable): Sentences to process; non-string items are ignored.
        nlp_model (callable, optional): Callable that maps a string to an iterable
            of tokens exposing ``.pos_`` and ``.text``. Defaults to the
            module-level spaCy pipeline ``nlp``.
    Returns:
        tuple: Three Counter objects containing counts of adjectives, nouns, and
        verbs (in that order), keyed by the lower-cased token text.
    """
    tagger = nlp if nlp_model is None else nlp_model
    adjectives, nouns, verbs = Counter(), Counter(), Counter()
    # Route each POS tag of interest to its counter; any other tag is ignored.
    counters_by_tag = {"ADJ": adjectives, "NOUN": nouns, "VERB": verbs}

    for sentence in sentences:
        if not isinstance(sentence, str):
            # Missing values (NaN/None) have no text to tag.
            continue
        for token in tagger(sentence.lower()):
            counter = counters_by_tag.get(token.pos_)
            if counter is not None:
                counter[token.text] += 1

    return adjectives, nouns, verbs

def extract_pos_multi(sentences, nlp_model=None):
    """
    Counts multi-word phrases built from runs of consecutive same-category tokens.

    Each sentence is lower-cased and tagged; consecutive tokens belonging to the
    same category are joined with a single space and counted as one phrase:

    * nouns: runs of NOUN tokens (compound nouns)
    * adjectives: runs of ADJ and/or ADV tokens (e.g. adverb + adjective)
    * verbs: runs of VERB and/or AUX tokens (e.g. auxiliary + main verb)

    Note that a run consisting only of adverbs is still counted under
    adjectives, and a run consisting only of auxiliaries under verbs. Runs never
    span sentence boundaries. Items that are not strings (for example NaN or
    None cells from a pandas column) are skipped rather than crashing on
    ``.lower()``.

    Args:
        sentences (iterable): Sentences to process; non-string items are ignored.
        nlp_model (callable, optional): Callable that maps a string to an iterable
            of tokens exposing ``.pos_`` and ``.text``. Defaults to the
            module-level spaCy pipeline ``nlp``.
    Returns:
        tuple: Three Counter objects containing counts of adjective, noun, and
        verb phrases (in that order).
    """
    tagger = nlp if nlp_model is None else nlp_model
    adjectives, nouns, verbs = Counter(), Counter(), Counter()
    # (POS tags that extend a run, counter that receives the finished phrase)
    groups = (
        (("NOUN",), nouns),
        (("ADJ", "ADV"), adjectives),
        (("VERB", "AUX"), verbs),
    )

    for sentence in sentences:
        if not isinstance(sentence, str):
            # Missing values (NaN/None) have no text to tag.
            continue

        # One pending-run buffer per group, reset for every sentence.
        pending = [[] for _ in groups]
        for token in tagger(sentence.lower()):
            for run, (tags, counter) in zip(pending, groups):
                if token.pos_ in tags:
                    run.append(token.text)
                elif run:
                    # A token of another category ends the current run.
                    counter[" ".join(run)] += 1
                    run.clear()

        # Flush runs that reach the end of the sentence.
        for run, (_tags, counter) in zip(pending, groups):
            if run:
                counter[" ".join(run)] += 1

    return adjectives, nouns, verbs

In [5]:
def save_counters_to_json(adjectives, nouns, verbs, filename):
    """
    Saves the per-region parts of speech counters to a JSON file.
    Args:
        adjectives (dict): Mapping of region name -> Counter of adjectives.
        nouns (dict): Mapping of region name -> Counter of nouns.
        verbs (dict): Mapping of region name -> Counter of verbs.
        filename (str): Name of the output JSON file.
    """
    data = {
        "adjectives": dict(adjectives),
        "nouns": dict(nouns),
        "verbs": dict(verbs)
    }
    # ensure_ascii=False keeps non-ASCII words readable, so the file must be UTF-8.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


model = 'phi35mini' # Select the model to analyze
regions = ['EA','NE','LA','MEA','INDIA']
cache_path = f'{model}_pos_frequencies.json'

if os.path.exists(cache_path):
    # Reuse previously computed frequencies. Read with the same encoding the
    # file was written with, and rebuild Counters so both branches yield the
    # same types.
    with open(cache_path, encoding='utf-8') as f:
        dict_data = json.load(f)
    all_adjectives, all_nouns, all_verbs = (
        {region: Counter(counts) for region, counts in dict_data[pos].items()}
        for pos in ('adjectives', 'nouns', 'verbs')
    )
else:
    all_adjectives = {k: Counter() for k in regions}
    all_nouns = {k: Counter() for k in regions}
    all_verbs = {k: Counter() for k in regions}
    file_path = os.path.abspath(f'../../response/gen_bias/{model}')
    # os.listdir order is arbitrary; sorting keeps the output file deterministic.
    for filename in sorted(os.listdir(file_path)):
        if filename.endswith('.csv'):
            df = pd.read_csv(os.path.join(file_path, filename), encoding='utf-8')
            for r in tqdm(regions, desc=filename):
                if df[r].isna().all():
                    continue
                # Drop missing cells so the extractor only ever sees text.
                responses = df[r].dropna().astype(str)
                adjectives, nouns, verbs = extract_pos_single(responses)
                all_adjectives[r].update(adjectives)
                all_nouns[r].update(nouns)
                all_verbs[r].update(verbs)

    # Persist the fresh counts so later runs take the cache branch above.
    save_counters_to_json(all_adjectives, all_nouns, all_verbs, cache_path)

In [6]:
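# Optional manual reload of the cached frequencies (duplicates the cache-loading
# branch of the previous cell). Uncomment to use.
# with open(f'{model}_pos_frequencies.json') as f:
#     dict_data = json.load(f)
#     all_adjectives, all_nouns, all_verbs = dict_data['adjectives'], dict_data['nouns'], dict_data['verbs']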

In [7]:
def calculate_or(dict1, dict2, reg1, reg2, smoothing=0.000001):
    """
    Calculate the odds ratio between two dictionaries of word frequencies.

    For every word present in either dictionary, the odds of the word are
    ``f / (total - f)`` within each dictionary, and the result is
    ``(odds1 + smoothing) / (odds2 + smoothing)``. Values above 1 mean the word
    is relatively more frequent in ``dict1``.

    The ``total - f`` denominator is floored at ``smoothing`` so that an empty
    dictionary (total of 0) or a word that accounts for every occurrence in a
    dictionary does not raise ZeroDivisionError. For ordinary integer counts the
    floor never applies and results are unchanged.

    Args:
        dict1 (Counter): First dictionary of word frequencies.
        dict2 (Counter): Second dictionary of word frequencies.
        reg1 (str): Region name for the first dictionary (currently unused;
            kept for interface compatibility).
        reg2 (str): Region name for the second dictionary (currently unused;
            kept for interface compatibility).
        smoothing (float): Small positive constant used both as the additive
            smoothing term and as the floor for the odds denominator.
    Returns:
        dict: Dictionary containing the odds ratio for each word.
    """
    total1 = sum(dict1.values())
    total2 = sum(dict2.values())

    def _odds(freq, total):
        # Floor the denominator so degenerate distributions stay finite.
        return freq / max(total - freq, smoothing)

    or_dict = {}
    for word in set(dict1).union(set(dict2)):
        odds1 = _odds(dict1.get(word, 0), total1)
        odds2 = _odds(dict2.get(word, 0), total2)
        or_dict[word] = (odds1 + smoothing) / (odds2 + smoothing)
    return or_dict


In [8]:
# Odds ratios of verb usage for NE relative to LA; swap the region keys (and the
# matching labels) to compare a different pair of regions.
initial_or = calculate_or(all_verbs['NE'], all_verbs['LA'], 'NE', 'LA')
# Rebuild the mapping in descending odds-ratio order. sorted() is stable, so
# words with equal ratios keep their original relative order.
final_or = {
    word: initial_or[word]
    for word in sorted(initial_or, key=initial_or.get, reverse=True)
}


In [None]:
# Split words by whether their odds ratio is above 1 (relatively more frequent
# in the first region passed to calculate_or) or not.
more_one = {k:v for k,v in final_or.items() if v>1}
less_one = {k:v for k,v in final_or.items() if v<=1}
# random.sample raises ValueError when asked for more items than the population
# holds, so clamp each sample size to what is available.
# The sample differs between runs; call random.seed(...) first if a
# reproducible sample is needed.
reg_more = {key:more_one[key] for key in random.sample(list(more_one.keys()), min(2, len(more_one)))}
reg_less = {key:less_one[key] for key in random.sample(list(less_one.keys()), min(20, len(less_one)))}
for k,v in reg_more.items():
    print("{} ({:.4f})".format(k.capitalize(),v))
print()
for k,v in reg_less.items():
    print("{} ({:.4f})".format(k.capitalize(),v))

In [8]:
# Save the results to a file for further analysis.
# Write as UTF-8 explicitly: the words may contain non-ASCII characters, and the
# platform default encoding (e.g. cp1252 on Windows) could fail to encode them.
with open('temp.txt', 'w', encoding='utf-8') as f:
    for key, value in final_or.items():
        f.write(f'{key} : {value}\n')