In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter


In [None]:
# Short, hyphenated column names keyed by the original spreadsheet headers.
# Keys must match the CSV headers character for character (including the
# headers' own spelling); rename() leaves any header it cannot find untouched.
new_columns = {
    'Paper': 'Paper',
    'Author': 'Authors',
    'Notes': 'Notes',
    'Language Justification Given Beyond Data Availabilituy? td means typological diversity': 'Language-Justification',
    'Typologically diverse with definition': 'Typological-Diversity',
    'Cultural Nuances notes': 'Cultural-Nuances-Notes',
    'Cultural Nuances Mentioned?': 'Cultural-Nuances-Mentioned',
    'Cultural Nuances Addressed outside of limitations section?': 'Cultural-Nuances-Addressed',
    'Any discussion on the impact of model training data in relation to evaluation?': 'Impact-of-Training-Data',
    'Language Coverage': 'Language-Coverage',
    'Language Notes': 'Language-Notes',
    'Task & Dataset Used': 'Task-and-Dataset-Used'
}

# Read the annotation sheet and apply the short column names in one step.
file_path = '/.csv'
data = pd.read_csv(file_path).rename(columns=new_columns)

# Lookup from a language name to a short (two- or three-letter) language code.
# Several keys are alternate names or spellings that share a code with another
# entry, so that differently written names resolve to the same code.
iso_codes = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Chinese": "zh",
    "Hindi": "hi",
    "Russian": "ru",
    "Arabic": "ar",
    "Vietnamese": "vi",
    "Thai": "th",
    "Urdu": "ur",
    "Swahili": "sw",
    "Finnish": "fi",
    "Bulgarian": "bg",
    "Catalan": "ca",
    "Portuguese": "pt",
    "Japanese": "ja",
    "Italian": "it",
    "Korean": "ko",
    "Estonian": "et",
    "Telugu": "te",
    "Basque": "eu",
    "Indonesian": "id",
    "Malay": "ms",
    "Haitian Creole": "ht",
    "Quechua": "qu",
    "Bengali": "bn",
    "Tamil": "ta",
    "Mandarin": "zh",  # same code as "Chinese"
    "Romanian": "ro",
    "Polish": "pl",
    "Turkish": "tr",
    "Malayalam": "ml",
    "Hinglish": "en",  # NOTE(review): mapped to the same code as "English" - confirm this is intended
    "Ukrainian": "uk",
    "Czech": "cs",
    "Kannada": "kn",
    "Gujarati": "gu",
    "Oriya": "or",
    "Punjabi": "pa",
    "Assamese": "as",
    "Greek": "el",
    "Marathi": "mr",
    "Burmese": "my",
    "Dutch": "nl",
    "Danish": "da",
    "Swedish": "sv",
    "Icelandic": "is",
    "Luxembourgish": "lb",
    "Norwegian": "no",
    "Nepali": "ne",
    "Aymara": "ay",
    "Xhosa": "xh",
    "Yoruba": "yo",
    "Zulu": "zu",
    "Afrikaans": "af",
    "Javanese": "jv",
    "Somali": "so",
    "isiZulu": "zu",  # same code as "Zulu"
    "Luganda": "lg",
    "Malagasy": "mg",
    "Nigerian Pidgin": "pcm",
    "Hausa": "ha",
    "Galician": "gl",
    "Asturian": "ast",
    "Chinese Simplified": "zh",  # same code as "Chinese"
    "Gitksan": "git",
    "Uspanteko": "usp",
    "Natugu": "ntu",
    "Tsez": "tse",
    "Wolof": "wo",
    "Arapaho": "arp",
    "Bribri": "bzd",
    "Manchu": "mnc",
    "Southern Quechua": "qu",  # same code as "Quechua"
    "and Chinese": "zh",  # presumably a fragment of an "..., and Chinese" list entry - confirm
    "Lugala": "lg",  # NOTE(review): shares the code of "Luganda" - confirm which language is meant
    "Malagasay": "mg"  # alternate spelling, same code as "Malagasy"
}

In [None]:
# Function to convert language names to ISO codes
def map_to_iso(languages, mapping=None):
    """Convert a free-text list of language names into language codes.

    The text is split on commas and newlines. For each piece, parenthesised
    remarks and full stops are removed and surrounding whitespace is stripped
    before the name is looked up.

    Args:
        languages: The raw cell text, e.g. "English, French (Canada)".
            Anything that is not a string (such as the float NaN pandas uses
            for an empty cell) is treated as "no languages listed".
        mapping: Optional name-to-code dictionary. Defaults to the
            module-level ``iso_codes`` table.

    Returns:
        A list with one item per non-empty piece: the mapped code when the
        cleaned name is in ``mapping``, otherwise the cleaned name itself so
        that unrecognised entries stay visible for later verification.
    """
    if mapping is None:
        mapping = iso_codes

    # Empty cells are not strings and would make re.split raise a TypeError.
    if not isinstance(languages, str):
        return []

    mapped = []
    for language in re.split(r',|\n', languages):
        # Drop "(...)" remarks and full stops, then trim whitespace.
        clean_language = re.sub(r"\(.*?\)|\.", "", language).strip()
        # Trailing or doubled separators produce empty pieces; they name no language.
        if not clean_language:
            continue
        mapped.append(mapping.get(clean_language, clean_language))
    return mapped

# Pass every 'Language-Coverage' cell through map_to_iso and store the
# resulting list (one item per listed language) in a new column.
data['Language-Coverage-ISO'] = data['Language-Coverage'].apply(map_to_iso)

# Continent assigned to each language code produced by the iso_codes table.
# Codes that are missing here are skipped when continents are tallied, so
# every code iso_codes can emit needs an entry in this table.
continent_mapping = {
    "en": "Europe", "fr": "Europe", "de": "Europe", "es": "Europe", "zh": "Asia", "hi": "Asia",
    "ru": "Europe", "ar": "Asia", "vi": "Asia", "th": "Asia", "ur": "Asia", "sw": "Africa",
    "fi": "Europe", "bg": "Europe", "ca": "Europe", "pt": "Europe", "ja": "Asia", "it": "Europe",
    "ko": "Asia", "et": "Europe", "te": "Asia", "eu": "Europe", "id": "Asia", "ms": "Asia",
    "ht": "North America", "qu": "South America", "bn": "Asia", "ta": "Asia", "pl": "Europe",
    "tr": "Asia", "ml": "Asia", "uk": "Europe", "cs": "Europe", "kn": "Asia", "gu": "Asia",
    "or": "Asia", "pa": "Asia", "as": "Asia", "el": "Europe", "mr": "Asia", "ne": "Asia",
    "ay": "South America", "xh": "Africa", "yo": "Africa", "zu": "Africa", "af": "Africa",
    "jv": "Asia", "so": "Africa", "lg": "Africa", "mg": "Africa", "pcm": "Africa", "ha": "Africa",
    "gl": "Europe", "ast": "Europe", "nl": "Europe", "da": "Europe", "sv": "Europe", "is": "Europe",
    "lb": "Europe", "no": "Europe", "git": "North America", "usp": "North America", "ntu": "Oceania",
    # NOTE(review): "bzd" is the only code filed under "Central America"; confirm
    # whether it should share a label with the "North America" entries.
    "tse": "Asia", "wo": "Africa", "arp": "North America", "bzd": "Central America", "mnc": "Asia",
    # Romanian ("ro") and Burmese ("my") are emitted by iso_codes but had no
    # entry here, so those languages were dropped from every continent count.
    "ro": "Europe", "my": "Asia"
}

# Translate each paper's language codes into continents. Entries that have no
# continent mapping (unknown codes or unrecognised free-text names) cannot be
# placed and are left out of the per-paper lists.
papers_with_continents = [
    [continent_mapping[lang] for lang in paper if lang in continent_mapping]
    for paper in data['Language-Coverage-ISO']
]

# Report what was left out instead of dropping it silently, so that gaps in
# the name or continent tables show up before the counts are interpreted.
unmapped_languages = sorted({
    lang
    for paper in data['Language-Coverage-ISO']
    for lang in paper
    if lang and lang not in continent_mapping
})
if unmapped_languages:
    print(
        f"Warning: {len(unmapped_languages)} language entries have no continent "
        f"mapping and are excluded from the counts: {unmapped_languages}"
    )

# Dominant continent per paper, or 'Unknown' when none of its languages could
# be placed. Counter.most_common breaks ties by first occurrence.
most_frequent_continent = [
    Counter(conts).most_common(1)[0][0] if conts else 'Unknown'
    for conts in papers_with_continents
]

# Plot color for each continent label (plus the 'Unknown' fallback).
colors = {
    "North America": "blue", "Europe": "green", "Asia": "red", "Africa": "yellow", "South America": "purple",
    "Oceania": "cyan", "Central America": "magenta", "Unknown": "grey"
}


In [None]:
# One color per paper, taken from the paper's most frequent continent.
bar_colors = [
    colors[dominant_continent]
    for dominant_continent in most_frequent_continent
]

# Function to count the number of languages per continent for each paper
def count_continents(papers, continents=None):
    """Count, for every paper, how many of its languages fall on each continent.

    Args:
        papers: Iterable of per-paper lists of continent labels (one label
            per language).
        continents: Optional iterable of continent labels to report, in the
            order they should appear in the result. Defaults to the keys of
            the module-level ``colors`` table.

    Returns:
        A dict mapping each requested continent to a list with one count per
        paper, in the same order as ``papers``. Continents that a paper does
        not cover get a 0 for that paper.
    """
    if continents is None:
        continents = colors.keys()
    # Materialise once so a one-shot iterable can be walked for every paper.
    continents = list(continents)

    continent_counts = {continent: [] for continent in continents}
    for paper in papers:
        tally = Counter(paper)
        for continent in continents:
            # Counter returns 0 for labels it has not seen.
            continent_counts[continent].append(tally[continent])
    return continent_counts

continent_counts = count_continents(papers_with_continents)

# x positions (one bar per paper) and the running top of each bar's stack.
papers_indices = np.arange(len(data))
bottom = np.zeros(len(data))

# Stacked bar chart: one segment per continent, stacked in table order.
fig, ax = plt.subplots(figsize=(14, 8))
for continent, counts in continent_counts.items():
    # A continent with no languages in any paper would draw nothing but still
    # add a legend entry (always the case for 'Unknown'), so skip it.
    if not any(counts):
        continue
    ax.bar(papers_indices, counts, bottom=bottom, color=colors[continent], edgecolor='white', label=continent)
    # Raise the baseline so the next continent stacks on top of this one.
    bottom += np.array(counts)

ax.set_title('Number of Languages Covered by Each Paper with Continent Breakdown')
ax.set_xlabel('Paper Number')
ax.set_ylabel('Number of Languages')
ax.set_xticks(papers_indices)
# Only build a legend when at least one series was drawn.
handles, _ = ax.get_legend_handles_labels()
if handles:
    ax.legend(title='Continent')
plt.show()