### Import Section

In [5]:
import sys
import io
import os
import urllib.parse, urllib.request, json
import pandas as pd
import numpy as np
from typing import Iterable, Tuple, List, Set, Dict, Callable, Any, Optional
from pprint import PrettyPrinter
from collections import Counter,defaultdict
from tqdm import tqdm
pprint = PrettyPrinter(indent=4)
print = pprint.pprint

### Constant Section
Important!!! Change the DATA_PATH according to your configuration

In [10]:
DATA_PATH = os.path.join("c:\\","Users","alexp","Jupyter","NU","github","data","CE_SC_papers.csv")
GROUPS_PATH = os.path.join("c:\\","Users","alexp","Jupyter","NU","github","data","groups.txt")
USERNAME = "Kazakh British Technical University"
KEY = "ewpnmenfvbcliylsfkmvrbhbpblkmn"

### Reference Section

In [11]:
GROUPS = {
    "Core":({'unknown','abstract object','action','activity','process','product','resource','scientific object','series','service','source',},{}),
    "Method":({"method","technique","approach","design"},{}),
}

STOP_WORDS = {
    "Open access",
    "Copyright",
    "Scientific method",
    "Research",
    "Systematic review",
    "Information","Database",
    "Infrastructure",
    "Innovation",
    "Systematic review",
    "System",
    "Computer",
    "Density",
    "Machine",
    "API",
    "World Wide Web",
    "Thermal comfort",
    "Techno",
    "Taxonomy",

}

STOP_WORDS = {}

PROBLEM_WORDS = [
    "Steel",
    "Slag",
    "Rice",
    "Water",
    "Limestone",
    "Mineral",
    "Greenhouse gas",
    "Geopolymer",
    "Fossil fuel",
    "Fly ash",
    "Concrete","Composite material",
    "Alkali",
    "Wood"]

SYN_SETS = {
    "Sustainability":"Sustainability",
    "Sustainable development":"Sustainability",
    "Sustainable Development Goals":"Sustainability",
    "Sustainable architecture":"Sustainability",
}



In [8]:
#### Vocabulary of concepts
voc_conc = Counter()
#### Vocabulary of clazzes
voc_claz = Counter()
#### Generalizations of concepts to clazzes (is-a)
is_a = defaultdict(set)

In [12]:
with open(GROUPS_PATH,"r") as fh:
    ref_is_a = json.loads(fh.read())

GROUPS = dict()    
for k,v in ref_is_a.items():
    is_a[k].add(v)
    voc_conc[k]+=1
    voc_claz[v]+=1
    GROUPS[v] = ({v},{})
    
STOP_WORDS = {
    "Creative Commons",
    "Copyright"
}

In [13]:
print(voc_conc.most_common()[:10])
print("")
print(voc_claz.most_common()[:10])
print("")
print(is_a["3D printing"])
print("")
print(GROUPS)

[   ('Sustainable development', 1),
    ('Sustainable Development Goals', 1),
    ('Sustainability', 1),
    ('Circular economy', 1),
    ('Scientific method', 1),
    ('Science', 1),
    ('Research and development', 1),
    ('Research', 1),
    ('Open access', 1),
    ('Literature', 1)]
''
[   ('General Technology', 14),
    ('Environment', 12),
    ('Materials', 10),
    ('Science and Research', 9),
    ('Built Environment', 9),
    ('Waste Management', 5),
    ('Sustainability', 3),
    ('Economics', 3),
    ('Circular economy', 1)]
''
{'General Technology'}
''
{   'Built Environment': ({'Built Environment'}, {}),
    'Circular economy': ({'Circular economy'}, {}),
    'Economics': ({'Economics'}, {}),
    'Environment': ({'Environment'}, {}),
    'General Technology': ({'General Technology'}, {}),
    'Materials': ({'Materials'}, {}),
    'Science and Research': ({'Science and Research'}, {}),
    'Sustainability': ({'Sustainability'}, {}),
    'Waste Management': ({'Waste Manageme

In [6]:
set([list(val)[0] for val in is_a.values()])

{'Built Environment',
 'Circular economy',
 'Economics',
 'Environment',
 'General Technology',
 'Materials',
 'Science and Research',
 'Sustainability',
 'Waste Management'}

In [7]:
def chunk_apply(self, source: str, target: str, func: Callable, parts: int, start: int = 0):
    """
    Applies a function to chunks of a DataFrame column and stores the results in another column.

    This method divides a DataFrame into specified parts and applies a function to a
    column (`source`) in chunks, then stores the results in another column (`target`).
    It uses a progress bar via tqdm to display progress.

    Parameters:
    - self: The DataFrame to operate on.
    - source (str): The name of the column from which data is read.
    - target (str): The name of the column where results of the function are stored.
    - func (Callable): The function to apply to each element of the source column.
    - parts (int): The number of parts to divide the DataFrame into for processing.
    - start (int, optional): The starting index from which to begin processing. Default is 0.

    Returns:
    - None: The function modifies the DataFrame in place and does not return anything.
    """
    from tqdm import tqdm
    tqdm.pandas(desc="Chunk apply")
    chunk_size = int(len(self) / parts)
    for chunk_start in range(start, len(self), chunk_size):
        chunk_end = min(chunk_start + chunk_size, len(self))
        self.loc[chunk_start:chunk_end, target] = self.loc[chunk_start:chunk_end, source].progress_apply(func)

        
pd.DataFrame.chunk_apply = chunk_apply
#pd.DataFrame.chunk_apply(func = None,parts = 10,start = 0)

In [8]:
def call_wikifier(text: str, lang: str = "en", threshold: float = 0.8, retry: int = 0) -> Optional[List[Tuple[str, str]]]:
    """
    Calls the Wikifier service to annotate text with relevant Wikipedia links.
    
    Args:
    text (str): The text to be annotated.
    lang (str, optional): The language of the text. Defaults to "en" (English).
    threshold (float, optional): The page rank square threshold for selecting annotations. Defaults to 0.8.
    retry (int, optional): The retry count for handling request failures. Automatically retries up to 5 times. Defaults to 0.
    
    Returns:
    Optional[List[Tuple[str, str]]]: A list of tuples containing the title and URL of the Wikipedia page if successful, None otherwise.
    
    This function constructs a URL request to the Wikifier service, handling retries and errors. If the number of retries exceeds 5, it returns None. For each successful call, it returns a list of annotated Wikipedia links based on the specified threshold.
    """
    # Prepare the URL.
    if retry > 5:
        return None
    if retry > 0:
        time.sleep(retry)
    try:
        data = urllib.parse.urlencode([
            ("text", text), ("lang", lang),
            ("userKey", "Your_User_Key_Here"),  # Replace with your actual user key.
            ("pageRankSqThreshold", "%g" % threshold), ("applyPageRankSqThreshold", "true"),
            ("nTopDfValuesToIgnore", "200"), ("nWordsToIgnoreFromList", "200"),
            ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
            ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
            ("includeCosines", "false"), ("maxMentionEntropy", "3")
        ])
        url = "http://www.wikifier.org/annotate-article"
        # Call the Wikifier and read the response.
        req = urllib.request.Request(url, data=data.encode("utf8"), method="POST")
        with urllib.request.urlopen(req, timeout=60) as f:
            response = f.read()
            response = json.loads(response.decode("utf8"))
        # Output the annotations.
        return [(annotation["title"], annotation["url"]) for annotation in response["annotations"]]
    except Exception as error:
        print(f"Error on attempt {retry+1}: {error}")
        return call_wikifier(text, lang, threshold, retry + 1)

def get_concept_info(title: str, lang: str = "en", secLang: str = "ru", retry: int = 0):
    """
    Retrieves concept information from the Wikifier service.

    This function queries the Wikifier "concept-info" API to get detailed information about a specified concept,
    including its labels in different languages. It supports retrying the request in case of failures.

    Parameters:
    title (str): The title of the concept to query.
    lang (str): The primary language for the query (default is English).
    secLang (str): A secondary language for additional labels (default is Russian).
    retry (int): The number of retry attempts made in case of a failure.

    Returns:
    list of str: A list of labels of the concept in the secondary language, or an empty list on failure.

    Raises:
    Exception: Propagates exceptions after exceeding retry limits with a descriptive error message.
    """
    # Prepare the URL.
    if retry > 5:
        raise Exception("Maximum retry attempts exceeded.")
    if retry > 0:
        time.sleep(retry)  # Backoff before retrying

    try:
        base_url = "http://www.wikifier.org/concept-info"
        params = {
            "lang": lang,
            "title": title,
            "secLang": secLang
        }
        url = base_url + "?" + urllib.parse.urlencode(params)

        with urllib.request.urlopen(url) as response:
            data = response.read().decode('utf-8')
            concept_info = json.loads(data)
            return [el["enLabel"] for el in concept_info["wikiDataClasses"]]
    except Exception as e:
        print("Error:", e)
        return get_concept_info(title, lang, secLang, retry + 1)


In [9]:
def get_hyperonyms(term: str, lang: str = 'en') -> list:
    """
    Retrieve hyperonyms (superordinate terms) for a given term using the ConceptNet API.

    Args:
    term (str): The term for which to find hyperonyms.
    lang (str): The language of the term (default is 'en' for English).

    Returns:
    list: A list of hyperonyms for the specified term. Returns None if the API request fails.

    Example:
    >>> hyperonyms = get_hyperonyms('apple')
    >>> print("Hyperonyms of 'apple':", hyperonyms)
    """
    
    # Construct the URL and parameters for the ConceptNet API
    base_url = "http://api.conceptnet.io/query"
    params = {
        'node': f'/c/{lang}/{term}',
        'rel': '/r/IsA',
        'limit': 10  # limit the results to 10; adjust as needed
    }
    
    # Encode parameters and construct full URL
    query_string = urllib.parse.urlencode(params)
    url = f"{base_url}?{query_string}"
    
    try:
        # Make the HTTP GET request to the ConceptNet API
        with urllib.request.urlopen(url) as response:
            # Decode the response and load it into JSON
            response_text = response.read().decode('utf-8')
            data = json.loads(response_text)
            hyperonyms = []
            
            # Extract hyperonyms from the API response
            for edge in data['edges']:
                # Get the start or end node as a hyperonym depending on the direction
                if edge['start']['@id'] == f'/c/{lang}/{term}':
                    hyperonym = edge['end']['label']
                else:
                    hyperonym = edge['start']['label']
                hyperonyms.append(hyperonym)
            
            return hyperonyms
    except urllib.error.URLError as e:
        print("Failed to fetch data:", e)
        return None


# Example usage
#term = "matrix"
#hyperonyms = get_hyperonyms(term)
#print(("Hyperonyms of", term, ":", hyperonyms))
#for term in PROBLEM_WORDS:
#    print(get_hyperonyms(term))

In [10]:
def flag_clazz(text: str, features: Tuple[Set[str], Set[str]]) -> bool:
    """
    Determine if a text contains any substrings from the first set in 'features'
    and none from the second set.
    
    Args:
    text (str): The text to check.
    features (Tuple[Set[str], Set[str]]): A tuple containing two sets of substrings,
                                         the first to check for presence and the second for absence.
    
    Returns:
    bool: True if conditions are met, False otherwise.
    """
    return any(subs in text for subs in features[0]) and not any(subs in text for subs in features[1])

def filter_clazz(texts: Iterable[str], groups: Dict[str, Tuple[Set[str], Set[str]]]):
    """
    A generator function that yields tuples of group keys and texts from a collection
    where the texts match the feature criteria of the corresponding group.
    
    Args:
    texts (Iterable[str]): A collection of text strings to be filtered.
    groups (Dict[str, Tuple[Set[str], Set[str]]]): A dictionary of groups, each associated with
                                                   a tuple of sets defining the inclusion and exclusion criteria.
    
    Yields:
    Tuple[str, str]: Tuples where the first element is a group key and the second is a text matching that group's criteria.
    """
    res = []
    for k, v in groups.items():
        for text in texts:
            if flag_clazz(text, v):
                yield (k, text)

def filter_clazz(texts: Iterable[str], groups: Dict[str, Tuple[Set[str], Set[str]]]) -> str:
    """
    Filters texts by applying group criteria from 'groups' and returns the most common group
    label based on the frequency of matches, or a default label if no matches are found.
    
    Args:
    texts (Iterable[str]): A collection of text strings to be filtered.
    groups (Dict[str, Tuple[Set[str], Set[str]]]): A dictionary of groups with their respective features.
    
    Returns:
    str: The most common group label or "Core" if no matches are found.
    """
    labels = [k for text in texts for k, v in groups.items() if flag_clazz(text, v)]
    if labels:
        buff = Counter(labels)
        return buff.most_common()[0][0]
    return "Core"

def filter_clazz(concept: Any, groups: Dict[str, Tuple[Set[str], Set[str]]], default:str = None) -> str:
    """
    Determines the group classification of a concept based on the predefined groups.

    Args:
    concept (Any): The concept to be classified.
    groups (Dict[str, Tuple[Set[str], Set[str]]]): Groups with their respective criteria for classification.

    Returns:
    str: The classification of the concept, if any.
    """
    for group_name, (includes, excludes) in groups.items():
        if any(include in concept for include in includes) and not any(exclude in concept for exclude in excludes):
            return group_name
    return default
            
#print([el for el in voc_claz.items()  if flag_clazz(el[0],GROUPS["Technology"]) ])
#print([el for el in voc_claz.items()  if flag_clazz(el[0],GROUPS["Material"]) ])
#print([el for el in voc_claz.items() if flag_clazz(el[0],GROUPS["Method"]) ])



In [11]:
with open("wikifier.pkl","rb") as fh:
    df = pd.read_pickle(fh)
    



In [12]:
with open("ONTO_9640_article_text_string_lt_1m.xlsx","rb") as fh:
    onto = pd.read_excel(fh)
#onto
#onto = None

#### Get Wiki concepts from Texts

In [13]:
#df.chunk_apply("abstract","concepts",func = call_wikifier,parts = 10,start = 0)

#### Get Wiki classes (is-a)

In [14]:
#df.chunk_apply("concepts","concepts",func = lambda concepts: [(el[0],get_concept_info(el[0])) for el in concepts],parts = 10,start = 8)


In [15]:
df.columns

Index(['author', 'title', 'abstract', 'article_text_string',
       'article_text_list', 'number_of_symbols', 'number_of_words',
       'number_of_sentences', 'concepts'],
      dtype='object')

#### Create Frequency Vocabular of Concepts, Clazzz and Is-A relation

In [16]:
def obsolete():
    for i in range(len(df)):
        for j in range(len(df["concepts"].iloc[i])):
            text = df["concepts"].iloc[i][j][0]
            if text not in STOP_WORDS:

                voc_conc[text] += 1
                if df["concepts"].iloc[i][j][1]:
                    for k in range(len(df["concepts"].iloc[i][j][1])):
                        voc_claz[df["concepts"].iloc[i][j][1][k]] += 1 
                        is_a[text].add(df["concepts"].iloc[i][j][1][k])
                else:
                    is_a[text].add("unknown")


    is_a["Value chain"]=set()
    is_a["Value chain"].add("unknown")

    is_a["Sustainable Development Goals"]=set()
    is_a["Sustainable Development Goals"].add("unknown")

    for word in PROBLEM_WORDS:
        for el in ["material","chemical","substance","element","component"]:
            is_a[word].add(el)
#obsolete()

In [17]:
#is_a["Smart contract"]

In [18]:
def build_concept_matrix(df: pd.DataFrame, stop_words: Set[str]) -> Counter:
    """
    Builds a co-occurrence matrix for concepts listed in each row of a DataFrame,
    excluding any concepts that are contained within a set of stop words.

    Args:
    df (pd.DataFrame): A DataFrame containing a column 'concepts' where each cell is a list of tuples,
                       each tuple containing a concept and potentially other associated data.
    stop_words (Set[str]): A set of words to exclude from the matrix calculation.

    Returns:
    Counter: A Counter object representing the co-occurrence matrix where the keys are tuples of concepts
             and the values are the counts of their co-occurrences in the DataFrame.
    """
    concept_matrix = Counter()
    for concepts in df['concepts']:
        for i, el1 in enumerate(concepts):
            for j in range(i + 1, len(concepts)):
                el2 = concepts[j]
                if el1[0] not in stop_words and el2[0] not in stop_words:
                    concept_matrix[(el1[0], el2[0])] += 1
                    concept_matrix[(el2[0], el1[0])] += 1
    return concept_matrix


In [19]:
# TODO el1[0], el2[0] => el1, el2 
def build_concept_matrix(concepts_series: pd.Series, stop_words: Set[str], syn_sets: Dict[str,str] = None) -> Counter:
    """
    Builds a co-occurrence matrix for concepts listed in each element of a pandas Series,
    excluding any concepts that are contained within a set of stop words.

    Args:
    concepts_series (pd.Series): A Series where each element is a list of tuples,
                                 each tuple containing a concept and potentially other associated data.
    stop_words (Set[str]): A set of words to exclude from the matrix calculation.

    Returns:
    Counter: A Counter object representing the co-occurrence matrix where the keys are tuples of concepts
             and the values are the counts of their co-occurrences in the Series.
    """
    concept_matrix = Counter()
    for concepts in concepts_series:
        for i, el1 in enumerate(concepts):
            for j in range(i + 1, len(concepts)):
                el2 = concepts[j]
                if el1[0] not in stop_words and el2[0] not in stop_words:
                    if not syn_sets:
                        concept_matrix[(el1[0], el2[0])] += 1
                        concept_matrix[(el2[0], el1[0])] += 1
                    else:
                        key1 = syn_sets[el1[0]] if el1[0] in syn_sets else el1[0]
                        key2 = syn_sets[el2[0]] if el2[0] in syn_sets else el2[0]
                        concept_matrix[(key1,key2)] += 1
                        concept_matrix[(key2,key1)] += 1
    return concept_matrix


In [20]:
bufff = build_concept_matrix(df["concepts"],STOP_WORDS,SYN_SETS)

In [21]:
bufff.most_common()[:10]

[(('Construction', 'Sustainability'), 144),
 (('Sustainability', 'Construction'), 144),
 (('Technology', 'Sustainability'), 91),
 (('Sustainability', 'Technology'), 91),
 (('Sustainability', 'Circular economy'), 75),
 (('Circular economy', 'Sustainability'), 75),
 (('Sustainability', 'Energy'), 67),
 (('Energy', 'Sustainability'), 67),
 (('Sustainability', 'Sustainability'), 66),
 (('Construction', 'Circular economy'), 64)]

In [22]:
#[(filter_clazz(is_a[el[0][0]],GROUPS),filter_clazz(is_a[el[0][1]],GROUPS)) for el in concept_matrix.most_common()]

In [23]:
def filter_by_percentile(concept_matrix: Counter, percentile: float = 90) -> List[Tuple[Tuple[str, str], int]]:
    """
    Filters concept pairs from a co-occurrence matrix that meet or exceed the specified percentile.

    Args:
    concept_matrix (Counter): The co-occurrence matrix of concepts.
    percentile (float): The percentile to use as a cutoff for filtering.

    Returns:
    List[Tuple[Tuple[str, str], int]]: A list of concept pairs with their frequencies that meet or exceed the cutoff.
    """
    freqs = np.array([el[1] for el in concept_matrix.most_common()])
    cutoff = np.percentile(freqs, percentile)
    print(f'Cutoff frequency: {cutoff}')
    return [(pair, freq) for pair, freq in concept_matrix.items() if freq >= cutoff]
fltr_list = filter_by_percentile(bufff)
print(len(fltr_list))

'Cutoff frequency: 2.0'
9871


In [24]:
def filter_by_degree(concept_matrix: Counter, cutoff: float = 50) -> List[Tuple[Tuple[str, str], int]]:
    """
    Filters concept pairs from co-occurence matrix that have particular degree.
    
    Args:
    concept_matrix (Counter): The co-occurence matrix of concepts.
    percentile (float): The percentile to use as a cutodd for filtering.
    
    Returns:
    List[Tuple[Tuple[str,str],int]: A list of concept pairs with their freqs that have the concrete degree.
    
    """
    degree = Counter()
    for el in concept_matrix.items():
        degree[el[0][0]]+=1
    # freqs = np.array([el[1] for el in degree.most_common()])
    # cutoff = np.percentile(freqs, percentile)
    concepts = [el[0] for el in degree.most_common() if el[1]>cutoff]
    print(f'Cutoff degree: {cutoff}')
    return [(pair, freq) for pair, freq in concept_matrix.items() if pair[0] in concepts or pair[1] in concepts]
fltr_list = filter_by_degree(bufff, cutoff = 50)
print(len(fltr_list))

'Cutoff degree: 50'
40749


In [25]:
#degree = Counter()
#for el in bufff.items():
#    degree[el[0][0]]+=1


In [26]:
#filter_by_degree(bufff, 99)

In [27]:
def counter_to_dataframe(filtered_pairs: List[Tuple[Tuple[str, str], int]], is_a: Dict[str, Any], groups: Dict[str, Tuple[Set[str], Set[str]]]) -> pd.DataFrame:
    """
    Transforms a list of filtered concept pairs into a DataFrame with additional categorization based on provided groups.

    Args:
    filtered_pairs (List[Tuple[Tuple[str, str], int]]): A list of concept pairs and their frequencies.
    is_a (Dict[str, Any]): A dictionary mapping concepts to categories or other properties.
    groups (Dict[str, Tuple[Set[str], Set[str]]]): A dictionary defining groups and their inclusion/exclusion criteria for categorization.

    Returns:
    pd.DataFrame: A DataFrame with categorized concept pairs and their frequencies.
    """
    buff = []
    for (el1, el2), freq in filtered_pairs:
        clazz1 = filter_clazz(is_a[el1], groups)
        clazz2 = filter_clazz(is_a[el2], groups)
        if clazz1 and clazz2:
            buff.append({
                "Section(Row)": clazz1,
                "Section(Column)": clazz2,
                "Name(Row)": el1,
                "Name(Column)": el2,
                "Frequency": freq
            })
    return pd.DataFrame(buff)
lin_df = counter_to_dataframe(fltr_list,is_a,GROUPS)
print(len(lin_df))

3145


In [28]:
def create_matrix_from_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Processes a DataFrame to sort by specified criteria, extract unique indices, 
    and populate a new DataFrame with multi-level indices and columns based on frequencies.

    Args:
    df (pd.DataFrame): The input DataFrame that contains 'Section' and 'Name' categories with 'Frequency' data.

    Returns:
    pd.DataFrame: A new DataFrame where rows and columns are multi-level indices representing unique 'Section' and 'Name' combinations, and the cell values are 'Frequency' from the original DataFrame.
    """
    # Sort by row identifiers and calculate unique row indices
    # df = df.sort_values(by=['Section(Row)', 'Name(Row)', 'Frequency'], ascending=False)
    df = df.sort_values(by=['Section(Row)', 'Frequency'], ascending=False)

    unique_rows = df[['Section(Row)', 'Name(Row)']].drop_duplicates()
    indices = pd.MultiIndex.from_tuples(unique_rows.values.tolist())
    print(f"Number of unique rows: {len(indices)}")

    # Sort by column identifiers and calculate unique column indices
    #df = df.sort_values(by=['Section(Column)', 'Name(Column)', 'Frequency'], ascending=False)
    df = df.sort_values(by=['Section(Column)', 'Frequency'], ascending=False)
    unique_columns = df[['Section(Column)', 'Name(Column)']].drop_duplicates()
    columns = pd.MultiIndex.from_tuples(unique_columns.values.tolist())
    print(f"Number of unique columns: {len(columns)}")

    # Initialize an empty data array to create the DataFrame
    data = np.zeros((len(indices), len(columns)))

    # Create the DataFrame with specified indices and columns
    result = pd.DataFrame(data, index=indices, columns=columns)

    # Populate the DataFrame using the frequencies from the original DataFrame
    for i in tqdm(range(len(df)), desc="Populating matrix"):
        result.loc[
            (df.iloc[i]['Section(Row)'], df.iloc[i]['Name(Row)']),
            (df.iloc[i]['Section(Column)'], df.iloc[i]['Name(Column)'])
        ] = df.iloc[i]['Frequency']

    return result
result = create_matrix_from_dataframe(lin_df)

'Number of unique rows: 63'
'Number of unique columns: 63'


Populating matrix: 100%|██████████████████████████████████████████████████████████| 3145/3145 [00:05<00:00, 543.87it/s]


In [None]:
result

In [None]:
result.to_excel('concept_matrix_top7.xlsx', engine='openpyxl')

In [135]:
CORE_CONCEPTS = ["Circular economy","Sustainability"]
df = lin_df.sort_values(by=['Section(Row)', 'Frequency'], ascending=False)
unique_rows = df[['Section(Row)']].drop_duplicates()

In [136]:
pearson_corr = result.corr(method='pearson')

In [137]:
def top_concepts(df:pd.DataFrame, core_concept:str, ascending=False) -> pd.DataFrame:
    # Assume `df` is your DataFrame containing Numerical coefficients
    # and `CORE_CONCEPTS` is a list of concepts you are interested in.

    # Extract the Series for the first concept in CORE_CONCEPTS
    series = df[core_concept]

    # Ensure we have a Series
    if isinstance(series, pd.DataFrame):
        series = series.iloc[:, 0]

    # Sort the Series in descending order
    sorted_series = series.sort_values(ascending=ascending)

    # Get the top 10 correlated concepts, excluding the first one which is the concept itself
    top_series = sorted_series

    # Display the top 10 correlated concepts and their correlation values
    return pd.DataFrame([{"Concept":concept[1], "Value":f"{value:.2f}"} for concept, value in top_series.items()])
        


In [146]:
df_cfreq = pd.concat([top_concepts(result,CORE_CONCEPTS[0]).iloc[0:10],top_concepts(result,CORE_CONCEPTS[1]).iloc[0:10]], keys = CORE_CONCEPTS,axis=1)
df_cfreq = df_cfreq.style.set_caption('Top 10 Most Frequent Concept Pairs with "Circular Economy" and "Sustainable Development"')
with open("table.txt","a") as fh:
    fh.write(df_cfreq.to_latex())

In [139]:
df_cfreq

Unnamed: 0_level_0,Circular economy,Circular economy,Sustainability,Sustainability
Unnamed: 0_level_1,Concept,Value,Concept,Value
0,Sustainability,75.0,Construction,144.0
1,Construction,64.0,Technology,91.0
2,Economy,35.0,Circular economy,75.0
3,Recycling,32.0,Energy,67.0
4,Waste management,31.0,Sustainability,66.0
5,Scientific method,30.0,Scientific method,61.0
6,Technology,29.0,Industrial Revolution,60.0
7,Manufacturing,27.0,Concrete,56.0
8,Industrial Revolution,27.0,Manufacturing,53.0
9,Built environment,22.0,Research,48.0


In [140]:
df_cpearson = pd.concat([top_concepts(pearson_corr,CORE_CONCEPTS[0]).iloc[0:11],top_concepts(pearson_corr,CORE_CONCEPTS[1]).iloc[0:11]], keys = CORE_CONCEPTS, axis=1)
df_cpearson = df_cpearson.style.set_caption('Top 10 Concept Pairs Most Strongly Correlated with "Circular Economy" and "Sustainable Development" Using Pearson Correlation')
with open("table2.txt","a") as fh:
    fh.write(df_cpearson.to_latex())

In [141]:
df_cpearson_bottom = pd.concat([top_concepts(pearson_corr,CORE_CONCEPTS[0]).iloc[-11:],top_concepts(pearson_corr,CORE_CONCEPTS[1]).iloc[-11:]], keys = CORE_CONCEPTS, axis=1)
df_cpearson_bottom

Unnamed: 0_level_0,Circular economy,Circular economy,Sustainability,Sustainability
Unnamed: 0_level_1,Concept,Value,Concept,Value
52,Cement,0.65,Urbanization,0.61
53,Machine,0.64,Clay,0.61
54,Materials science,0.63,Machine,0.6
55,Pollution,0.63,System,0.59
56,Research and development,0.62,Climate change,0.59
57,System,0.62,Biodiversity,0.58
58,Compressive strength,0.61,Materials science,0.58
59,Information,0.61,Composite material,0.57
60,Open access,0.58,Carbon footprint,0.57
61,Carbon footprint,0.56,Efficient energy use,0.54


In [142]:
df_cpearson

Unnamed: 0_level_0,Circular economy,Circular economy,Sustainability,Sustainability
Unnamed: 0_level_1,Concept,Value,Concept,Value
0,Circular economy,1.0,Sustainability,1.0
1,Industrial Revolution,0.84,Manufacturing,0.82
2,Infrastructure,0.83,Industrial Revolution,0.8
3,Technology,0.82,Construction waste,0.78
4,Engineering,0.81,Economy,0.78
5,Economic growth,0.81,Research and development,0.78
6,Manufacturing,0.8,Waste management,0.77
7,Paper,0.79,Paper,0.77
8,Research,0.79,Built environment,0.77
9,Construction waste,0.78,Scientific method,0.77


In [143]:
pearson_corr.to_excel('pearson_corr.xlsx', engine='openpyxl')

In [144]:
spearman_corr = result.corr(method='spearman')

In [145]:
df_cspearman = pd.concat([top_concepts(spearman_corr,CORE_CONCEPTS[0]).iloc[0:11],top_concepts(spearman_corr,CORE_CONCEPTS[1]).iloc[0:11]], axis=1)
df_cspearman = df_cspearman.style.set_caption('Top 10 Concept Pairs Most Strongly Correlated with "Circular Economy" and "Sustainable Development" Using Spearman Correlation')
with open("table.txt","a") as fh:
    fh.write(df_cspearman.to_latex())

In [None]:
with open("table.txt","a") as fh:
    fh.write(df_cspearman.to_latex())

In [None]:
df_cspearman_bottom = pd.concat([top_concepts(spearman_corr,CORE_CONCEPTS[0]).iloc[-11:],top_concepts(spearman_corr,CORE_CONCEPTS[1]).iloc[-11:]], axis=1)

In [None]:
df_cspearman_bottom

In [None]:
spearman_corr.loc[CORE_CONCEPTS[0]]

In [None]:
spearman_corr.loc[CORE_CONCEPTS[1]]

In [None]:
spearman_corr.to_excel('spearman_corr.xlsx', engine='openpyxl')

In [30]:
def create_section_matrix_from_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values(by=['Section(Row)', 'Frequency'], ascending=False)

    unique_rows = df[['Section(Row)']].drop_duplicates()
    indices = pd.MultiIndex.from_tuples(unique_rows.values.tolist())
    print(f"Number of unique rows: {len(indices)}")

    # Sort by column identifiers and calculate unique column indices
    #df = df.sort_values(by=['Section(Column)', 'Name(Column)', 'Frequency'], ascending=False)
    df = df.sort_values(by=['Section(Column)', 'Frequency'], ascending=False)
    unique_columns = df[['Section(Column)']].drop_duplicates()
    columns = pd.MultiIndex.from_tuples(unique_columns.values.tolist())
    print(f"Number of unique columns: {len(columns)}")
    # Initialize an empty data array to create the DataFrame
    data = np.zeros((len(indices), len(columns)))
    # Create the DataFrame with specified indices and columns
    result = pd.DataFrame(data, index=indices, columns=columns)
    for i in tqdm(range(len(df)), desc="Populating matrix"):
        result.loc[
            (df.iloc[i]['Section(Row)']),
            (df.iloc[i]['Section(Column)'])
        ] += df.iloc[i]['Frequency']    
    return result

In [127]:
section_matrix = create_section_matrix_from_dataframe(lin_df)

'Number of unique rows: 9'
'Number of unique columns: 9'


Populating matrix: 100%|██████████████████████████████████████████████████████████| 3145/3145 [00:17<00:00, 183.73it/s]


In [128]:
COLUMNS = [
    'Sustainability',
    'Circular economy',
    'Built Environment',
    'Economics',
    'Environment',
    'General Technology',
    'Materials',
    'Science and Research',
    'Waste Management']

section_matrix = section_matrix[COLUMNS].loc[COLUMNS]


for el in ['Sustainability', 'Circular economy', 'Built Environment', 'Economics', 'Environment', 'General Technology', 'Materials', 'Science and Research', 'Waste Management']:
    section_matrix.loc[el,el] = 0

for col in COLUMNS:
    total = section_matrix[col].sum()[0]
    #print((col, total))
    section_matrix[col] = section_matrix[col].apply(lambda x : round(x/total,2))

In [112]:

for col in COLUMNS:
    arr = np.array([section_matrix[col].iloc[ind][0] for ind in range(len(section_matrix[col]))])
    section_matrix[col] = arr/arr.sum()

In [129]:
for col in COLUMNS:
    print(section_matrix[col].sum())

Sustainability    1.01
dtype: float64
Circular economy    1.0
dtype: float64
Built Environment    0.99
dtype: float64
Economics    1.0
dtype: float64
Environment    1.0
dtype: float64
General Technology    1.0
dtype: float64
Materials    0.99
dtype: float64
Science and Research    1.0
dtype: float64
Waste Management    1.0
dtype: float64


In [126]:
round(0.51234,2)

0.51

In [130]:
for el in ['Sustainability', 'Circular economy', 'Built Environment', 'Economics', 'Environment', 'General Technology', 'Materials', 'Science and Research', 'Waste Management']:
    section_matrix.loc[el,el] = "-"
section_matrix

Unnamed: 0,Sustainability,Circular economy,Built Environment,Economics,Environment,General Technology,Materials,Science and Research,Waste Management
Sustainability,-,0.09,0.13,0.12,0.13,0.14,0.14,0.12,0.1
Circular economy,0.04,-,0.05,0.07,0.06,0.05,0.05,0.06,0.08
Built Environment,0.17,0.15,-,0.15,0.14,0.19,0.15,0.18,0.13
Economics,0.05,0.06,0.05,-,0.04,0.05,0.02,0.07,0.04
Environment,0.15,0.15,0.13,0.13,-,0.16,0.19,0.13,0.15
General Technology,0.25,0.18,0.27,0.21,0.25,-,0.24,0.3,0.22
Materials,0.14,0.1,0.12,0.05,0.17,0.14,-,0.07,0.18
Science and Research,0.13,0.14,0.16,0.2,0.12,0.18,0.07,-,0.1
Waste Management,0.08,0.13,0.08,0.07,0.09,0.09,0.13,0.07,-


In [131]:
with open("table3.txt","w") as fh:
    fh.write(section_matrix.to_latex())

In [None]:
result.to_pickle('concept_matrix_top2.pickle')

In [None]:
for el in voc_conc.most_common():
    print(is_a[el])

In [None]:
df_conc=pd.DataFrame(voc_conc.items(), columns=["text","freq"])

In [None]:
df_claz = pd.DataFrame(voc_claz.items(), columns=["text","freq"])

In [None]:
df_claz_fltr = df_claz[(df_claz["freq"]>df_claz["freq"].quantile(0.5)) & (df_claz["freq"]<df_claz["freq"].quantile(0.75))]

In [None]:
df_conc_fltr = df_conc[(df_conc["freq"]>df_conc["freq"].quantile(0.25)) & (df_conc["freq"]<df_conc["freq"].quantile(0.75))]

In [None]:
#ToDo exclude the common clazzz with TFIDF measure, at the moment that was done with the help of quantile filteringa
df_conc

In [None]:
df["concepts"] = df["abstract"].progress_apply(CallWikifier)

In [None]:
df.to_excel("wikifier2.xlsx") 

In [None]:
df.to_pickle("wikifier.pkl") 

In [None]:
df.iloc[11].concepts

In [None]:
df.iloc[11].abstract

In [None]:
your_json = CallWikifier(df.iloc[0].abstract)

In [None]:
print(json.dumps(your_json, indent=4))

In [None]:
import pandas as pd
import numpy as np

# Define entities and attributes
entities = ['US NASA', 'United Launch Alliance', 'Stennis Space Center', 'Roscosmos', 'Rocket Factory Augsburg']
attributes = ['Satellite', 'Spacecraft', 'Research', 'Innovation', 'Funding', 'Launch', 'Cooperation']

# Generate random data
np.random.seed(42)  # for reproducibility
data = np.random.poisson(lam=5, size=(len(entities), len(attributes)))

# Create the DataFrame
df = pd.DataFrame(data, index=entities, columns=attributes)

# Optionally add more realistic touch by setting some entries to 0
for _ in range(10):  # set 10 random points to 0
    i = np.random.randint(0, len(entities))
    j = np.random.randint(0, len(attributes))
    df.iat[i, j] = 0

print(df)


In [None]:
def highlight_cells(val):
    color = 'salmon' if val > 0 else ''
    return f'background-color: {color}'

styled_df = df.style.applymap(highlight_cells)
styled_df


In [None]:
# Export to Excel
styled_df.to_excel('concept_matrix.xlsx', engine='openpyxl')


In [None]:
from collections import defaultdict
import json

# Original dictionary
data = {
    "Waste management": "Environmental Management",
    "Waste": "Environmental Management",
    "Urbanization": "Civil Engineering",
    "Technology": "General Technology",
    "System": "General Technology",
    "Sustainable development": "Sustainability",
    "Sustainable architecture": "Sustainability",
    "Sustainable Development Goals": "Sustainability",
    "Sustainability": "Sustainability",
    "Scientific method": "Science and Research",
    "Science": "Science and Research",
    "Research and development": "Science and Research",
    "Research": "Science and Research",
    "Renewable energy": "General Technology",
    "Recycling": "Environmental Management",
    "Raw material": "Materials Science",
    "Pollution": "Environmental Science",
    "Paper": "Materials Science",
    "Open access": "Science and Research",
    "Natural resource": "Environmental Science",
    "Natural environment": "Environmental Science",
    "Materials science": "Materials Science",
    "Manufacturing": "General Technology",
    "Machine": "General Technology",
    "Literature": "Science and Research",
    "Life-cycle assessment": "Environmental Science",
    "Landfill": "Environmental Management",
    "Knowledge": "Science and Research",
    "Infrastructure": "Civil Engineering",
    "Information": "General Technology",
    "Industrial Revolution": "General Technology",
    "Greenhouse gas emissions": "Environmental Science",
    "Greenhouse gas": "Environmental Science",
    "Green building": "Sustainability",
    "Environmentally friendly": "Sustainability",
    "Engineering": "Civil Engineering",
    "Energy": "General Technology",
    "Efficient energy use": "General Technology",
    "Education": "Science and Research",
    "Economy": "Economics",
    "Economics": "Economics",
    "Economic growth": "Economics",
    "Ecology": "Environmental Science",
    "Design": "General Technology",
    "Decision-making": "General Technology",
    "Database": "General Technology",
    "Construction waste": "Environmental Management",
    "Construction": "Civil Engineering",
    "Concrete": "Materials Science",
    "Computer": "General Technology",
    "Compressive strength": "Materials Science",
    "Composite material": "Materials Science",
    "Climate change": "Environmental Science",
    "Clay": "Materials Science",
    "Circular economy": "Economics",
    "Cement": "Materials Science",
    "Carbon footprint": "Environmental Science",
    "Carbon": "Materials Science",
    "Built environment": "Civil Engineering",
    "Building material": "Materials Science",
    "Building information modeling": "Civil Engineering",
    "Biodiversity": "Environmental Science",
    "Bibliometrics": "Science and Research",
    "Architecture": "Civil Engineering",
    "Air pollution": "Environmental Science",
    "3D printing": "General Technology"
}

# Grouping by values and sorting each group by keys
grouped = defaultdict(list)
for key, value in sorted(data.items(), key=lambda item: item[0]):  # Sort by key (term)
    grouped[value].append(key)

# Optionally, sort categories themselves (values)
sorted_grouped = {category: sorted(keys) for category, keys in sorted(grouped.items())}

# Printing the result
print(json.dumps(sorted_grouped, indent=4))


In [None]:
# Export to Excel
df.to_excel('concept_matrix2.xlsx', engine='openpyxl')

In [None]:
# Extract columns related to Sustainability and Circular economy
sustainability_correlations = result.loc[:, ["Unnamed: 1", "Sustainability"]]
circular_economy_correlations = result.loc[:, ["Unnamed: 1", "Circular economy"]]


In [None]:


# Rename columns for easier comparison
sustainability_correlations.columns = ["Concept", "Sustainability"]
circular_economy_correlations.columns = ["Concept", "Circular economy"]

# Merge the two dataframes on the Concept column
merged_df = pd.merge(sustainability_correlations, circular_economy_correlations, on="Concept")

# Calculate the absolute differences in correlation values
merged_df["Difference"] = abs(merged_df["Sustainability"] - merged_df["Circular economy"])

# Sort by the absolute difference to find the most divergent concepts
divergent_concepts = merged_df.sort_values(by="Difference", ascending=False).head(10)

import ace_tools as tools; tools.display_dataframe_to_user(name="Divergent Concepts between Sustainability and Circular Economy", dataframe=divergent_concepts)

divergent_concepts
