In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import requests
import datetime
from tqdm import tqdm
import pandas as pd
import re
import xml.etree.cElementTree as ET
from transformers import AutoTokenizer, OPTForCausalLM, OPTModel

  from .autonotebook import tqdm as notebook_tqdm


# 从xml中获取文本 path：存放xml的文件夹 (./xml)  get_type：获取的内容（text abstract caption title）

In [2]:
def get_text_from_xml(path, get_type):
    _filepath = path
    _get_type=get_type
    files = os.listdir(_filepath)
    _temp_data = []
    _temp_abstract = []
    _temp_caption = []
    data = {}
    abstract = {}
    caption = {}
    title = {}

    for file in files:
        if os.path.isfile(os.path.join(_filepath, file)):
            root = os.path.join(_filepath, file)
            doc = ET.parse(root).getroot()
            for elem in doc.iter():
    #           Get title
                if 'coredata' in elem.tag:
                    for core in elem:
                        if 'title' in core.tag:
    #                         print(core.text)
                            _temp_title = core.text
                            _temp_title = re.sub("\n", '', _temp_title)
                            _temp_title = re.sub("\xa0", '', _temp_title)
                            _temp_title = ' '.join(_temp_title.split())
    #           Get abstract
                if 'abstract' in elem.tag:
                    for abs_child in elem:
    #                     print(abs_child.tag+' Here!')
                        if 'simple-para' in abs_child.tag:
    #                         print(abs_child.tag)
                            abs_text = ''.join(abs_child.itertext())
            #               Remove \n and \xa0 string literals
                            abs_text = re.sub("\n", '', abs_text)
                            abs_text = re.sub("\xa0", '', abs_text)
            #               Remove reference citations
            #               Can't use simple removal of [xxx] since crystallographic notation is in this form
    #                         pre_clean = re.sub("[\[].*?[\]]", "", pre_clean)
            #               Extract abstract and captions
            #               --------------------Clean sentences----------------
                            abs_text = ' '.join(abs_text.split())
    #                       to_save_abs = ''.join(abs_text.split())
    #                         print('Some text is:', abs_text)
                            _temp_abstract.append(abs_text)
    #           Remove empty elements
                _temp_abstract = list(filter(None, _temp_abstract))
    #           Get caption
                if 'caption' in elem.tag:
                    for cap_child in elem:
    #                     print(cap_child.tag)
                        if 'simple-para' in cap_child.tag:
                            cap_text = ''.join(cap_child.itertext())
            #               Remove \n and \xa0 string literals
                            cap_text = re.sub("\n", '', cap_text)
                            cap_text = re.sub("\xa0", '', cap_text)
            #               Remove reference citations
            #               Can't use simple removal of [xxx] since crystallographic notation is in this form
    #                         pre_clean = re.sub("[\[].*?[\]]", "", pre_clean)
            #               Extract abstract and captions
            #               --------------------Clean sentences----------------
                            cap_text = ' '.join(cap_text.split())
    #                       to_save_abs = ''.join(abs_text.split())
    #                         print('Some text is:', abs_text)
                            _temp_caption.append(cap_text)
                caption[file] = _temp_caption
    #           Get text
                if 'para' in elem.tag and 'simple-para' not in elem.tag:
    #               --------------------remove equations -----------------------
                    for _child in elem:
    #                     print(_child.tag)
                        if 'display' in _child.tag:
                            for __child in _child:
    #                             print(__child.tag)
                                if 'formula' in __child.tag:
                                    _child.remove(__child)
    #               --------------------remove equations ------------------------
    #               --------------------------Clean sentences--------------------
    #               Extract text from XML
                    pre_clean = ''.join(elem.itertext())
    #               Remove \n and \xa0 string literals
                    pre_clean = re.sub("\n", '', pre_clean)
                    pre_clean = re.sub("\xa0", '', pre_clean)
    #               Remove reference citations
    #               Can't use simple removal of [xxx] since crystallographic notation is in this form
                    pre_clean = re.sub("[\[].*?[\]]", "", pre_clean)
    #               Extract abstract and captions
    #               --------------------Clean sentences----------------
                    pre_clean = ' '.join(pre_clean.split())
                    # Collect text in _temp_data
                    _temp_data.append(pre_clean)
        data[file] = _temp_data
        abstract[file] = _temp_abstract
        caption[file] = _temp_caption
        #title[file] = _temp_title
        _temp_data = []
        _temp_abstract = []
        _temp_caption = []
    if _get_type == 'text':
        return data
    elif _get_type == 'abstract':
        return abstract
    elif _get_type == 'caption':
        return caption
    elif _get_type == 'title':
        return title

In [3]:
class Extractor():
    def __init__(self, model, examples, example_labels, description, task_description, example_prompt):
        self.model = model
        self.examples = examples
        self.example_labels = example_labels
        self.description = description
        self.task_description = task_description
        self.example_prompt = example_prompt

    def make_prompt(self, example):
        examples = self.examples + [example]
        labels = self.example_labels + [""]
        # return (self.task_description +
        #         "\n".join([self.description+examples[i] + "\n" +
        #                         self.example_prompt +
        #                          labels[i] for i in range(len(examples))]))
        # print(("\n".join([self.description + examples[i] + "\n" +
        #                   self.example_prompt +
        #                   labels[i] + self.task_description for i in range(len(examples))])))
        return (self.example_prompt + '\n'+ "\n---\n".join([self.description + examples[i] + "\n" +
                           self.task_description +
                           labels[i] for i in range(len(examples))]))

    def extract(self, example):
        prompt=self.make_prompt(example)
        #print(prompt)
        
        # model = OPTForCausalLM.from_pretrained("/home/user/data/opt-66b", device_map="auto", load_in_8bit = True)
        tokenizer = AutoTokenizer.from_pretrained("/home/user/data/opt-66b")

        #prompt = "I like dog. --dog-- \nI hate bird. --bird-- \nI love cat."
        inputs = tokenizer(prompt, return_tensors="pt")

        # Generate
        #generate_ids = self.model.generate(inputs.input_ids, temperature=1, repetition_penalty=0.1, length_penalty = -0.5, max_length=1500)
        generate_ids = self.model.generate(inputs.input_ids, temperature=0, max_length=1950)
        output=tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        # print(output)
        
        # extraction = openai.Completion.create(
        #     model='text-davinci-002',  # text-ada-001 text-babbage-001 text-curie-001
        #     prompt=self.make_prompt(example),
        #     temperature=0,
        #     max_tokens=1000,
        #     top_p=1,
        #     frequency_penalty=0.0,
        #     presence_penalty=0.0,
        #     stop=["\n"])
        return output

In [4]:
model = OPTForCausalLM.from_pretrained("/home/user/data/opt-66b", device_map="auto", load_in_8bit = True)
#model = OPTModel.from_pretrained("/home/user/data/opt-66b", device_map="auto", load_in_8bit = True)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


In [5]:
examples = [
    ('pure Al | electron mean free path | 18.9 nm and 22.7 nm', 'The estimated electron mean free path for pure Al is very close to the values reported in the literature, i.e., 18.9 nm and 22.7 nm.'),
    ('Magnesium | density | 1.74 g/cm3, Mg alloys | formability | poor, Mg alloys | corrosion | high susceptibility', 'To meet the ever-increasing demand for weight reduction and the associated energy savings, the development of high-performance lightweight materials is urgently needed . Magnesium (Mg) is the lightest structural metal; its density is about 1.74 g/cm3, which is two-thirds that of aluminum (Al) and one-fourth that of steel. In addition, it has high thermal conductivity, high damping capacity, and good biocompatibility, which have attracted considerable interest over the past two decades for potential use in diverse applications. However, owing to the limited number of independent slip systems originating from their hexagonal close-packed (HCP) crystal structure and the development of a strong basal texture during thermomechanical processing, Mg alloys usually exhibit poor formability at room temperature (RT). Moreover, they exhibit high susceptibility to corrosion, which further impedes widespread applications.'),
    ('Mg–0.03Cu–0.05Ca alloy | thermal conductivity | 157 W/(m·K), AZ31 alloy | thermal conductivity | 87 W/(m·K)', 'Benefiting from the addition of an extremely small amount of the alloying elements, the Mg–0.03Cu–0.05Ca alloy exhibited an excellent thermal conductivity of 157 W/(m·K), which is close to that of pure Mg, and approximately two times higher than that of the widely used AZ31 alloy (87 W/(m·K)), as summarized in Table4 . Fig.11 shows the formability and thermal conductivity of various Mg sheet alloys, including pure Mg, commercial AZ31, and experimental Mg–Zn-based alloys. As can be seen, the newly developed Mg–0.03Cu–0.05Ca alloy shows not only good formability but also excellent thermal conductivity compared to Mg and other Mg sheet alloys.'),
    ('Cu in Mg | maximum solid solubility | 300 PPM by weight at the eutectic temperature of 485°C, Ca in Mg | maximum solid solubility | approximately 1.35 wt.% at the eutectic temperature of 516°C and approximately zero at 200°C.', 'According to the Mg–Cu binary phase diagram, the maximum solid solubility of Cu in Mg is extremely small (300 PPM by weight at the eutectic temperature of 485°C). The maximum solid solubility of Ca in Mg is also rather small (approximately 1.35 wt.% at the eutectic temperature of 516°C and approximately zero at 200°C.'),
    ('pure Al and AlSi10Mg | thermal conductivity | 200 and 90 W/mK (at room temperature) in the as-built condition and 240 and 180 W/mK after heat treatment.', 'In the previous studies, pure Al and AlSi10Mg having thermal conductivities of 200 and 90 W/mK (at room temperature) in the as-built condition were manufactured. These values were enhanced to 240 and 180 W/mK after heat treatment.'),
    ('No information about material properties', 'Furthermore, adding a large amount of solute substantially deteriorates functional properties such as thermal conductivity and damping capacity. Alternately, adding extremely small amounts of alloying elements, which are effective in improving both the mechanical properties and corrosion resistance of pure Mg, could be an effective method for preserving the functional properties.'),
    ('No information about material properties', 'Furthermore, adding a large amount of solute substantially deteriorates functional properties such as thermal conductivity and damping capacity. Alternately, adding extremely small amounts of alloying elements, which are effective in improving both the mechanical properties and corrosion resistance of pure Mg, could be an effective method for preserving the functional properties.'),
    ('as-cast CuMgZn | thermal conductivity | 73.8 W/(m·K)', 'The as-cast microstructure of Mg-xZn-xCu(x = 1,3,5) ternary magnesium alloy consists of Mg and CuMgZn. The CuMgZn compound has a high thermal conductivity of 73.8 W/(m·K), which is comparable to Fe.'),
    ('LPBF-fabricated pure Al part | thermal conductivity | (∼231 W/mK) and nominal value (237 W/mK)', 'This is the main reason behind the difference observed between the thermal conductivity of the LPBF-fabricated pure Al part (∼231 W/mK) and its nominal value (237 W/mK ). Vacancies and dislocations are the next two important extrinsic scattering sites with contributions of 3.2% and 1%, respectively.'),
    ('Mg–2Sn-2.3La alloy | thermal conductivity | 149W/(m·K) and 152W/(m·K) after the solution treatment ', 'La addition could improve the thermal conductivity of Mg–2Sn alloy. The maximum improvement was obtained when the La content was 2.3wt.% with the Sn/La atomic ratio of 1:1. The thermal conductivity was improved by 33% from 112W/(m·K) to 149W/(m·K). The solution treatment further enhanced the thermal conductivity of Mg–2Sn-2.3La alloy to 152W/(m·K).'),
]

In [6]:
# model, text, extraction, 'text:', 'extraction', 'The task is to extract xxx'
TempExtractor = Extractor(model, [e[1] for e in examples], [e[0] for e in examples],
                                  "text:",
                                  "",
                                  "The task is to extract materials and conductivity from the text.")

In [7]:
paper_text = get_text_from_xml(r"./xml", 'text')
input_text = []
doi_Num=[]
for key in paper_text.keys():
    for text in paper_text[key]:
        if len(text.split()) > 10:
            input_text.append(text)
            doi_Num.append(key)


In [8]:
done_list=[]

with open('done_list.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        done_list.append(line.strip())

done_list=[]

input_text=[]
with open('text_from_paper.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        input_text.append(line.strip())
        
with open('done_list.txt', 'a', encoding='utf-8') as f_done:
    with open('results_total.txt', 'a', encoding='utf-8') as f:
            results = []
            for i in tqdm(range(18, 30, 1)):
                if doi_Num[i] not in done_list:
                    f_done.writelines(doi_Num[i]+'\n')
                    done_list.append(doi_Num[i])
                if doi_Num[i] in done_list and doi_Num[i] != done_list[-1]:
                    continue
                else:
                    try:
                        if 'conductivity' in input_text[i] or 'conductivities' in input_text[i]:
                            # print(input_text[i])
                            generation_text = TempExtractor.extract(input_text[i])
                            f.writelines(doi_Num[i] + '\t' + input_text[i] + '\t' + generation_text + '\n')
                            print(generation_text.split('---')[10])
                            #print(generation_text.split('---')[32])
                            #results.append(generation_text)
                        else:
                            print('Not related!')
                    except Exception as e:
                        print('ERROR: ', e)
            # print(results)

  8%|▊         | 1/12 [15:21<2:48:55, 921.43s/it]


text:The aforementioned studies share these two findings that the LPBF fabricated Al/Al alloys show (i) negligible anisotropy of thermal conductivity and (ii) lower thermal conductivity than that of their wrought counterparts in the as-built condition. Although the former eliminates the need for any post-process heat treatments, the latter makes its implementation inevitable. This study proposes an easily attainable solution to obtain high thermal conductivities for the LPBF fabricated pure Al, AlSi12 (Al-Si binary alloy), and AlSi10Mg (Al-Si-Mg ternary alloy) in the as-built condition and eliminate/alleviate the need for time-consuming and cost-prohibitive post-build heat treatments. The microstructural features of the LPBF fabricated pure Al, AlSi12, and AlSi10Mg parts under the proposed processing condition were examined at nano- to micron-scale and compared to their wrought/cast counterparts as well as the LPBF fabricated parts in the literature. Different factors governing the th

 17%|█▋        | 2/12 [26:01<2:05:57, 755.80s/it]


text:Using the above information, the thermal conductivity ( K ) was calculated at different temperatures as follows :
We can learn from the text:calculated thermal conductivity ( K ) at different temperatures as follows :



 25%|██▌       | 3/12 [34:57<1:38:18, 655.39s/it]


text:Based on Eq. (1), the density of the printed samples is required not only at ambient ( ρ RT ) but also at higher temperatures ( ρ T ) to calculate the thermal conductivity of Al/Al alloys within the desired temperature range of 25–425 °C. To accurately predict the density of samples at higher temperatures, calorimetry test was utilized. The relative change in the physical dimension ( ∆ L / L 0 , L 0 is the initial sample length) versus temperature in the heating cycle is shown in Fig. 3(a-c). The following equation, along with the results shown in Fig. 3(a-c), were used to calculate the density at any desirable temperature:
We can learn from the text:Fig. 3(a-c) | relative change in the physical dimension ( ∆ L / L 0 , L 0 is the initial sample length) versus temperature in the heating cycle



 33%|███▎      | 4/12 [41:54<1:14:52, 561.51s/it]


text:By using the C P and thermal diffusivity results (Fig. 1 and Fig. 4) as well as the calculated ρ ( T ) values, the thermal conductivity ( K ( T ) ) was calculated using Eq. (1). As shown in Fig. 5, the thermal conductivity of pure Al slightly decreases from 230 W/mK at room temperature to 200 W/mK at 425 °C. The thermal conductivities of the AlSi10Mg and AlSi12 samples are almost constant up to 150 and 175 °C, respectively. Then, they dip into their lowest values at 225 and 250 °C. The thermal conductivity of AlSi12 gradually recovers after 250 °C and shows an overall ascending trend thereafter. For AlSi10Mg, there is a local minimum at 300 °C after the gradual recovery from the lowest thermal conductivity value. A gradual increase in thermal conductivity of AlSi10Mg was observed after 300 °C, which finally levels off above 350 °C. Referring to Fig. 1, it can be understood that the C P measurement results are reflected in the thermal conductivity trend. The thermal conductivity o

 42%|████▏     | 5/12 [47:32<56:04, 480.65s/it]  


text:In the previous studies, pure Al and AlSi10Mg having thermal conductivities of 200 and 90 W/mK (at room temperature) in the as-built condition were manufactured. These values were enhanced to 240 and 180 W/mK after heat treatment. However, in this study, the thermal conductivity of the as-built pure Al was almost equal to that of the heat-treated case (∼230 W/mK at room temperature). Moreover, the thermal conductivity of the as-built AlSi10Mg in preheated conditions was ∼80% higher than that of its non-preheated counterpart. For AlSi12, the preheated sample (this study) showed ∼25% higher thermal conductivity than its non-preheated counterpart (115 W/mK) in the as-built state. Therefore, this study has borne out the possibility of 200 °C of preheating to enhance the thermal conductivity of the as-built samples. It must be highlighted that the whole difference observed between the thermal conductivity of the LPBF-fabricated samples in preheated (this study) and non-preheated (lite

 50%|█████     | 6/12 [56:58<50:59, 509.95s/it]Input length of input_ids is 2002, but `max_length` is set to 1950. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.



text:In the case of pure Al, preheating could eliminate the need for post-build heat treatment. For AlSi10Mg and AlSi12, the thermal conductivities of the preheated samples in the as-built condition are equal and greater than those reported for die-cast AlSi10Mg (∼160 W/mK) and AlSi12 (120 W/mK) , respectively. Nevertheless, the post-build heat treatment cannot be ignored for Al alloys. In other words, it is still possible to further enhance the thermal conductivity of AlSi12 and AlSi10Mg alloys printed in this study by ∼20 and ∼12% by employing a suitable post-build heat treatment cycle to reach the nominal values of 170 and 185 W/mK reported for their wrought and heat-treated LPBF-fabricated parts, respectively.
We can learn from the text:preheating | thermal conductivity | equal and greater than those reported for die-cast AlSi10Mg (∼160 W/mK) and AlSi12 (120 W/mK) , respectively, and post-build heat treatment | thermal conductivity | 170 and 185 W/mK reported for their wrought and

 58%|█████▊    | 7/12 [57:03<28:43, 344.67s/it]


text:Heat can be transferred either by electrons or lattice vibrations (phonons as heat carriers). In metallic materials, the main mechanism governing the heat transfer is the movement of valance electrons with high energy from the warmer regions to the cooler ones . During their journey, electrons face obstacles known as scattering sites. The scattering phenomena can be divided into two categories of intrinsic and extrinsic events . While intrinsic scattering events such as thermally-induced vibration of atoms (phonon scattering) are mainly material-dependent , the extrinsic ones are highly sensitive to the processing condition. Referring to the microstructural features shown in Fig. 8, Fig. 9, and Fig. 10, along with the XCT results (Fig. 2), the extrinsic scattering sites in the LPBF-fabricated Pure Al, AlSi12, and AlSi10Mg alloys are (i) Si element in the supersaturated α-Al matrix in Al alloys, (ii) non-equilibrium vacancies and dislocations, (iii) nano-size Si and Mg2Si precipit

 67%|██████▋   | 8/12 [1:04:24<25:01, 375.50s/it]Input length of input_ids is 2002, but `max_length` is set to 1950. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.



text:Some important inferences can be drawn from the mean free path calculation results in Table 1. First, the l e of the LPBF-fabricated pure Al (16.2 nm) is considerably greater than those of AlSi10Mg (11.1 nm) and AlSi12 (9.7 nm). The l e values are consistent with the results shown in Fig. 4 and Fig. 5, in which materials with smaller l e values showed lower thermal diffusivity and conductivity. The absence of foreign atoms in pure Al is the main reason behind its higher l e and thermal diffusivity/conductivity compared to the Al alloys. Second, based on the theoretical calculations for pure Al, in which the scattering events are only dictated by intrinsic phenomena, the l e value is in the nanometer length scale. On this account, it can be inferred that extrinsic scattering phenomena at such length scales or smaller could have a major influence on the obtained overall mean free path in the LPBF-fabricated parts. Finally, the theoretical l e value for pure Al was ∼15% lower than t

 75%|███████▌  | 9/12 [1:04:26<12:56, 258.75s/it]


text:With the same approach as that of vacancies, the contribution of dislocations to the electron scattering ( l dis ) was calculated using the following equation : where n d is the dislocation density or total length of dislocations per unit volume and σ d is the scattering cross-section of the electrons per unit length of dislocations. Referring to GND density measurement results provided in section ‎3.2.3, the magnitude of n d for the LPBF-fabricated pure Al, AlSi12 and AlSi10Mg was found to be 4.6 × 10 13 , 1.3 × 10 14 , and 6.5 × 1013 m−2, respectively. Fiks used the following formula to estimate σ d : in which ∆ ρ d * signifies the resistivity per unit dislocation density . The rest of the parameters have the same definition as before and are tabulated in Table 1. Raeisinia et al. experimentally measured the magnitude of ∆ ρ d * for Al at room temperature, the result of which is shown in Table 2. By knowing all the parameters in above equation, σ d and consequently the l dis wa

 83%|████████▎ | 10/12 [1:11:52<10:32, 316.36s/it]


text:The contribution of each scattering phenomenon (intrinsic and extrinsic) to the overall mean free path of conducting electrons is obtained from the data listed in Table 2 (contribution = 1 / l i 1 / l e LPBF ) and is shown in Fig. 13. Valuable information can be obtained from the results listed in Table 2 and provided in Fig. 13. Before anything else, it should be noted that pores and grains are the weakest extrinsic scattering sites in all materials because their length scale is in the order of µm, while the intrinsic scattering phenomena are happening on the nanometer length scale (their contribution is not shown in Fig. 13). For pure Al, the rest of the extrinsic scattering sites are more or less affecting the thermal conductivity, and their contribution is not insignificant. With that being noted, the nano-size Al2O3 observed in TEM images (Fig. 8) with the lowest mean chord length in the matrix is the most important contributing factor (6.9%). This is the main reason behind 

 92%|█████████▏| 11/12 [1:16:54<05:11, 311.94s/it]


text:For Al alloys, the supersaturation of Si in the α-Al matrix is by far the most important extrinsic scattering site with a contribution of 43.3% (AlSi12) and 34% (AlSi10Mg), comparable to that of the intrinsic scattering by phonons. This is the main reason behind the difference observed between the obtained thermal conductivity for AlSi12 (∼140 W/mK) and AlSi10Mg (∼160 W/mK) alloy and their nominal values (170 and 185 W/mK for the respective materials). Among the other factors playing a minor role in the lower thermal conductivity of the AlSi10Mg part, the nano-size Si precipitates within the cells have the highest impact (7.1%). Yet the l Si is one order of magnitude greater than l SS Si and cannot compete with the solid solution Si atoms in terms of scattering strength. In other words, given that there is a mixture between Al and Si elements in an atomic scale in the supersaturated α-Al matrix, these Si atoms are the most detrimental extrinsic scattering sites. This agrees with 

100%|██████████| 12/12 [1:22:09<00:00, 410.82s/it]


text:Results shown in Table 2 and Fig. 13 also provide valuable insights into the main reasons behind the higher thermal conductivity of as-built Al/Al alloys in this study (preheated) than the non-preheated counterparts reported in the literature. Previously, Kimura et al. reported thermal conductivity of ∼200 W/mK for the as-built pure Al (non-preheated) with an oxygen content of 1240 ppm. According to the oxygen content measurements provided in Fig. 11, the optimum pure Al part in this study has 325 ppm oxygen. Therefore, the considerably lower oxygen content of the optimized pure Al part in this study, along with the lower density of dislocations obtained by preheating, justifies the higher thermal conductivity (31 W/mK). While acknowledging that the contribution of dislocations in the non-preheated condition might be higher, it is still logical to conclude that the reported improvement in thermal conductivity of the post-build heat-treated pure Al samples (from 200 to 240 W/mK) i


