# Scientific Text Classification

@author: Tyler Bikaun

1. Use labelled datasets to predict unlabelled datasets
2. Need to concatenate labels together where there are multiple fields of study

In [0]:
import pandas as pd
import re
import numpy as np
import ast

## Mounting drive to access model data

In [2]:
# Mount google drive to notebook to access data.
# This will prompt you to enter an authentication code by permitting access to the drive.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
class Data():
    
    def __init__(self, fileName):
        
        self.dataOriginal = pd.read_csv(fileName)
        self.data = self.dataOriginal.copy()
        self.lenOriginalData = len(self.data)
        
        # exe
        self.formatData()
        self.formatLabelData()
        self.dropNanText()
        self.removeDataWithoutLabels()
    
    def formatData(self):
        # subset data for title, abstract, field of study
        self.data = self.data[['Unnamed: 0', '0', '2']]
        # rename columns
        self.data.rename(columns={'Unnamed: 0': 'title', '0': 'abstract', '2':'label'}, inplace=True)
        # Replace NaN
        self.data['abstract'] = self.data['abstract'].replace(np.nan, '', regex=True)
        # concatenate title and abstract
        self.data['text'] = self.data['title'] + ' ' + self.data['abstract']
        # Drop unnecessary columns
        self.data.drop(labels=['title', 'abstract'], axis=1, inplace=True)
        # change order of columns
        self.data = self.data[['label', 'text']]
        # Concatenate labels
        self.data['label'] = self.data['label'].apply(lambda x: self.labelConcat(x))
    
    def labelConcat(self, labelText):
        # Political Science should be made into Political-Science?
        try:
            return ''.join(ast.literal_eval(labelText))
        except:
            return ''
    
    def formatLabelData(self):
        """
        Format labels with FastText style
        """
        
        # Add '__label__' to label text data for Flair model
        self.data['label'] = '__label__' + self.data['label']
        
    def dropNanText(self):
        # drop any text that is NaN
        # Note: Might need to extend this to be just applicable to the abtract.
        self.data.dropna()

    def dataSummary(self):
        print(f'Rows dropped: {self.lenOriginalData - len(self.data)}')
        
    def removeDataWithoutLabels(self):
        self.dataNoLabel = self.data[self.data['label'] == '__label__']
        # print(self.data['text'])
        self.data = self.data[self.data['label'] != '__label__']
        

In [0]:
tcDataCS = Data(r"/content/drive/My Drive/Flair Text Classification/data/papers_of_interest.csv")

In [5]:
tcDataCS.dataSummary()

Rows dropped: 0


In [6]:
tcDataCS.dataOriginal

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6
0,Texture synthesis for digital painting,The problem of digital painting is considered ...,1984,['Computer Science'],,https://semanticscholar.org/paper/1e9c964d606e...,63,10
1,Texture analysis and artificial neural network...,Clustered microcalcifications on X-ray mammogr...,1997,['Computer Science'],Neural Networks for Signal Processing VII. Pro...,https://semanticscholar.org/paper/b283fccfdeef...,3,9
2,Automatic generation of benchmark and test wor...,"In this tutorial, we describe techniques for a...",2010,['Computer Science'],,https://semanticscholar.org/paper/b56c5cac50dd...,6,8
3,Text Encoding Initiative,,2012,['Computer Science'],,https://semanticscholar.org/paper/4c99b00b844a...,1,1
4,A Method of Text Classification Combining Naiv...,Text classification is one of the main issues ...,2015,['Computer Science'],,https://semanticscholar.org/paper/195e933e65ed...,0,0
...,...,...,...,...,...,...,...,...
1700,Text-Based Information Retrieval Using Relevan...,"Europeana, a freely accessible digital library...",2011,['Computer Science'],,https://semanticscholar.org/paper/c463732a1dd9...,0,4
1701,Natural language generation as neural sequence...,Natural Language Generation (NLG) is the task ...,2017,['Computer Science'],,https://semanticscholar.org/paper/2c0314695d78...,0,212
1702,Little languages: little maintenance?,"textabstractSo-called little, or domain-specif...",1997,['Computer Science'],Journal of Software Maintenance,https://semanticscholar.org/paper/c0d4ef4ab0c8...,5,23
1703,Text Categorization Improvement via User Inter...,"In this paper, we propose an approach to impro...",2018,['Computer Science'],,https://semanticscholar.org/paper/8210bd53bde4...,0,31


In [7]:
tcDataCS.data

Unnamed: 0,label,text
0,__label__Computer Science,Texture synthesis for digital painting The pro...
1,__label__Computer Science,Texture analysis and artificial neural network...
2,__label__Computer Science,Automatic generation of benchmark and test wor...
3,__label__Computer Science,Text Encoding Initiative
4,__label__Computer Science,A Method of Text Classification Combining Naiv...
...,...,...
1700,__label__Computer Science,Text-Based Information Retrieval Using Relevan...
1701,__label__Computer Science,Natural language generation as neural sequence...
1702,__label__Computer Science,Little languages: little maintenance? textabst...
1703,__label__Computer Science,Text Categorization Improvement via User Inter...


In [8]:
tcDataCS.data['text'][0]

'Texture synthesis for digital painting The problem of digital painting is considered from a signal processing viewpoint, and is reconsidered as a problem of directed texture synthesis. It is an important characteristic of natural texture that detail may be evident at many scales, and the detail at each scale may have distinct characteristics. A “sparse convolution” procedure for generating random textures with arbitrary spectral content is described. The capability of specifying the texture spectrum (and thus the amount of detail at each scale) is an improvement over stochastic texture synthesis processes which are scalebound or which have a prescribed 1/f spectrum. This spectral texture synthesis procedure provides the basis for a digital paint system which rivals the textural sophistication of traditional artistic media. Applications in terrain synthesis and texturing computer-rendered objects are also shown.'

In [9]:
tcDataCS.data.head()

Unnamed: 0,label,text
0,__label__Computer Science,Texture synthesis for digital painting The pro...
1,__label__Computer Science,Texture analysis and artificial neural network...
2,__label__Computer Science,Automatic generation of benchmark and test wor...
3,__label__Computer Science,Text Encoding Initiative
4,__label__Computer Science,A Method of Text Classification Combining Naiv...


### Other datasets

In [0]:
tcDataOther = Data(r"/content/drive/My Drive/Flair Text Classification/data/papers_NOT_of_interest.csv")

In [11]:
# Must be due to the NaN values for abstracts.
tcDataOther.dataSummary()

Rows dropped: 443


In [12]:
tcDataOther.dataOriginal

Unnamed: 0.1,Unnamed: 0,0,1,2
0,Use and Safety of Respiratory Medicines in Chi...,textabstractThe lack of appropriately authoris...,2011,['Medicine']
1,Language in Australia: Transplanted languages ...,,1991,['Sociology']
2,Automatic Generation of A High-level Contact S...,"Information of high-level, topological contact...",2007,['Engineering']
3,Textured Thin Films of Transition Metal Dichal...,,1994,['Materials Science']
4,Text Mining with Support Vector Machines and N...,The objective of this thesis is to develop eff...,2007,[]
...,...,...,...,...
2960,Quangos in Dutch government,textabstractThis chapter deals with the establ...,2005,['Political Science']
2961,Texture and Preferred Orientation,,2006,['Materials Science']
2962,Textiles and Earthquake Disaster(10)Aiming for...,,2014,['Engineering']
2963,Textural development of monazite during high-g...,Abstract Monazite has become an important tool...,1999,['Chemistry']


In [0]:
# print(tcDataOther.data.to_string())

In [14]:
tcDataOther.lenOriginalData

2965

In [15]:
len(tcDataOther.data)

2522

In [16]:
len(tcDataOther.dataNoLabel)

443

In [17]:
tcDataOther.dataNoLabel

Unnamed: 0,label,text
4,__label__,Text Mining with Support Vector Machines and N...
15,__label__,Textschicksale: Das Werk Arthur Schnitzlers im...
21,__label__,Texture evolution and Swift effect in NiAl NiA...
22,__label__,Text-Kultur Kommunikation : Translation als Fo...
30,__label__,Textual Features from Multimedia Conten Identi...
...,...,...
2933,__label__,Text Analysis on Punishment Clauses in Korean ...
2938,__label__,Textil och dockor
2942,__label__,Textual Understanding and Historical Experienc...
2945,__label__,Text Categorisation Using Do ument Pro ling


### Create entire dataset

In [0]:
trainingData = tcDataCS.data.append(tcDataOther.data)

In [19]:
trainingData

Unnamed: 0,label,text
0,__label__Computer Science,Texture synthesis for digital painting The pro...
1,__label__Computer Science,Texture analysis and artificial neural network...
2,__label__Computer Science,Automatic generation of benchmark and test wor...
3,__label__Computer Science,Text Encoding Initiative
4,__label__Computer Science,A Method of Text Classification Combining Naiv...
...,...,...
2960,__label__Political Science,Quangos in Dutch government textabstractThis c...
2961,__label__Materials Science,Texture and Preferred Orientation
2962,__label__Engineering,Textiles and Earthquake Disaster(10)Aiming for...
2963,__label__Chemistry,Textural development of monazite during high-g...


#### Test, Dev, Train Split

In [20]:
# attr: https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
train, dev, test = np.split(trainingData.sample(frac=1), [int(.6*len(trainingData)), int(.8*len(trainingData))])
print(f'{len(trainingData)} | {len(train)} | {len(dev)} | {len(test)}')

4227 | 2536 | 845 | 846


In [0]:
train.to_csv(r'/content/drive/My Drive/Flair Text Classification/data/train.csv', sep='\t', index = False, header = False)
test.to_csv(r'/content/drive/My Drive/Flair Text Classification/data/test.csv', sep='\t', index = False, header = False)
dev.to_csv(r'/content/drive/My Drive/Flair Text Classification/data/dev.csv', sep='\t', index = False, header = False)

Review data

In [0]:
dev = pd.read_csv('/content/drive/My Drive/Flair Text Classification/data/dev.csv', sep='\t')

In [23]:
dev.head()

Unnamed: 0,__label__Materials Science,"Texture Formation of Very Thin TiN, TiCN and TiC Films on Single Crystal of Silicon Steel at Incipient Stage during Plasma Coating"
0,__label__Medicine,Textbook of Ophthalmology.
1,__label__Materials Science,"Texture, microhardness and corrosion resistanc..."
2,__label__Medicine,TEXTBOOK OF GERIATRIC MEDICINE AND GERONTOLOGY...
3,__label__Computer Science,Semantics Based Identifier Mining for UML Mode...
4,__label__Medicine,Text message alerts to emergency physicians id...


## Flair Text Classification

https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f

In [26]:
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/03/29/81e3c9a829ec50857c23d82560941625f6b42ce76ee7c56ea9529e959d18/flair-0.4.5-py3-none-any.whl (136kB)
[K     |██▍                             | 10kB 17.7MB/s eta 0:00:01[K     |████▉                           | 20kB 844kB/s eta 0:00:01[K     |███████▏                        | 30kB 1.3MB/s eta 0:00:01[K     |█████████▋                      | 40kB 1.7MB/s eta 0:00:01[K     |████████████                    | 51kB 1.0MB/s eta 0:00:01[K     |██████████████▍                 | 61kB 1.2MB/s eta 0:00:01[K     |████████████████▉               | 71kB 1.4MB/s eta 0:00:01[K     |███████████████████▏            | 81kB 1.6MB/s eta 0:00:01[K     |█████████████████████▋          | 92kB 1.2MB/s eta 0:00:01[K     |████████████████████████        | 102kB 1.4MB/s eta 0:00:01[K     |██████████████████████████▍     | 112kB 1.4MB/s eta 0:00:01[K     |████████████████████████████▊   | 122kB 1.4MB/s eta 0:00:0

### Create corpus

In [0]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

In [28]:
corpus = NLPTaskDataFetcher.load_classification_corpus(Path('/content/drive/My Drive/Flair Text Classification/data/'), test_file='test.csv', dev_file='dev.csv', train_file='train.csv')

2020-04-08 08:07:49,010 Reading data from /content/drive/My Drive/Flair Text Classification/data
2020-04-08 08:07:49,011 Train: /content/drive/My Drive/Flair Text Classification/data/train.csv
2020-04-08 08:07:49,012 Dev: /content/drive/My Drive/Flair Text Classification/data/dev.csv
2020-04-08 08:07:49,013 Test: /content/drive/My Drive/Flair Text Classification/data/test.csv


  """Entry point for launching an IPython kernel.


In [29]:
print(len(corpus.train))

2536


In [30]:
corpus.make_label_dictionary()

2020-04-08 08:08:00,295 Computing label dictionary. Progress:


100%|██████████| 2536/2536 [00:00<00:00, 209810.34it/s]

2020-04-08 08:08:00,336 [b'Computer', b'Materials', b'Engineering', b'Political', b'Medicine', b'Geology', b'MathematicsEngineering', b'GeographyComputer', b'History', b'Biology', b'Psychology', b'Art', b'Sociology', b'Physics', b'Environmental', b'PsychologyComputer', b'Geography', b'Chemistry', b'MedicineBiology', b'Mathematics', b'BiologyMedicine', b'Philosophy', b'Economics', b'ChemistryComputer', b'MedicineChemistry', b'PsychologyMedicine', b'MathematicsEconomics', b'MedicineComputer', b'Business', b'MathematicsComputer', b'GeographyEngineering', b'PhysicsComputer', b'SociologyHistory', b'SociologyMathematics', b'EconomicsMathematics', b'BusinessComputer', b'MedicineMaterials', b'MathematicsBiologyComputer', b'ChemistryBiology', b'EngineeringComputer', b'BiologyChemistry', b'MedicinePhysics', b'MathematicsPhysics', b'MedicineEconomics', b'GeographyMaterials', b'BusinessMathematics', b'EngineeringGeography', b'PsychologyHistory', b'PhysicsMedicine', b'PsychologyMedicineComputer', b




<flair.data.Dictionary at 0x7f5e1d41ee48>

In [31]:
word_embeddings = [WordEmbeddings('glove'), FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]

2020-04-08 08:08:01,425 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmplw2m4dft


100%|██████████| 160000128/160000128 [00:17<00:00, 9053009.17B/s] 

2020-04-08 08:08:20,262 copying /tmp/tmplw2m4dft to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2020-04-08 08:08:20,498 removing temp file /tmp/tmplw2m4dft
2020-04-08 08:08:21,601 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmpz4wcan4c


100%|██████████| 21494764/21494764 [00:03<00:00, 5684948.87B/s]

2020-04-08 08:08:26,532 copying /tmp/tmpz4wcan4c to cache at /root/.flair/embeddings/glove.gensim
2020-04-08 08:08:26,562 removing temp file /tmp/tmpz4wcan4c



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2020-04-08 08:08:29,242 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmpu8tt_quu


100%|██████████| 19689779/19689779 [00:03<00:00, 5656300.61B/s]

2020-04-08 08:08:33,896 copying /tmp/tmpu8tt_quu to cache at /root/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2020-04-08 08:08:33,925 removing temp file /tmp/tmpu8tt_quu





2020-04-08 08:08:35,521 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmp_te1cosy


100%|██████████| 19689779/19689779 [00:03<00:00, 5400190.14B/s]

2020-04-08 08:08:40,319 copying /tmp/tmp_te1cosy to cache at /root/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt
2020-04-08 08:08:40,348 removing temp file /tmp/tmp_te1cosy





In [32]:
document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)

  """Entry point for launching an IPython kernel.


In [33]:
# Single label for the current dataset, maybe multi-label in the future?
classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)

2020-04-08 08:08:40,731 Computing label dictionary. Progress:


100%|██████████| 2536/2536 [00:00<00:00, 194307.02it/s]

2020-04-08 08:08:40,748 [b'Computer', b'Materials', b'Engineering', b'Political', b'Medicine', b'Geology', b'MathematicsEngineering', b'GeographyComputer', b'History', b'Biology', b'Psychology', b'Art', b'Sociology', b'Physics', b'Environmental', b'PsychologyComputer', b'Geography', b'Chemistry', b'MedicineBiology', b'Mathematics', b'BiologyMedicine', b'Philosophy', b'Economics', b'ChemistryComputer', b'MedicineChemistry', b'PsychologyMedicine', b'MathematicsEconomics', b'MedicineComputer', b'Business', b'MathematicsComputer', b'GeographyEngineering', b'PhysicsComputer', b'SociologyHistory', b'SociologyMathematics', b'EconomicsMathematics', b'BusinessComputer', b'MedicineMaterials', b'MathematicsBiologyComputer', b'ChemistryBiology', b'EngineeringComputer', b'BiologyChemistry', b'MedicinePhysics', b'MathematicsPhysics', b'MedicineEconomics', b'GeographyMaterials', b'BusinessMathematics', b'EngineeringGeography', b'PsychologyHistory', b'PhysicsMedicine', b'PsychologyMedicineComputer', b




In [34]:
print(corpus.obtain_statistics())

{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 2536,
        "number_of_documents_per_class": {
            "Computer": 972,
            "Materials": 182,
            "Engineering": 140,
            "Political": 51,
            "Medicine": 204,
            "Geology": 46,
            "MathematicsEngineering": 2,
            "GeographyComputer": 2,
            "History": 64,
            "Biology": 59,
            "Psychology": 155,
            "Art": 125,
            "Sociology": 118,
            "Physics": 31,
            "Environmental": 17,
            "PsychologyComputer": 4,
            "Geography": 32,
            "Chemistry": 85,
            "MedicineBiology": 10,
            "Mathematics": 42,
            "BiologyMedicine": 5,
            "Philosophy": 54,
            "Economics": 21,
            "ChemistryComputer": 1,
            "MedicineChemistry": 7,
            "PsychologyMedicine": 16,
            "MathematicsEconomics": 1,
            "M

In [0]:
trainer = ModelTrainer(classifier, corpus)

In [0]:
trainer.train('/content/drive/My Drive/Flair Text Classification/model/', max_epochs=10)

2020-04-08 08:08:41,397 ----------------------------------------------------------------------------------------------------
2020-04-08 08:08:41,399 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512)
 

### Making predictions

In [0]:
from flair.models import TextClassifier
from flair.data import Sentence

In [0]:
classifer = TextClassifier.load('/content/drive/My Drive/Flair Text Classification/model/best-model.pt')

Predictions are done on sentences that have no label

In [0]:
tcDataOther.dataNoLabel.tail(10)

In [0]:
sentenceWithoutLabel = tcDataOther.dataNoLabel['text'].iloc[-3]

In [0]:
# sentenceWithoutLabel = 'Textile Based Organic Light Emitting Diodes for Wearable Displays'

In [0]:
# sentenceWithoutLabel = tcDataCS.data['text'].iloc[4]

In [0]:
print(f'Predicting label for {sentenceWithoutLabel}')

In [0]:
sentence = Sentence(sentenceWithoutLabel)

In [0]:
classifier.predict(sentence)

In [0]:
print(sentence.labels)

In [0]:
# help(sentence)

In [0]:
tcDataOther.dataNoLabel['pred_label'] = ''

In [0]:
from tqdm import tqdm

In [0]:
# iterate over data without labels, make and save predictions.
for index, row in tqdm(tcDataOther.dataNoLabel.iterrows()):
  sentence = Sentence(row['text'])
  classifier.predict(sentence)
  tcDataOther.dataNoLabel.at[index, 'pred_label'] = sentence.labels

In [0]:
# extracting label class and probability from predicted label
tcDataOther.dataNoLabel['pred_label_class'] = ''
tcDataOther.dataNoLabel['pred_label_proba'] = ''
for index, row in tcDataOther.dataNoLabel.iterrows():
  tcDataOther.dataNoLabel.at[index, 'pred_label_class'] = str(tcDataOther.dataNoLabel['pred_label'].loc[index][0]).split(' ')[0]
  tcDataOther.dataNoLabel.at[index, 'pred_label_proba'] = re.sub('\(|\)', '', str(tcDataOther.dataNoLabel['pred_label'].loc[index][0]).split(' ')[1])

In [0]:
tcDataOther.dataNoLabel.head(10)

In [0]:
tcDataOther.dataNoLabel.to_csv('/content/drive/My Drive/Flair Text Classification/data/tcDataOtherLabelled.csv')