# Scientific Text Classification

@author: Tyler Bikaun

1. Use labelled datasets to predict unlabelled datasets
2. Need to concatenate labels together where there are multiple fields of study

In [1]:
import pandas as pd
import re
import numpy as np
import ast

## Mounting drive to access model data

In [2]:
class Data():
    
    def __init__(self, fileName):
        
        self.dataOriginal = pd.read_csv(fileName)
        self.data = self.dataOriginal.copy()
        self.lenOriginalData = len(self.data)
        
        # exe
        self.formatData()
        self.formatLabelData()
        self.dropNanText()
        self.removeDataWithoutLabels()
    
    def formatData(self):
        # subset data for title, abstract, field of study
        self.data = self.data[['Unnamed: 0', '0', '2']]
        # rename columns
        self.data.rename(columns={'Unnamed: 0': 'title', '0': 'abstract', '2':'label'}, inplace=True)
        # Replace NaN
        self.data['abstract'] = self.data['abstract'].replace(np.nan, '', regex=True)
        # concatenate title and abstract
        self.data['text'] = self.data['title'] + ' ' + self.data['abstract']
        # Drop unnecessary columns
        self.data.drop(labels=['title', 'abstract'], axis=1, inplace=True)
        # change order of columns
        self.data = self.data[['label', 'text']]
        # Concatenate labels
        self.data['label'] = self.data['label'].apply(lambda x: self.labelConcat(x))
    
    def labelConcat(self, labelText):
        # Political Science should be made into Political-Science?
        try:
            return ''.join(ast.literal_eval(labelText))
        except:
            return ''
    
    def formatLabelData(self):
        """
        Format labels with FastText style
        """
        
        # Add '__label__' to label text data for Flair model
        self.data['label'] = '__label__' + self.data['label']
        
    def dropNanText(self):
        # drop any text that is NaN
        # Note: Might need to extend this to be just applicable to the abtract.
        self.data.dropna()

    def dataSummary(self):
        print(f'Rows dropped: {self.lenOriginalData - len(self.data)}')
        
    def removeDataWithoutLabels(self):
        self.dataNoLabel = self.data[self.data['label'] == '__label__']
        # print(self.data['text'])
        self.data = self.data[self.data['label'] != '__label__']
        

In [3]:
tcDataCS = Data(r"C:/Users/Tyler/Desktop/UWA Repos/literaturesieve/project/notebooks/data/papers_of_interest.csv")

In [4]:
tcDataCS.dataSummary()

Rows dropped: 0


In [5]:
tcDataCS.dataOriginal

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6
0,Texture synthesis for digital painting,The problem of digital painting is considered ...,1984,['Computer Science'],,https://semanticscholar.org/paper/1e9c964d606e...,63,10
1,Texture analysis and artificial neural network...,Clustered microcalcifications on X-ray mammogr...,1997,['Computer Science'],Neural Networks for Signal Processing VII. Pro...,https://semanticscholar.org/paper/b283fccfdeef...,3,9
2,Automatic generation of benchmark and test wor...,"In this tutorial, we describe techniques for a...",2010,['Computer Science'],,https://semanticscholar.org/paper/b56c5cac50dd...,6,8
3,Text Encoding Initiative,This paper traces the history of the Text Enco...,1995,['Computer Science'],,https://semanticscholar.org/paper/ee77bdaf4841...,16,10
4,A Method of Text Classification Combining Naiv...,Text classification is one of the main issues ...,2015,['Computer Science'],,https://semanticscholar.org/paper/195e933e65ed...,0,0
5,Automatic generation of application-specific a...,We present a design flow for the generation of...,2001,['Computer Science'],Proceedings of the 38th Design Automation Conf...,https://semanticscholar.org/paper/c908eb1f0bb6...,87,6
6,Automatic Generation of Test Cases Based On Mu...,The design of automatic generation technology ...,2015,['Computer Science'],,https://semanticscholar.org/paper/e5575ca2a01f...,4,10
7,Discussion on the University Library Updating ...,Information from investigation and statics sho...,2011,['Computer Science'],,https://semanticscholar.org/paper/cff60c0d407c...,0,0
8,Automatic Generation of Deductive Spreadsheets...,,2008,['Computer Science'],,https://semanticscholar.org/paper/4d44729683e4...,0,20
9,Texture : a study in algorithmic composition,,1991,['Computer Science'],,https://semanticscholar.org/paper/8c49a7a0a165...,1,0


In [6]:
tcDataCS.data

Unnamed: 0,label,text
0,__label__Computer Science,Texture synthesis for digital painting The pro...
1,__label__Computer Science,Texture analysis and artificial neural network...
2,__label__Computer Science,Automatic generation of benchmark and test wor...
3,__label__Computer Science,Text Encoding Initiative This paper traces the...
4,__label__Computer Science,A Method of Text Classification Combining Naiv...
5,__label__Computer Science,Automatic generation of application-specific a...
6,__label__Computer Science,Automatic Generation of Test Cases Based On Mu...
7,__label__Computer Science,Discussion on the University Library Updating ...
8,__label__Computer Science,Automatic Generation of Deductive Spreadsheets...
9,__label__Computer Science,Texture : a study in algorithmic composition


In [7]:
tcDataCS.data['text'][0]

'Texture synthesis for digital painting The problem of digital painting is considered from a signal processing viewpoint, and is reconsidered as a problem of directed texture synthesis. It is an important characteristic of natural texture that detail may be evident at many scales, and the detail at each scale may have distinct characteristics. A “sparse convolution” procedure for generating random textures with arbitrary spectral content is described. The capability of specifying the texture spectrum (and thus the amount of detail at each scale) is an improvement over stochastic texture synthesis processes which are scalebound or which have a prescribed 1/f spectrum. This spectral texture synthesis procedure provides the basis for a digital paint system which rivals the textural sophistication of traditional artistic media. Applications in terrain synthesis and texturing computer-rendered objects are also shown.'

In [8]:
tcDataCS.data.head()

Unnamed: 0,label,text
0,__label__Computer Science,Texture synthesis for digital painting The pro...
1,__label__Computer Science,Texture analysis and artificial neural network...
2,__label__Computer Science,Automatic generation of benchmark and test wor...
3,__label__Computer Science,Text Encoding Initiative This paper traces the...
4,__label__Computer Science,A Method of Text Classification Combining Naiv...


### Other datasets

In [9]:
tcDataOther = Data(r"C:/Users/Tyler/Desktop/UWA Repos/literaturesieve/project/notebooks/data/papers_NOT_of_interest.csv")

In [10]:
# Must be due to the NaN values for abstracts.
tcDataOther.dataSummary()

Rows dropped: 7239


In [11]:
tcDataOther.dataOriginal

Unnamed: 0.1,Unnamed: 0,0,1,2
0,Use and Safety of Respiratory Medicines in Chi...,textabstractThe lack of appropriately authoris...,2011,['Medicine']
1,Language in Australia: Transplanted languages ...,,1991,['Sociology']
2,Automatic Generation of A High-level Contact S...,"Information of high-level, topological contact...",2007,['Engineering']
3,Textured Thin Films of Transition Metal Dichal...,,1994,['Materials Science']
4,Text Mining with Support Vector Machines and N...,The objective of this thesis is to develop eff...,2007,[]
5,Summarization of the Literatures on Human Reso...,Based on the present situation of human resour...,2007,['Engineering']
6,Risk Stratification and Risk Modification in P...,textabstractWhen patients are suffering an acu...,2006,['Medicine']
7,Text-melody relationships: translation of Euro...,"Adopting a qualitative approach, videotaped ex...",2005,['Psychology']
8,Textbooks in Greek and Latin: 2011 Supplementa...,,2011,['History']
9,Textbooks and the Teaching of Fluid Inquiry,,1986,['Mathematics']


In [12]:
# print(tcDataOther.data.to_string())

In [13]:
tcDataOther.lenOriginalData

51813

In [14]:
len(tcDataOther.data)

44574

In [15]:
len(tcDataOther.dataNoLabel)

7239

In [16]:
tcDataOther.dataNoLabel

Unnamed: 0,label,text
4,__label__,Text Mining with Support Vector Machines and N...
15,__label__,Textschicksale: Das Werk Arthur Schnitzlers im...
22,__label__,Text-Kultur Kommunikation : Translation als Fo...
30,__label__,Textual Features from Multimedia Conten Identi...
45,__label__,Language contact and language change – linguis...
48,__label__,Compositional Pre-Training for Semantic Parsin...
59,__label__,Contextual Acquisition of Information Categori...
68,__label__,A PILOT STUDY IN AN APPLICATION OF TEXT MINING...
72,__label__,"Texte, Gattungen, Textsorten und ihre Verwendu..."
87,__label__,Textured Image Synthesis and Segmentation via ...


### Create entire dataset

In [17]:
trainingData = tcDataCS.data.append(tcDataOther.data)

In [18]:
trainingData.head()

Unnamed: 0,label,text
0,__label__Computer Science,Texture synthesis for digital painting The pro...
1,__label__Computer Science,Texture analysis and artificial neural network...
2,__label__Computer Science,Automatic generation of benchmark and test wor...
3,__label__Computer Science,Text Encoding Initiative This paper traces the...
4,__label__Computer Science,A Method of Text Classification Combining Naiv...
5,__label__Computer Science,Automatic generation of application-specific a...
6,__label__Computer Science,Automatic Generation of Test Cases Based On Mu...
7,__label__Computer Science,Discussion on the University Library Updating ...
8,__label__Computer Science,Automatic Generation of Deductive Spreadsheets...
9,__label__Computer Science,Texture : a study in algorithmic composition


#### Test, Dev, Train Split

In [19]:
# attr: https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
train, dev, test = np.split(trainingData.sample(frac=1), [int(.6*len(trainingData)), int(.8*len(trainingData))])
print(f'{len(trainingData)} | {len(train)} | {len(dev)} | {len(test)}')

74444 | 44666 | 14889 | 14889


In [20]:
train.to_csv(r'C:/Users/Tyler/Desktop/UWA Repos/literaturesieve/project/notebooks/data/train.csv', sep='\t', index = False, header = False)
test.to_csv(r'C:/Users/Tyler/Desktop/UWA Repos/literaturesieve/project/notebooks/data/test.csv', sep='\t', index = False, header = False)
dev.to_csv(r'C:/Users/Tyler/Desktop/UWA Repos/literaturesieve/project/notebooks/data/dev.csv', sep='\t', index = False, header = False)

Review data to ensure it's been saved correctly.

In [23]:
dev = pd.read_csv('C:/Users/Tyler/Desktop/UWA Repos/literaturesieve/project/notebooks/data/dev.csv', sep='\t')

In [24]:
dev.head()

Unnamed: 0,__label__Art,Texto Refundido de Régimen Local: Art. 368
0,__label__Computer Science,Automatic Timeiseries Model Generation for Rea...
1,__label__Computer Science,Text-based English-Arabic sentence alignment I...
2,__label__Computer Science,Natural language interaction with an expert sy...
3,__label__Psychology,Language use and language learning in CLIL
4,__label__Psychology,Auditieve hersenstampotentialen bij de mens : ...


## Flair Text Classification

https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f

### Create corpus

In [64]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

import flair, torch

# Use GPU if available - Having issues with Flair model with GPU; switched to CPU for now.
device = None
# if torch.cuda.is_available():
#     device = torch.device('cuda:0')
# else:
#     device = torch.device('cpu')
device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


In [66]:
flair.device = torch.device('cpu') 

In [67]:
flair.device

device(type='cpu')

In [68]:
corpus = NLPTaskDataFetcher.load_classification_corpus(Path('C:/Users/Tyler/Desktop/UWA Repos/literaturesieve/project/notebooks/data/'),
                                                       test_file='test.csv',
                                                       dev_file='dev.csv',
                                                       train_file='train.csv')

  after removing the cwd from sys.path.


2020-04-09 11:37:59,778 Reading data from C:\Users\Tyler\Desktop\UWA Repos\literaturesieve\project\notebooks\data
2020-04-09 11:37:59,779 Train: C:\Users\Tyler\Desktop\UWA Repos\literaturesieve\project\notebooks\data\train.csv
2020-04-09 11:37:59,780 Dev: C:\Users\Tyler\Desktop\UWA Repos\literaturesieve\project\notebooks\data\dev.csv
2020-04-09 11:37:59,782 Test: C:\Users\Tyler\Desktop\UWA Repos\literaturesieve\project\notebooks\data\test.csv


In [69]:
print(len(corpus.train))

44666


In [70]:
corpus.make_label_dictionary()

2020-04-09 11:42:57,075 Computing label dictionary. Progress:


100%|█████████████████████████████████████████████████████████████████████████| 44666/44666 [00:00<00:00, 95491.31it/s]


2020-04-09 11:42:57,715 [b'Mathematics', b'Computer', b'Business', b'Materials', b'Geology', b'Art', b'Geography', b'Medicine', b'Psychology', b'Engineering', b'Biology', b'History', b'Environmental', b'Philosophy', b'EngineeringComputer', b'SociologyPhilosophy', b'Economics', b'Sociology', b'Political', b'Chemistry', b'ChemistryMedicine', b'Physics', b'PsychologyMedicine', b'MathematicsComputer', b'BiologyMedicine', b'GeographyMedicine', b'MedicineComputer', b'MedicineChemistry', b'MedicineEngineering', b'MedicineMathematicsComputer', b'MathematicsEngineering', b'SociologyMedicine', b'SociologyComputer', b'MedicinePhysics', b'PhysicsMedicine', b'PhysicsComputer', b'ChemistryPhysics', b'MedicineBiology', b'SociologyGeography', b'EconomicsMathematics', b'SociologyMedicineComputer', b'MedicineMathematics', b'MathematicsMedicine', b'PsychologyComputer', b'MedicineMaterials', b'EconomicsGeography', b'EngineeringGeography', b'EngineeringMathematics', b'GeologyMedicine', b'PsychologyArt', b'

<flair.data.Dictionary at 0x17ac75a2c88>

In [71]:
word_embeddings = [WordEmbeddings('glove'), FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]

In [72]:
document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)

In [73]:
# Single label for the current dataset, maybe multi-label in the future?
classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)

2020-04-09 11:43:00,062 Computing label dictionary. Progress:


100%|████████████████████████████████████████████████████████████████████████| 44666/44666 [00:00<00:00, 255918.08it/s]


2020-04-09 11:43:00,294 [b'Mathematics', b'Computer', b'Business', b'Materials', b'Geology', b'Art', b'Geography', b'Medicine', b'Psychology', b'Engineering', b'Biology', b'History', b'Environmental', b'Philosophy', b'EngineeringComputer', b'SociologyPhilosophy', b'Economics', b'Sociology', b'Political', b'Chemistry', b'ChemistryMedicine', b'Physics', b'PsychologyMedicine', b'MathematicsComputer', b'BiologyMedicine', b'GeographyMedicine', b'MedicineComputer', b'MedicineChemistry', b'MedicineEngineering', b'MedicineMathematicsComputer', b'MathematicsEngineering', b'SociologyMedicine', b'SociologyComputer', b'MedicinePhysics', b'PhysicsMedicine', b'PhysicsComputer', b'ChemistryPhysics', b'MedicineBiology', b'SociologyGeography', b'EconomicsMathematics', b'SociologyMedicineComputer', b'MedicineMathematics', b'MathematicsMedicine', b'PsychologyComputer', b'MedicineMaterials', b'EconomicsGeography', b'EngineeringGeography', b'EngineeringMathematics', b'GeologyMedicine', b'PsychologyArt', b'

In [74]:
# print(corpus.obtain_statistics())

In [75]:
trainer = ModelTrainer(classifier, corpus)

In [None]:
trainer.train('/content/drive/My Drive/Flair Text Classification/model/', max_epochs=10, embeddings_storage_mode='cpu')

2020-04-09 11:43:00,338 ----------------------------------------------------------------------------------------------------
2020-04-09 11:43:00,340 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512)
 

### Making predictions

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

In [None]:
classifer = TextClassifier.load('/content/drive/My Drive/Flair Text Classification/model/best-model.pt')

Predictions are done on sentences that have no label

In [None]:
tcDataOther.dataNoLabel.tail(10)

In [None]:
sentenceWithoutLabel = tcDataOther.dataNoLabel['text'].iloc[-3]

In [None]:
# sentenceWithoutLabel = 'Textile Based Organic Light Emitting Diodes for Wearable Displays'

In [None]:
# sentenceWithoutLabel = tcDataCS.data['text'].iloc[4]

In [None]:
print(f'Predicting label for {sentenceWithoutLabel}')

In [None]:
sentence = Sentence(sentenceWithoutLabel)

In [None]:
classifier.predict(sentence)

In [None]:
print(sentence.labels)

In [None]:
# help(sentence)

In [None]:
tcDataOther.dataNoLabel['pred_label'] = ''

In [None]:
from tqdm import tqdm

In [None]:
# iterate over data without labels, make and save predictions.
for index, row in tqdm(tcDataOther.dataNoLabel.iterrows()):
  sentence = Sentence(row['text'])
  classifier.predict(sentence)
  tcDataOther.dataNoLabel.at[index, 'pred_label'] = sentence.labels

In [None]:
# extracting label class and probability from predicted label
tcDataOther.dataNoLabel['pred_label_class'] = ''
tcDataOther.dataNoLabel['pred_label_proba'] = ''
for index, row in tcDataOther.dataNoLabel.iterrows():
  tcDataOther.dataNoLabel.at[index, 'pred_label_class'] = str(tcDataOther.dataNoLabel['pred_label'].loc[index][0]).split(' ')[0]
  tcDataOther.dataNoLabel.at[index, 'pred_label_proba'] = re.sub('\(|\)', '', str(tcDataOther.dataNoLabel['pred_label'].loc[index][0]).split(' ')[1])

In [None]:
tcDataOther.dataNoLabel.head(10)

In [None]:
tcDataOther.dataNoLabel.to_csv('/content/drive/My Drive/Flair Text Classification/data/tcDataOtherLabelled.csv')