# Text Classification

1. Use labelled datasets to predict unlabelled datasets
2. Need to concatenate labels together where there are multiple fields of study

In [148]:
import pandas as pd
import re
import numpy as np

In [198]:
class Data():
    
    def __init__(self, fileName, fosLabel):
        
        self.dataOriginal = pd.read_csv(fileName)
        self.data = self.dataOriginal.copy()
        self.fosLabel = fosLabel
        self.lenOriginalData = len(self.data)
        
        # exe
        self.formatData()
        self.formatLabelData()
        self.dropNanText()
        self.removeDataWithoutLabels()
    
    def formatData(self):
        # subset data for title, abstract, field of study
        self.data = self.data[['Unnamed: 0', '0', '2']]
        # rename columns
        self.data.rename(columns={'Unnamed: 0': 'Title', '0': 'Abstract', '2':'Label'}, inplace=True)
        # Replace NaN
        self.data['Abstract'] = self.data['Abstract'].replace(np.nan, '', regex=True)
        # concatenate title and abstract
        self.data['Text'] = self.data['Title'] + ' ' + self.data['Abstract']
        # Drop unnecessary columns
        self.data.drop(labels=['Title', 'Abstract'], axis=1, inplace=True)
        # change order of columns
        self.data = self.data[['Text', 'Label']]
        
        # Concatenate labels
        self.data['Label'] = self.data['Label'].apply(lambda x: self.labelConcat(x))
    
    def labelConcat(self, labelText):
        # Political Science should be made into Political-Science?
        try:
            return ''.join(ast.literal_eval(labelText))
        except:
            return ''
    
    def formatLabelData(self):
        """
        FastText format
        """
        
        # Add '__label__' to data for Flair model
        self.data['Label'] = '__label__' + self.data['Label']
        
    def dropNanText(self):
        # drop any text that is NaN
        self.data['Text'] = self.data[self.data['Text'].notna()]
        
    def dataSummary(self):
        print(f'Rows dropped: {self.lenOriginalData - len(self.data)}')
        
    def removeDataWithoutLabels(self):
        
        self.dataNoLabel = self.data[self.data['Label'] == '__label__']
        self.data = self.data[self.data['Label'] != '__label__']
        

In [199]:
tcDataCS = Data(r"C:\Users\22917746\Desktop\Semantic Scholar EDA\papers_of_interest.csv", fosLabel='CS')

In [200]:
tcDataCS.dataSummary()

Rows dropped: 0


In [201]:
tcDataCS.dataOriginal

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6
0,Texture synthesis for digital painting,The problem of digital painting is considered ...,1984,['Computer Science'],,https://semanticscholar.org/paper/1e9c964d606e...,63,10
1,Texture analysis and artificial neural network...,Clustered microcalcifications on X-ray mammogr...,1997,['Computer Science'],Neural Networks for Signal Processing VII. Pro...,https://semanticscholar.org/paper/b283fccfdeef...,3,9
2,Automatic generation of benchmark and test wor...,"In this tutorial, we describe techniques for a...",2010,['Computer Science'],,https://semanticscholar.org/paper/b56c5cac50dd...,6,8
3,Text Encoding Initiative,,2012,['Computer Science'],,https://semanticscholar.org/paper/4c99b00b844a...,1,1
4,A Method of Text Classification Combining Naiv...,Text classification is one of the main issues ...,2015,['Computer Science'],,https://semanticscholar.org/paper/195e933e65ed...,0,0
...,...,...,...,...,...,...,...,...
171,Pattern-Oriented Workflow Generation and Optim...,Automatic workflow generation is becoming an a...,2009,['Computer Science'],J. UCS,https://semanticscholar.org/paper/152204729490...,3,13
172,Texture analysis using level-crossing statistics,We present a novel statistical texture descrip...,2004,['Computer Science'],Proceedings of the 17th International Conferen...,https://semanticscholar.org/paper/1fc9846ff18d...,0,14
173,Text Generation: Reexamining G-TAG with Abstra...,G-TAG is a formalism dedicated to text generat...,2014,['Computer Science'],,https://semanticscholar.org/paper/2838884adbdc...,0,22
174,Open Source Native XML Database Architectures ...,Text-based and model-based architectures are t...,2013,['Computer Science'],2013 15th International Conference on Advanced...,https://semanticscholar.org/paper/ab85b0f4cec9...,2,2


In [202]:
tcDataCS.data

Unnamed: 0,Text,Label
0,Texture synthesis for digital painting The pro...,__label__Computer Science
1,Texture analysis and artificial neural network...,__label__Computer Science
2,Automatic generation of benchmark and test wor...,__label__Computer Science
3,Text Encoding Initiative,__label__Computer Science
4,A Method of Text Classification Combining Naiv...,__label__Computer Science
...,...,...
171,Pattern-Oriented Workflow Generation and Optim...,__label__Computer Science
172,Texture analysis using level-crossing statisti...,__label__Computer Science
173,Text Generation: Reexamining G-TAG with Abstra...,__label__Computer Science
174,Open Source Native XML Database Architectures ...,__label__Computer Science


In [203]:
tcDataCS.data['Text'][0]

'Texture synthesis for digital painting The problem of digital painting is considered from a signal processing viewpoint, and is reconsidered as a problem of directed texture synthesis. It is an important characteristic of natural texture that detail may be evident at many scales, and the detail at each scale may have distinct characteristics. A “sparse convolution” procedure for generating random textures with arbitrary spectral content is described. The capability of specifying the texture spectrum (and thus the amount of detail at each scale) is an improvement over stochastic texture synthesis processes which are scalebound or which have a prescribed 1/f spectrum. This spectral texture synthesis procedure provides the basis for a digital paint system which rivals the textural sophistication of traditional artistic media. Applications in terrain synthesis and texturing computer-rendered objects are also shown.'

In [204]:
tcDataCS.data.head()

Unnamed: 0,Text,Label
0,Texture synthesis for digital painting The pro...,__label__Computer Science
1,Texture analysis and artificial neural network...,__label__Computer Science
2,Automatic generation of benchmark and test wor...,__label__Computer Science
3,Text Encoding Initiative,__label__Computer Science
4,A Method of Text Classification Combining Naiv...,__label__Computer Science


### Other datasets

In [205]:
tcDataOther = Data(r"C:\Users\22917746\Desktop\Semantic Scholar EDA\papers_NOT_of_interest.csv", "Other")

In [206]:
tcDataOther.dataSummary()

Rows dropped: 44


In [207]:
tcDataOther.dataOriginal

Unnamed: 0.1,Unnamed: 0,0,1,2
0,Use and Safety of Respiratory Medicines in Chi...,textabstractThe lack of appropriately authoris...,2011,['Medicine']
1,Language in Australia: Transplanted languages ...,,1991,['Sociology']
2,Automatic Generation of A High-level Contact S...,"Information of high-level, topological contact...",2007,['Engineering']
3,Textured Thin Films of Transition Metal Dichal...,,1994,['Materials Science']
4,Text Mining with Support Vector Machines and N...,The objective of this thesis is to develop eff...,2007,[]
...,...,...,...,...
314,Texture Characteristics of Chips Produced by H...,,2019,['Materials Science']
315,Text-type conventions and translating,,1997,['Sociology']
316,Origin of classis with ring-handle on the door...,Textual research is made on the historical ori...,2006,['Engineering']
317,Texto para discussão 6: a tecnologia da inform...,,1996,['Philosophy']


In [208]:
print(tcDataOther.data.to_string())

                                                  Text                           Label
0    Use and Safety of Respiratory Medicines in Chi...               __label__Medicine
1    Language in Australia: Transplanted languages ...              __label__Sociology
2    Automatic Generation of A High-level Contact S...            __label__Engineering
3    Textured Thin Films of Transition Metal Dichal...      __label__Materials Science
5    Summarization of the Literatures on Human Reso...            __label__Engineering
6    Risk Stratification and Risk Modification in P...               __label__Medicine
7    Text-melody relationships: translation of Euro...             __label__Psychology
8    Textbooks in Greek and Latin: 2011 Supplementa...                __label__History
9         Textbooks and the Teaching of Fluid Inquiry             __label__Mathematics
10   Texture evolution during severe plastic deform...      __label__Materials Science
11   Integrated SCM/PDM/CRM and delivery of

In [212]:
tcDataOther.lenOriginalData

319

In [213]:
len(tcDataOther.data)

275

In [214]:
len(tcDataOther.dataNoLabel)

44

In [210]:
tcDataOther.dataNoLabel

Unnamed: 0,Text,Label
4,Text Mining with Support Vector Machines and N...,__label__
15,Textschicksale: Das Werk Arthur Schnitzlers im...,__label__
21,Texture evolution and Swift effect in NiAl NiA...,__label__
22,Text-Kultur Kommunikation : Translation als Fo...,__label__
30,Textual Features from Multimedia Conten Identi...,__label__
44,Language contact and language change – linguis...,__label__
47,Compositional Pre-Training for Semantic Parsin...,__label__
58,Contextual Acquisition of Information Categori...,__label__
67,A PILOT STUDY IN AN APPLICATION OF TEXT MINING...,__label__
71,"Texte, Gattungen, Textsorten und ihre Verwendu...",__label__


### Create entire dataset

In [217]:
trainingData = tcData.data.append(tcDataOther.data)

In [218]:
trainingData

Unnamed: 0,Text,Label
0,Texture synthesis for digital painting The pro...,__label__Computer Science
1,Texture analysis and artificial neural network...,__label__Computer Science
2,Automatic generation of benchmark and test wor...,__label__Computer Science
3,Text Encoding Initiative,__label__Computer Science
6,A Method of Text Classification Combining Naiv...,__label__Computer Science
...,...,...
314,Texture Characteristics of Chips Produced by H...,__label__Materials Science
315,Text-type conventions and translating,__label__Sociology
316,Origin of classis with ring-handle on the door...,__label__Engineering
317,Texto para discussão 6: a tecnologia da inform...,__label__Philosophy


#### Test, Dev, Train Split

In [231]:
# attr: https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
train, dev, test = np.split(trainingData.sample(frac=1), [int(.6*len(trainingData)), int(.8*len(trainingData))])
print(f'{len(trainingData)} | {len(train)} | {len(dev)} | {len(test)}')

432 | 259 | 86 | 87


In [233]:
train.to_csv(r'ml_data/train.csv')
test.to_csv(r'ml_data/test.csv')
dev.to_csv(r'ml_data/dev.csv')

## Flair Text Classification

https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f

In [237]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

In [238]:
corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./ml_data'), test_file='test.csv', dev_file='dev.csv', train_file='train.csv')

2020-04-08 06:35:45,868 Reading data from ml_data
2020-04-08 06:35:45,869 Train: ml_data\train.csv
2020-04-08 06:35:45,870 Dev: ml_data\dev.csv
2020-04-08 06:35:45,871 Test: ml_data\test.csv


  """Entry point for launching an IPython kernel.
  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


In [239]:
word_embeddings = [WordEmbeddings('glove'), FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]

ValueError: invalid literal for int() with base 10: '0+cpu'

In [None]:
document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)

In [None]:
classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)

In [None]:
trainer = ModelTrainer(classifier, corpus)

In [None]:
trainer.train('./', max_epochs=10)