# Binary Classifier - Cleaning

## Date: February 22, 2020

Let's build a clean subset of the 8k+ samples we have, which can be used later on to train a classification model.

In [1]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.feature_selection import chi2
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

In [11]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [31]:
import re

In [None]:
# NOTE(review): presumably moves to the directory that contains 'projects/',
# since every path used below is relative ('projects/ai4good/...') - confirm.
os.chdir('../../../../../')

### 1. Cleaning steps

#### 1.1. Special character cleaning

Special characters we are removing:

\r \
\n \
\ before possessive nouns (government's = government\'s) \
\ before possessive nouns 2 (Yukos' = Yukos\') \
" when quoting text

#### 1.2 Punctuation removal

#### 1.3 Stop Words removal

Using spaCy stop words

#### 1.4 Non alphabetical characters removal

#### 1.5 Lemmatization

Using the spaCy lemmatizer

In [96]:
# Class used to render part of the text in a specific color or style.
# In our specific case, we want to highlight the words that suggest the
# presence of a dataset, namely: data, dataset, database.
class color:
    '''
    ANSI terminal escape sequences.
    Put one of the codes before the text to style and `end` after it to reset
    the terminal style.
    '''
    # foreground colors
    purple = '\033[95m'
    cyan = '\033[96m'
    darkcyan = '\033[36m'
    blue = '\033[94m'
    green = '\033[92m'
    yellow = '\033[93m'
    red = '\033[91m'
    # text attributes
    bold = '\033[1m'
    underline = '\033[4m'
    # reset every color/attribute set before
    end = '\033[0m'

In [102]:
def cleaning_txt(path):
    '''
    This function opens the json file in path and returns its text cleaned.

    The json file is expected to hold a single string (the string methods
    used below fail on anything else). The text is cleaned in several steps:
    - replacing carriage returns, newlines and runs of four spaces by a space
    - removing double quotes, lowercasing, removing the possessive 's
    - removing punctuation (string.punctuation)
    - removing spaCy stop words
    - removing tokens that are not purely alphabetical
    - removing tokens of less than three letters
    - lemmatizing, except for the driving words which are kept untouched
    - highlighting the driving words (data, dataset(s), database(s)) in red

    path: path of the json file to clean
    returns: the cleaned text as a single space separated string, with ANSI
             color codes around the driving words
    '''

    # JSON is UTF-8 by specification: do not depend on the platform default
    with open(path, encoding='utf-8') as file:
        txt = json.load(file)

    txt = txt.replace("\r", " ")
    txt = txt.replace("\n", " ")
    txt = txt.replace("    ", " ")
    txt = txt.replace('"', '')
    txt = txt.lower()
    txt = txt.replace("'s", "")

    #remove punctuation
    txt = txt.translate(str.maketrans('', '', string.punctuation))

    #stop words removal
    txt = [t.text for t in nlp(txt) if not t.is_stop]

    # remove remaining tokens that are not alphabetic
    txt = [t for t in txt if t.isalpha()]

    #remove single letters or two letters words
    txt = [t for t in txt if len(t)>2]

    #lemmatization (second spaCy pass, run on the filtered tokens only)
    txt = nlp(' '.join(txt))
    to_keep = ['data', 'dataset', 'database', 'datasets', 'databases']
    txt = [token.lemma_ if token.text not in to_keep else token.text for token in txt]

    #join words
    txt = ' '.join(txt)

    #highlight data in the text to help the annotation process
    # All the driving words are substituted in a single pass, longest first.
    # Substituting them one after the other would wrap the "data" prefix of
    # "dataset"/"database" first, and the inserted color codes would then
    # prevent the longer words from ever matching.
    longest_first = sorted(to_keep, key=len, reverse=True)
    pattern = '({})'.format('|'.join(re.escape(word) for word in longest_first))
    txt = re.sub(pattern, r'{}\1{}'.format(color.red, color.end), txt, flags=re.I)

    return txt

In [153]:
def get_cleaned_papers(path, out_dir='projects/ai4good/cleaned_papers'):
    '''
    This function takes the path, cleans the text and saves it in a folder
    holding only cleaned texts.

    Best effort: when the text cannot be cleaned, a warning is issued, nothing
    is written and the function simply returns.

    path: path of the json file to clean
    out_dir: folder receiving the cleaned text, stored as json under the same
             file name as path; created (with its parents) when missing
    returns: None
    '''
    import warnings

    try:
        txt = cleaning_txt(path)
    except Exception as err:
        # keep going with the other files, but do not hide the failure
        # (and let KeyboardInterrupt / SystemExit go through)
        warnings.warn('could not clean {!r}: {!r}'.format(path, err))
        return

    # no check-then-create race, and missing parent folders are created too
    os.makedirs(out_dir, exist_ok=True)

    nme = os.path.join(out_dir, os.path.basename(path))
    with open(nme, 'w', encoding='utf-8') as raw:
        json.dump(txt, raw, indent=4, sort_keys=False)

#### test of the cleaning_txt function:

In [7]:
path_test = 'projects/ai4good/data_aiminer/pages_selected_ter/Sugar Cane to Fuel-Ethanol... to green power? clean water? recycle sludge? reclaim soils?.pdf_page_648.json'

In [103]:
txt = cleaning_txt(path_test)

In [104]:
print(txt)

table agriculture model assumption assumption note feedstock production national average corn yield approximately buacre buacre annual increase baseline year consistent usda projection national average soybean yield approximately buacre annual increase international corn yield increase time example argentina buacre annual increase brazil buacre annual increase fapri model international soybean yield increase time example argentina buacre annual increase brazil buacre annual increase corn residue removal rate allow till practice removal rate allow reduce till practice removal conventional till derive graham agronomy journal ﬁcurrent potential corn stover suppliesﬂ perlack wright turhollow graham stoke erbach biomass feedstock bioenergy bioproduct industry technical feasibility billionton annual supply report prepared department energy department agricutlure conservation reserve program crp minimum limit million acre enrol program give time farm bill usda baseline assumption fertilizer u

#### We now use the get_cleaned_papers function on the whole set of documents that we have

In [108]:
papers_folder = 'projects/ai4good/data_aiminer/pages_selected_ter'

In [139]:
# Path of every regular file (sub-folders are skipped) found in papers_folder
papers_paths = [
    papers_folder + '/' + entry
    for entry in os.listdir(papers_folder)
    if os.path.isfile(os.path.join(papers_folder, entry))
]

#### cleaning all the documents we have and storing them in the cleaned_papers folder

In [154]:
# Clean every page and store the result in the cleaned_papers folder.
# get_cleaned_papers returns None whether or not the cleaning succeeded, so
# the list echoed below only shows how many paths were processed.
list(map(get_cleaned_papers, papers_paths))

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,