In [1]:
import glob
import os
import re
import string
import unicodedata

import numpy as np
import pandas as pd
import tensorflow_text as tf_text
from nltk.tokenize import sent_tokenize

In [2]:
def to_data_frame(csv_file):
    '''
    Load a CSV source into a pandas DataFrame.

    Parameters
    ----------
    csv_file : str, path-like or file-like object
        Passed straight to ``pandas.read_csv``. Only CSV content is
        supported; JSON input is not parsed by this function.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    '''
    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    return pd.read_csv(csv_file)

In [9]:
def english_papers(data_frame):
    '''
    Keep only the rows whose 'language' column marks the paper as English.

    The 'language' column is compared as a string against the literal
    "{'code': 'en', 'name': 'English'}"; any other value (including
    missing values) is filtered out.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with a 'language' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The English rows only, re-indexed from 0.
    '''
    english = "{'code': 'en', 'name': 'English'}"
    # A boolean mask selects rows by position, so this works whatever the
    # incoming index labels are (dropping by an enumerate() counter only
    # works on a pristine 0..n-1 index).
    is_english = data_frame['language'] == english
    return data_frame[is_english].reset_index(drop=True)

In [5]:
# this is not an ideal solution but will suffice
# for the purpose of cleaning text for learning NLP
def text_clean(text, skip_sentences=20, min_chars=30, max_periods=5, max_commas=6):
    '''
    Clean scientific-paper texts and split them into sentences.

    Each document is Unicode-normalised (NFKC), lower-cased, truncated at
    the first occurrence of the word 'references', stripped of digits and
    split into sentences with ``sent_tokenize``. The leading sentences of
    every document are discarded and the rest are filtered by length and
    punctuation count.

    Parameters
    ----------
    text : iterable of str
        The documents to clean. Entries that are not strings (for example
        NaN for a missing full text) are skipped.
    skip_sentences : int, default 20
        Number of leading sentences dropped from each document to get rid
        of messy document identifiers.
    min_chars : int, default 30
        A sentence is kept only if it is longer than this many characters.
    max_periods : int, default 5
        A sentence is kept only if it contains fewer than this many '.'.
    max_commas : int, default 6
        A sentence is kept only if it contains fewer than this many ','.

    Returns
    -------
    list of str
        The retained sentences from all documents, in input order.
    '''
    sentences = []
    for document in text:
        if not isinstance(document, str):
            continue
        # Normalise before lower-casing: compatibility characters can
        # expand to upper-case letters under NFKC.
        document = unicodedata.normalize('NFKC', document).lower()
        # not a nice way to remove references; str.find returns -1 when the
        # word is absent, and slicing with -1 would chop the last character.
        cut = document.find('references')
        if cut != -1:
            document = document[:cut]
        document = ''.join(ch for ch in document if not ch.isdigit())  # remove digits
        for sentence in sent_tokenize(document)[skip_sentences:]:
            if (len(sentence) > min_chars
                    and sentence.count('.') < max_periods
                    and sentence.count(',') < max_commas):
                sentences.append(sentence)
    return sentences


In [6]:
def remove_hyperlinks(data_frame):
    '''
    Strip http/https links from the string values of a DataFrame.

    Every run of characters that starts with 'http' and continues up to
    the next whitespace is replaced with an empty string.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the links removed.
    '''
    # Raw string: '\S' in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return data_frame.replace(r'http\S+', '', regex=True)

In [10]:
# Build one frame of cleaned sentences per API response file, then combine.
# sorted() keeps the file order (and therefore the order in which
# categories first appear) independent of the OS directory listing order.
files = sorted(glob.glob(os.path.join('API_responses', '*.csv')))
frames = []
for file in files:
    df = to_data_frame(file)
    df = english_papers(df)
    df = remove_hyperlinks(df)
    tokens = text_clean(df['fullText'])
    tokens = pd.DataFrame(tokens)
    tokens['Category'] = str(file)  # raw file path for now
    frames.append(tokens)

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop is quadratic; concatenate once instead. Each file keeps its own
# 0..n index, as before. With no input files the result is an empty frame.
all_sentences = pd.concat(frames) if frames else pd.DataFrame()

all_sentences

Unnamed: 0,0,Category
0,information inserted in\nthe blockchain is pub...,API_responses\Blockchain.csv
1,smart contracts are self-executing\ncontracts ...,API_responses\Blockchain.csv
2,"recently, blockchain and its relations with sm...",API_responses\Blockchain.csv
3,"in [], blockchain has been compared to inventi...",API_responses\Blockchain.csv
4,"according to gartner’s hype cycle, blockchain ...",API_responses\Blockchain.csv
...,...,...
641,the objectives for this session for attendees ...,API_responses\Virtual reality.csv
642,"as the definition told us, autism spectrum dis...",API_responses\Virtual reality.csv
643,"when designing instruction or interventions, \...",API_responses\Virtual reality.csv
644,the primary focus of any instructional \nmetho...,API_responses\Virtual reality.csv


In [7]:
# Rename the sentence column to 'Text' and reduce Category from the source
# file path to the bare topic name (e.g. 'API_responses\Blockchain.csv'
# becomes 'Blockchain').
all_sentences = (
    all_sentences
    .rename(columns={0: 'Text'})
    .assign(
        Category=lambda frame: (
            frame['Category']
            # drop the directory part, whichever path separator the OS used
            .str.replace(r'^.*[\\/]', '', regex=True)
            # drop the extension: literal dot, only at the end of the name
            .str.replace(r'\.csv$', '', regex=True)
        )
    )
)
all_sentences

Unnamed: 0,Text,Category
0,information inserted in\nthe blockchain is pub...,Blockchain
1,smart contracts are self-executing\ncontracts ...,Blockchain
2,"recently, blockchain and its relations with sm...",Blockchain
3,"in [], blockchain has been compared to inventi...",Blockchain
4,"according to gartner’s hype cycle, blockchain ...",Blockchain
...,...,...
641,the objectives for this session for attendees ...,Virtual reality
642,"as the definition told us, autism spectrum dis...",Virtual reality
643,"when designing instruction or interventions, \...",Virtual reality
644,the primary focus of any instructional \nmetho...,Virtual reality


In [21]:
# Encode Category into a numeric representation stored in a new 'Code' column.
# pd.factorize numbers the categories 0, 1, 2, ... in order of first
# appearance, which matches enumerating all_sentences['Category'].unique().
codes, _ = pd.factorize(all_sentences['Category'])

df2 = (
    all_sentences
    # codes is a plain array, so it is assigned by position; this matters
    # because the index labels repeat across the source files.
    .assign(Code=codes)
    # stable sort keeps rows grouped per category in their original order
    .sort_values('Code', kind='stable')
)
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Text,Category,Code
0,information inserted in\nthe blockchain is pub...,Blockchain,0
1,smart contracts are self-executing\ncontracts ...,Blockchain,0
2,"recently, blockchain and its relations with sm...",Blockchain,0
3,"in [], blockchain has been compared to inventi...",Blockchain,0
4,"according to gartner’s hype cycle, blockchain ...",Blockchain,0
...,...,...,...
641,the objectives for this session for attendees ...,Virtual reality,9
642,"as the definition told us, autism spectrum dis...",Virtual reality,9
643,"when designing instruction or interventions, \...",Virtual reality,9
644,the primary focus of any instructional \nmetho...,Virtual reality,9


In [23]:
# to_csv does not create missing directories, so make sure the output
# folder exists before writing (needed on a fresh checkout).
os.makedirs('processed_text', exist_ok=True)
df2.to_csv('processed_text/NLP_data_1.csv', index = False)