In [7]:
#Run once and then comment out

#!pip install gdown
#!gdown --id 1uRgJ5MzqoGh-XYQUFAVBUGQYlLi7aMXx 
#!unzip ABSA_glove_absa.zip 

In [8]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.6.3-rc1

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)


In [9]:
import sparknlp

spark = sparknlp.start()

# params =>> gpu=False, spark23=False (start with spark 2.3)

print("Spark NLP version", sparknlp.version())

print("Apache Spark version:", spark.version)


Spark NLP version 2.6.3-rc1
Apache Spark version: 2.4.4


In [10]:
from sparknlp.annotator import *
from sparknlp.base import *

In [11]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

token = Tokenizer()\
    .setInputCols(['document'])\
    .setOutputCol('token')

glove_embeddings = WordEmbeddingsModel.pretrained("glove_840B_300", "xx")\
    .setInputCols(["document", "token"])\
    .setOutputCol("embeddings")
    
loaded_ner_model = NerDLModel.load("ABSA_glove_absa")\
    .setInputCols(["document", "token", "embeddings"])\
    .setOutputCol("absa")

converter = NerConverter()\
    .setInputCols(["document", "token", "absa"])\
    .setOutputCol("absa_span")

ner_prediction_pipeline = Pipeline(
    stages = [
        document,
        token,
        glove_embeddings,
        loaded_ner_model,
        converter])

glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[OK!]


In [12]:
import pandas as pd
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Mounted at /gdrive
/gdrive


In [13]:
path     = '/gdrive/My Drive/Colab Notebooks//'
file_name = 'text_pp3.csv'
data_file = path + file_name
data = pd.read_csv (data_file , error_bad_lines=False ,encoding = "ISO-8859-1")

In [14]:
def do (text,ner_prediction_pipeline,document,sentence):
    empty_data = spark.createDataFrame([['']]).toDF("text")
    prediction_model = ner_prediction_pipeline.fit(empty_data)
    sent_pipeline = Pipeline(stages = [document, sentence])
    df = spark.createDataFrame(pd.DataFrame({'text': [text]}))
    df1 = prediction_model.transform(df).toPandas()
    df2 = sent_pipeline.fit(empty_data).transform(df).toPandas()
    all_sents = df2['sentence'][0]
    sentences = []
    aspects = []
    sentiments = []
    for result in df1['absa_span'][0]:
        start, end = result['begin'], result['end']
        for sent in all_sents:
            if sent['begin'] <= start and sent['end'] >= end:
                sentences.append(sent['result'])
        aspects.append(result['result'])
        sentiment = "positive" if result['metadata']['entity'] == "POS" else "negative"
        sentiments.append(sentiment)
    final_result = pd.DataFrame.from_dict({"sentence": sentences, "aspect": aspects, "sentiment": sentiments})
    #print (final_result)
    #del glove_embeddings
    return final_result


In [None]:
path     = '/gdrive/My Drive/Colab Notebooks//'
file_name = 'text_pp3_annotated.csv'
data_file = path + file_name
initial_df = pd.DataFrame (columns = ['sentence' , 'aspect' , 'sentiment'])
counter    = 0 
for description in data ['text'].dropna().values[55000:60000]:
     try:
      #print (description)
      counter = counter  + 1
      print (counter , "out of" , data.shape [0] )
      combo_df = do (description,ner_prediction_pipeline,document,sentence)
      print (combo_df)
      combo_df.to_csv (data_file,mode= 'a' ,header = False)
      #initial_df = initial_df.append ( combo_df)
      del combo_df
     except Exception as e:
       #print ("**" * 100) 
       print("Error Detected",e)
       continue 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Empty DataFrame
Columns: [sentence, aspect, sentiment]
Index: []
1220 out of 92862
                                            sentence  aspect sentiment
0  [" 'http://t.co/wZ1IxNKCwe happened to all see...  lights  negative
1  [" 'http://t.co/wZ1IxNKCwe happened to all see...  lights  negative
1221 out of 92862
                                            sentence           aspect sentiment
0  [' making no sound.I was watching the firework...  upstairs window  positive
1  I thought they may have been lanterns', ' but ...     lanterns', '  negative
1222 out of 92862
                                            sentence  aspect sentiment
0  [' my sister-in-law', ' and myself. We were at...  window  negative
1  [' my sister-in-law', ' and myself. We were at...  lights  negative
2  [' my sister-in-law', ' and myself. We were at...  lights  negative
3  The lights appeared to move independently of e...  lights  negative
1223 out