In [None]:
# https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5TRANSFORMER.ipynb
# Spark NLP: John Snow Labs
# Apache Spark and SparkML

In [1]:
! pip install -q pyspark==3.1.2 spark-nlp==4.2.8

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.4/212.4 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m453.8/453.8 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
import json

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

In [4]:
# Start the Spark session through Spark NLP's helper and report the library version.
spark = sparknlp.start()
nlp_version = sparknlp.version()
print("Spark NLP Version :", nlp_version)
spark  # bare last expression so the notebook displays the session object

Spark NLP Version : 4.2.8


In [6]:
# First pipeline stage: reads the "text" column and writes its output to "documents".
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("documents")
)


In [7]:

# Value passed to setMaxOutputLength for the generated summary.
MAX_OUTPUT_LENGTH = 200

# Load the pretrained English "t5_small" model and configure it for the
# "summarize:" task. `pretrained` is invoked on the class itself: it is a
# static loader, so constructing a throwaway `T5Transformer()` first is wasted work.
t5 = (
    T5Transformer.pretrained("t5_small", "en")
    .setTask("summarize:")
    .setMaxOutputLength(MAX_OUTPUT_LENGTH)
    .setInputCols(["documents"])
    .setOutputCol("summaries")
)

# Two-stage pipeline: document assembly followed by T5 summarization.
summarizer_pp = Pipeline(stages=[document_assembler, t5])

t5_small download started this may take some time.
Approximate size to download 141.1 MB
[OK!]


In [8]:
# Fit the pipeline on a single-row placeholder frame (one empty string in a
# "text" column), then wrap the fitted model in a LightPipeline.
placeholder_rows = [['']]
empty_df = spark.createDataFrame(placeholder_rows).toDF('text')

pipeline_model = summarizer_pp.fit(empty_df)
sum_lmodel = LightPipeline(pipeline_model)

# Sample input for the summarizer: a four-paragraph passage on the history of
# Sunstate Airlines. The bracketed citation markers (e.g. "[2]") and the
# leading/trailing blank lines are part of the string and are passed to the model as-is.
example_txt = """

The company's roots extend back to 1975, when Noosa Air began operating in December of that year between Maroochydore and Brisbane using a Britten-Norman Islander.[2] Maryborough businessman Bevan Whitaker,[3] owner of the parent company of Noosa Air, Whitaker Pty Ltd, set up a second airline that commenced operations in December 1981, serving intrastate routes in Queensland vacated by Trans Australia Airlines (TAA) with Embraer EMB 110 Bandeirante aircraft. This second airline was called Sunstate Airlines.

Initially the two airlines used separate airline codes. In 1983, Sunstate changed its code to that of Noosa Air,[2] and by the end of the year, the two airlines had merged fully. From 1 January 1984, all flights were conducted under the Sunstate name as part of TAA's Queensland network.[4] The diverse, combined fleet consisted of two Islanders, two Bandeirantes, three de Havilland Canada DHC-6 Twin Otters and a Short 330. Within a few months, the Islanders and the Short 330 were replaced by a GAF N.24A Nomad and a Short 360,[4] with the Islanders going to associated company Whitaker Air Services.[5]

In 1986 Sunstate purchased a share in Victorian airline, Mildura-based Murray Valley Airlines (MVA), which was established in 1975 but ceased operations in October 1986 due to financial problems.[6] Operations recommenced on 9 November 1986 as Sunstate Airlines (Mildura) on the old MVA routes from Mildura to Melbourne, to Adelaide via Renmark, and to Broken Hill.[6] The airline now had two unconnected networks. The following year Sunstate expanded in its original territory when financially troubled Air Queensland gave up its routes in South East Queensland; Sunstate took over these routes on 1 June. The airline had prepared for the route handover by acquiring more aircraft, its fleet by then consisted of four Nomads (two N24As and two N.22s), three Bandeirantes, three Twin Otters, two Short 360s and a Short 330 in Queensland; and a Short 360 and Cessna 404 in Victoria.[6]

In 1989 Australian Airlines, the successor to TAA and previously the owner of now-defunct Air Queensland, took a one-third share in Sunstate.[7] Shortly afterwards Sunstate commenced operations out of Cairns and the fleet was somewhat rationalised, now consisting of 3 Short 360s, 2 Short 330s, 2 Twin Otters, and a Bandeirante transferred from the Mildura operation.[8] Meanwhile, Sunstate Airlines (Mildura) was awarded a five-year contract to operate air ambulance flights on behalf of the Ambulance services of Victoria. To serve the contract it took a Cessna 404 the Queensland operation had acquired from the break-up of Air Queensland, its own Cessna 404, and four others. To maintain its airline operations a succession of Cessna 404s were leased one at a time from Eastern Australia Airlines, Australian Airlines' regional subsidiary in New South Wales.[8]

"""

# Run the light pipeline on the sample text and keep the first returned entry.
annotated_docs = sum_lmodel.fullAnnotate(example_txt)
res = annotated_docs[0]

# Show the text of the first annotation in the "summaries" output column.
first_summary = res['summaries'][0]
print('Summary:', first_summary.result)

Before _validateStagesInputCols
Summary: noosa air began operating in 1975 using a Britten-Norman Islander . the company's roots extend back to 1975, when noosa air began operating in 1981 . the company's first airline was called Sunstate Airlines .
