# ANLI - GloVe

Background reading on GloVe, ELMo, and BERT embeddings: https://towardsdatascience.com/glove-elmo-bert-9dbbc9226934

In [1]:
# Print the Java runtime found on the PATH; PySpark (installed below) needs a JVM.
!java -version

openjdk version "11.0.19" 2023-04-18
OpenJDK Runtime Environment Homebrew (build 11.0.19+0)
OpenJDK 64-Bit Server VM Homebrew (build 11.0.19+0, mixed mode)


In [2]:
# Create a virtual environment named .sparknlp-env in the current working directory.
!python3 -m venv .sparknlp-env

In [3]:
# NOTE(review): each `!` command runs in its own short-lived subshell, so this
# activation presumably does not carry over to the kernel or to the install
# cells that follow -- confirm which environment the kernel actually uses.
!source .sparknlp-env/bin/activate

In [4]:
pip install -q torch

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install -q spark-nlp==4.4.1

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install -q pyspark==3.3.1

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from datasets import load_dataset
from pyspark.ml import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sparknlp.annotator import (
    ClassifierDLApproach,
    LemmatizerModel,
    Normalizer,
    SentenceEmbeddings,
    StopWordsCleaner,
    Tokenizer,
    WordEmbeddingsModel,
)
from sparknlp.base import DocumentAssembler
from torchtext.vocab import GloVe

In [8]:
import pyspark
import sparknlp

In [None]:
# Load ANLI and convert every split into a pandas DataFrame.
# The dataset exposes three rounds (r1-r3), each with train/dev/test splits.
dataset = load_dataset('anli')

# Round 1
train_r1 = pd.DataFrame(dataset['train_r1'])
dev_r1 = pd.DataFrame(dataset['dev_r1'])
test_r1 = pd.DataFrame(dataset['test_r1'])

# Round 2 -- must read its own training split; it previously re-read
# 'train_r1', so every "round 2" model was really trained on round 1 data.
train_r2 = pd.DataFrame(dataset['train_r2'])
dev_r2 = pd.DataFrame(dataset['dev_r2'])
test_r2 = pd.DataFrame(dataset['test_r2'])

# Round 3
train_r3 = pd.DataFrame(dataset['train_r3'])
dev_r3 = pd.DataFrame(dataset['dev_r3'])
test_r3 = pd.DataFrame(dataset['test_r3'])

In [None]:
def convert_to_glove(df):
    """Build the two-column frame consumed by the GloVe classifier pipeline.

    Args:
        df: DataFrame with 'label', 'hypothesis' and 'premise' columns.

    Returns:
        A new DataFrame with the 'label' column carried over and a 'text'
        column holding the hypothesis followed by the premise, separated by
        a single space.
    """
    # Join with a space: plain concatenation glued the last hypothesis word to
    # the first premise word ("...sat.The cat..."), corrupting tokenisation.
    text = df['hypothesis'] + ' ' + df['premise']
    return pd.DataFrame(data={'label': df['label'], 'text': text})

In [None]:
# Reshape every ANLI split into the (label, text) layout, one round per statement.
glove_train_r1, glove_dev_r1, glove_test_r1 = (
    convert_to_glove(split) for split in (train_r1, dev_r1, test_r1)
)

glove_train_r2, glove_dev_r2, glove_test_r2 = (
    convert_to_glove(split) for split in (train_r2, dev_r2, test_r2)
)

glove_train_r3, glove_dev_r3, glove_test_r3 = (
    convert_to_glove(split) for split in (train_r3, dev_r3, test_r3)
)

# Show the round-1 training frame as the cell output.
glove_train_r1

In [None]:
# Start a Spark session configured for Spark NLP.
# `sparknlp.start()` in the pinned 4.4.1 release does not accept a `spark33`
# keyword (passing it raises TypeError); the default session already targets
# the Spark 3.x line that pyspark==3.3.1 belongs to.
spark = sparknlp.start()

In [None]:
# Text-classification pipeline: raw text -> tokens -> cleaned lemmas ->
# GloVe word vectors -> averaged sentence vector -> ClassifierDL.

# Wrap the raw 'text' column as a Spark NLP document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Split documents into tokens, additionally breaking on hyphens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
    .setSplitChars(['-'])
    .setContextChars(['(', ')', '?', '!', '#', '@'])
)

# Strip every character matching the cleanup pattern from each token.
# Raw string: `\w`, `\d`, `\s` are regex classes, not Python escapes, and a
# non-raw literal triggers invalid-escape warnings on current Python versions.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setCleanupPatterns([r"[^\w\d\s]"])
)

# Remove stop words, ignoring case.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Lemmatise the remaining tokens with the pretrained 'lemma_antbnc' model.
lemma = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Pretrained GloVe word vectors. `pretrained` is a classmethod, so call it on
# the class; "glove_100d"/"en" are its defaults, spelled out so the reader can
# see which embeddings are loaded.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d", "en")
    .setInputCols(["document", 'lemma'])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pool the word vectors of each document into one sentence vector by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier on top of the sentence vectors.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(5)
    .setLr(0.001)
    .setBatchSize(8)
    .setEnableOutputLogs(True)
    # .setOutputLogsPath('logs')
)

# Assemble the stages in execution order.
glove_clf_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        glove_embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

In [None]:
# Drop rows that lack text or a label, then convert each round to a Spark DataFrame.
train_1 = glove_train_r1.dropna(subset=['text', 'label'])
sparkDF_1 = spark.createDataFrame(train_1)
train_2 = glove_train_r2.dropna(subset=['text', 'label'])
sparkDF_2 = spark.createDataFrame(train_2)
train_3 = glove_train_r3.dropna(subset=['text', 'label'])
sparkDF_3 = spark.createDataFrame(train_3)

# Each `fit` call returns its own fitted PipelineModel. Rebinding a single name
# three times threw away the round-1 and round-2 models right after training
# them, so keep one model per round instead.
glove_clf_models = {
    'r1': glove_clf_pipeline.fit(sparkDF_1),
    'r2': glove_clf_pipeline.fit(sparkDF_2),
    'r3': glove_clf_pipeline.fit(sparkDF_3),
}

# Same final binding as before: the model fitted on round 3.
glove_clf_pipelineModel = glove_clf_models['r3']

In [None]:
# Persist the fitted Spark ML pipeline so it can be reloaded later with
# `PipelineModel.load(OUTPUT_DIR)`.
# The previous contents of this cell were PyTorch/transformers checkpoint code
# that referenced names never defined in this notebook (`model`, `os`, `torch`,
# `WEIGHTS_NAME`, `CONFIG_NAME`) and called `save_vocabulary` on the Spark NLP
# tokenizer, so the cell could not run.
OUTPUT_DIR = "glove_clf_pipeline_model"

# `overwrite()` lets the cell be re-run without failing on an existing directory.
glove_clf_pipelineModel.write().overwrite().save(OUTPUT_DIR)

https://pytorch.org/docs/stable/nn.html#torch.nn.Module

https://huggingface.co/docs/transformers/model_doc/bert

https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/model#transformers.PreTrainedModel

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ef800559-4ff5-4f9a-a563-f7fcfd96a58b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>