## Bibliotecas

In [1]:
import sys
sys.path.append('../../../')

In [2]:
## Spark SQL
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Spark ML
from pyspark.ml.pipeline import Pipeline#, PipelineModel
from pyspark.ml.feature import CountVectorizer,HashingTF, IDF

In [3]:
import mlflow.pyspark.ml

In [4]:
spark = (
    SparkSession
    .builder
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

## Data

In [305]:
df_train = spark.read.csv('D:/projects/pyspark_dev/JEOPARDY_CSV.csv', header=True).withColumn('questions', f.regexp_replace(f.col(" Question"), '"', ''))
df_train.limit(5).toPandas()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,questions
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"For the last 8 years of his life, Galileo was ..."
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,No. 2: 1912 Olympian; football star at Carlisl...
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,The city of Yuma in this state has a record av...
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""","In 1963, live on The Art Linkletter Show"
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,"Signer of the Dec. of Indep., framer of the Co..."


## Function

In [316]:
from pyspark.ml.feature import Word2Vec, Tokenizer, HashingTF, IDF, CountVectorizer

In [307]:
tokenizer = Tokenizer(inputCol='questions', outputCol='tokens')

In [308]:
df_tokens = tokenizer.transform(df_train)
df_tokens.limit(5).toPandas()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,questions,tokens
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal..."
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,..."
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,..."
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""","In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]"
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o..."


In [317]:
hashingTF = HashingTF(inputCol='tokens', outputCol='tf')

df_tf = hashingTF.transform(df_tokens)
df_tf.limit(5).toPandas()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,questions,tokens,tf
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""","In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [341]:
count_vec = CountVectorizer(inputCol='tokens', outputCol='count_vec')

df_count = count_vec.fit(df_tf).transform(df_tf)
df_count.limit(5).toPandas()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,questions,tokens,tf,count_vec
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ..."
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 1.0, 3.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""","In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(3.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [311]:
idf = IDF(inputCol="tf", outputCol="features")
idfModel = idf.fit(df_count)
rescaledData = idfModel.transform(df_count)

rescaledData.limit(5).toPandas()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,questions,tokens,tf,features
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""","In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [326]:
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="tokens", outputCol="result")
model = word2Vec.fit(rescaledData)

result = model.transform(rescaledData)

result.limit(5).toPandas()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,questions,tokens,tf,features,result
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.11015197454899962, -0.2252612228815754, 0...."
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.05510267963338839, -0.19572221568638556, 0..."
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.1251570734505852, -0.2910678359524657, -0...."
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""","In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.06598642090102658, -0.2612116835080087, 0...."
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.012280954875879817, -0.42235086402959293, ..."


In [355]:
from pyspark.ml.feature import Word2Vec, Tokenizer, HashingTF, IDF
from pyspark.ml.pipeline import Pipeline


class TextVectorizer:
    
    def __init__(self, inputCol, method, tokenized = False, **kwargs):
        """
        Constructor
        
    	Parameters
    	----------            
        vectorizer_cols : dict
                       Receives a dict with the name of the vectorizer to be 
                       performed and which are the columns
                       Ex: vectorizer_cols = {'embedding_median': ['col'], 
                                              'embedding_mean': ['col'],
                                              'tf_idf': ['col'],
                                              'bag_of_words' : [col]}
    	Returns
    	-------
        Normalization
        """
        self.inputCol = inputCol
        self.tokenized = tokenized
        methods_dict = {
            'hashing_tfidf': HashingTF, 
            'tfidf': CountVectorizer,
            'word2vec': Word2Vec,
        }
        algorithm = methods_dict[method](inputCol=inputCol, outputCol='word_vectors', **kwargs)
        stages = []
        if not tokenized:
            tokenizer = Tokenizer(inputCol=inputCol, outputCol='tokens')
            algorithm = algorithm.setInputCol('tokens')
            stages.append(tokenizer)
        stages.append(algorithm)
        if method in ['hashing_tfidf', 'tfidf']:
            algorithm = algorithm.setOutputCol('unscaled_vectors')
            idf = IDF(inputCol="unscaled_vectors", outputCol="word_vectors")
            stages.append(idf)
        if len(stages) > 0:
            self.vectorizer = Pipeline(stages=stages)
        else:
            self.vectorizer = algorithm
            
    def fit(self, df):
        """
        Generates the vectorizer object for each column. The text must be preprocessed.
        
    	Parameters
    	----------            
        df : pd.DataFrame
             dataframe with columns to be vectorizer
                    
    	Returns
    	-------
        None
        """
        if isinstance(self.vectorizer, (Word2Vec, Pipeline)):
            self.vectorizer = self.vectorizer.fit(df)
        else:
            raise Exception(f"{type(self.vectorizer)} doesn't have a `fit()` method.")

    def transform(self, df):
        """
        Apply the vectorizer object for each column. The text must be preprocessed.
        
    	Parameters
    	----------            
        df : pd.DataFrame
             dataframe with columns to be vectorizer
                    
    	Returns
    	-------
        pd.DataFrame
        """
        if isinstance(self.vectorizer, (Word2Vec, Pipeline)):
            raise Exception("Estimator not fitted.")
        
        return self.vectorizer.transform(df)

In [356]:
text_vectorizer = TextVectorizer('questions', 'word2vec')

In [357]:
text_vectorizer.fit(df_train)

In [359]:
text_vectorizer.transform(df_train).limit(5).toPandas()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,questions,tokens,word_vectors
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal...","[0.03199014771315786, -0.03464356282105048, 0...."
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,...","[-0.054371646605432034, -0.03740343884074766, ..."
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,...","[0.046375213760054775, -0.004481588438567188, ..."
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""","In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]","[-0.14786331076174974, -0.04333596816286445, 0..."
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o...","[0.10280782180941767, 0.021427887440141704, 0...."
