In [47]:
import findspark
findspark.init()

import pyspark

from pyspark.sql import SparkSession
from pyspark.sql import *
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark import ml
from nltk.tokenize import TweetTokenizer
import config
import os
from itertools import product
import plotly.express as px
import numpy as np
import pandas as pd
import re
from custom_transformer import EmojiExtractor

spark = SparkSession.builder.appName("Train NB")\
    .config("spark.executor.memory","4g")\
    .config("spark.driver.memory","5g")\
    .master('local[12]')\
    .getOrCreate()

In [105]:
sentiment_model = ml.PipelineModel.load(config.MODEL_NAME)

In [108]:
examples_filename = r'D:\datasets\chatbot_data\sample.csv'
test_filename = r'D:\datasets\twitter_sentiment_data\second_source\train.csv'
train_data = r'D:\datasets\chatbot_data\twcs\twcs.csv'

In [109]:
sample = spark.read.load(train_data, format = 'csv', inferSchema = True, header = True)

In [111]:
extractor = EmojiExtractor(inputCol = 'text', outputCol = 'emojis')

counter = ml.feature.CountVectorizer(inputCol = 'emojis', outputCol = 'emoji_count', 
                                     binary = True, vocabSize = 100)

emoji_pipeline = ml.Pipeline(stages = [extractor, counter])

In [113]:
sample = extractor.transform(sample)

In [115]:
sample = sample.withColumn('num_emojis', F.size(F.array_distinct('emojis'))).orderBy('num_emojis', ascending = False)
sample.persist()

DataFrame[tweet_id: string, author_id: string, inbound: string, created_at: string, text: string, response_tweet_id: string, in_response_to_tweet_id: string, emojis: array<string>, num_emojis: int]

In [120]:
emoji_training_data = sample.toPandas()

In [121]:
emoji_training_data = emoji_training_data.iloc[0:2000]

In [131]:
emoji_training_data['label'] = 0
df = emoji_training_data

In [133]:
df['rand'] = np.random.rand(len(df))
msk = df.rand < 0.8
train = df[msk]
test = df[~msk]

In [134]:
train[['label','text']].to_csv('./emoji_train.csv', index = False)
test[['label', 'text']].to_csv('./emoji_test.csv', index = False)

In [89]:
assembler = ml.feature.VectorAssembler(inputCols = ['prediction', 'emoji_count'], outputCol = 'nb_features')

full_pipeline = ml.Pipeline(stages = [trained_emoji_pipe, sentiment_model, assembler])