# NLP_PySpark

SparkML도 다른 머신러닝 라이브러리들과 마찬가지로 학습을 위한 전처리, 모델 알고리즘, 성능을 극대화하기 위한 도구들을 지원합니다. 다만, 다른 라이브러리에 비해 스파크는 대중적으로 사용되는 몇몇 알고리즘만 구현되어 있습니다. 새롭거나 핫한 모델이 나와도 스파크에서 쓰려면 다른 라이브러리보다는 조금 더 기다려야 합니다.

학습에 필요한 전처리를 스파크로 진행하고 모델링은 텐서플로우와 같은 타 라이브러리로 진행하거나, 스파크 지원 모델로 충분한 프로젝트라면 모델링까지 스파크로 마무리하여 작업의 속도를 높일 수 있습니다.




In [None]:
pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Build the Spark session named 'nlp', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [None]:
df=spark.createDataFrame([(1,'I really liked this movie'),
                         (2,'I would recommend this movie to my friends'),
                         (3,'movie was alright but acting was horrible'),
                         (4,'I am never watching that movie ever again')],
                        ['user_id','review'])


In [None]:
df.show(5,False)

In [None]:
from pyspark.ml.feature import Tokenizer

In [None]:
tokenization=Tokenizer(inputCol='review',outputCol='tokens')

In [None]:
tokenized_df=tokenization.transform(df)

In [None]:
tokenized_df.show(4,False)

StringIndexer - 문자열 값을 0, 1, 2와 같은 숫자 인덱스로 변환합니다. 라벨뿐만 아니라 문자열 피처를 숫자 인덱스로 변환할 때에도 사용합니다.

tokenizer - MK는 밥을 먹는다라는 문장을 [MK는, 밥을, 먹는다]라는 토큰으로 나누어 줍니다.

remover - 은, 는, 이, 가, 을, 를과 같은 조사처럼 의미가 적은 불용어를 제거합니다. 예: [MK, 밥, 먹는다]

word2vec - 단어 목록의 패턴을 계산해서 단어와 단어 사이의 관계를 수치로 표현하는 방법 [MK, 밥, 먹는다], [JK, 밥, 먹는다], [고양이, 소파, 눕는다]

TF-IDF - 문장(review) 안에서 각 단어가 등장한 횟수를 센 것이 Term Frequency(TF)이고, 여러 문장에 공통으로 자주 등장하는 단어의 가중치를 낮추는 값이 Inverse Document Frequency(IDF)입니다. 예:
[이거 게임 정말 좋아 정말 최고야], [이거 게임 별로임]
* 일련의 문장으로 시작합니다. Tokenizer를 사용하여 각 문장을 단어로 나눕니다. 각 문장(단어 모음)에 대해 HashingTF를 사용하여 문장을 특징 벡터로 해시합니다. 그런 다음 IDF를 사용하여 특징 벡터의 크기를 조정합니다.

In [None]:
from pyspark.ml.feature import StopWordsRemover

In [None]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [None]:
refined_df=stopword_removal.transform(tokenized_df)

In [None]:
refined_df.select(['user_id','tokens','refined_tokens']).show(10,False)

In [None]:
from pyspark.ml.feature import CountVectorizer

In [None]:
count_vec=CountVectorizer(inputCol='refined_tokens',outputCol='features')

In [None]:
cv_df=count_vec.fit(refined_df).transform(refined_df)

In [None]:
cv_df.select(['user_id','refined_tokens','features']).show(4,False)

In [None]:
count_vec.fit(refined_df).vocabulary

In [None]:
from pyspark.ml.feature import HashingTF,IDF

In [None]:
hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features')

In [None]:
hashing_df=hashing_vec.transform(refined_df)

In [None]:
hashing_df.select(['user_id','refined_tokens','tf_features']).show(4,False)

In [None]:
tf_idf_vec=IDF(inputCol='tf_features',outputCol='tf_idf_features')

In [None]:
tf_idf_df=tf_idf_vec.fit(hashing_df).transform(hashing_df)

In [None]:
tf_idf_df.select(['user_id','tf_idf_features']).show(4,False)

In [None]:
text_df=spark.read.csv('/content/drive/MyDrive/datacamp/Movie_reviews.csv',inferSchema=True,header=True,sep=',')

In [None]:
text_df.printSchema()

In [None]:
text_df.count()

In [None]:
from pyspark.sql.functions import rand

In [None]:
text_df.orderBy(rand()).show(10,False)

In [None]:
text_df=text_df.filter(((text_df.Sentiment =='1') | (text_df.Sentiment =='0')))

In [None]:
text_df.count()

In [None]:
text_df.groupBy('Sentiment').count().show()

In [None]:
text_df.printSchema()

In [None]:
text_df = text_df.withColumn("Label", text_df.Sentiment.cast('float')).drop('Sentiment')

In [None]:
text_df.orderBy(rand()).show(10,False)

In [None]:
text_df.groupBy('label').count().show()

In [None]:
from pyspark.sql.functions import length

In [None]:
text_df=text_df.withColumn('length',length(text_df['Review']))

In [None]:
text_df.orderBy(rand()).show(10,False)

In [None]:
text_df.groupBy('Label').agg({'Length':'mean'}).show()

In [None]:
tokenization=Tokenizer(inputCol='Review',outputCol='tokens')

In [None]:
tokenized_df=tokenization.transform(text_df)

In [None]:
tokenized_df.show()

In [None]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [None]:
refined_text_df=stopword_removal.transform(tokenized_df)

In [None]:
refined_text_df.show()

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *

In [None]:
# Explicit import so this cell does not depend on the wildcard import above.
from pyspark.sql.functions import col, size

# Count the tokens left after stop-word removal. The built-in `size` column
# function replaces the previous Python UDF around `len`, so no Python
# function has to be invoked for each row.
refined_text_df = refined_text_df.withColumn("token_count", size(col('refined_tokens')))

In [None]:
count_vec=CountVectorizer(inputCol='refined_tokens',outputCol='features')

In [None]:
cv_text_df=count_vec.fit(refined_text_df).transform(refined_text_df)

In [None]:
cv_text_df.select(['refined_tokens','token_count','features','Label']).show(10)

In [None]:
model_text_df=cv_text_df.select(['features','token_count','Label'])

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
df_assembler = VectorAssembler(inputCols=['features','token_count'],outputCol='features_vec')
model_text_df = df_assembler.transform(model_text_df)

In [None]:
model_text_df.printSchema()

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
training_df,test_df=model_text_df.randomSplit([0.75,0.25])

In [None]:
training_df.groupBy('Label').count().show()

In [None]:
test_df.groupBy('Label').count().show()

In [None]:
log_reg=LogisticRegression(featuresCol='features_vec',labelCol='Label').fit(training_df)

In [None]:
results=log_reg.evaluate(test_df).predictions

In [None]:
results.show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
true_postives = results[(results.Label == 1) & (results.prediction == 1)].count()
true_negatives = results[(results.Label == 0) & (results.prediction == 0)].count()
false_positives = results[(results.Label == 0) & (results.prediction == 1)].count()
false_negatives = results[(results.Label == 1) & (results.prediction == 0)].count()

In [None]:
recall = float(true_postives)/(true_postives + false_negatives)
print(recall)

In [None]:
precision = float(true_postives) / (true_postives + false_positives)
print(precision)

In [None]:
accuracy=float((true_postives+true_negatives) /(results.count()))
print(accuracy)

# Sequence_Embeddings_PySpark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.ml.feature import StringIndexer
from pyspark.sql.window import Window

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from statsmodels.api import Logit
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np
import sys
import itertools
import re
from random import sample
import time

In [None]:
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim.models import Word2Vec

In [None]:
spark=SparkSession.builder.appName('seq_embedding').getOrCreate()

In [None]:
df = spark.read.csv('embedding_dataset.csv',header=True,inferSchema=True)
