In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/content/java-8"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp

openjdk version "11.0.11" 2021-04-20
OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.18.04)
OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.18.04, mixed mode, sharing)
Processing /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471/pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


Collecting spark-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/6c/35/3d06b93fefdeab0f6f544b1fc48e5e49c049697c38611ef870383031380b/spark_nlp-3.0.3-py2.py3-none-any.whl (43kB)
[K     |███████▌                        | 10kB 17.9MB/s eta 0:00:01[K     |███████████████                 | 20kB 15.8MB/s eta 0:00:01[K     |██████████████████████▋         | 30kB 9.6MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 8.1MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.2MB/s 
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-3.0.3


In [None]:

from pyspark.sql.types import StructType,StructField,DoubleType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [9]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext()
sqlContext = SQLContext(sc)

In [17]:
Review_df =sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('Reviews.csv')
Review_df.show(5)

+---+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
| Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|
+---+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
|  1|B001E4KFG0|A3SGXH7AUHU8GW|          delmartian|                   1|                     1|    5|1303862400|Good Quality Dog ...|I have bought sev...|
|  2|B00813GRG4|A1D87F6ZCVE5NK|              dll pa|                   0|                     0|    1|1346976000|   Not as Advertised|"Product arrived ...|
|  3|B000LQOCH0| ABXLMWJIXXAIN|"Natalia Corres "...|                   1|                     1|    4|1219017600|"""Delight"" says...|"This is a confec...|
|  4|B000UA0QIQ|A395BORC6FGVXV|                Karl|            

In [18]:
Review_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessNumerator: string (nullable = true)
 |-- HelpfulnessDenominator: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



In [19]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

In [20]:
def partition(x):
   return 1 if str(x)>str(3) else 0
my_udf = udf(partition, IntegerType())  

In [21]:
Review_df = Review_df.withColumn('Score', my_udf('Score'))
Review_df.show(5)

+---+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
| Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|
+---+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
|  1|B001E4KFG0|A3SGXH7AUHU8GW|          delmartian|                   1|                     1|    1|1303862400|Good Quality Dog ...|I have bought sev...|
|  2|B00813GRG4|A1D87F6ZCVE5NK|              dll pa|                   0|                     0|    0|1346976000|   Not as Advertised|"Product arrived ...|
|  3|B000LQOCH0| ABXLMWJIXXAIN|"Natalia Corres "...|                   1|                     1|    1|1219017600|"""Delight"" says...|"This is a confec...|
|  4|B000UA0QIQ|A395BORC6FGVXV|                Karl|            

In [22]:
if Review_df.count() > Review_df.dropDuplicates(['UserId','ProfileName','Time','Text']).count():
   print ('Data has duplicates')
print ("Row count Now:",Review_df.count())
Review_df = Review_df.dropDuplicates(['UserId','ProfileName','Time','Text'])
print ("After Removing the duplicates, row count becomes:")
Review_df.count()

Data has duplicates
Row count Now: 568454
After Removing the duplicates, row count becomes:


393559

In [23]:
Review_df.groupBy('Score').count().show()

+-----+------+
|Score| count|
+-----+------+
|    1|305599|
|    0| 87960|
+-----+------+



In [24]:
import gensim.parsing.preprocessing as gsp
from pyspark.sql.types import StringType
from gensim import utils


In [25]:
filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

In [27]:
def clean_text(x):
    s = x[9]
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return (x[6],s)

In [28]:
input_rdd = Review_df.rdd.map(lambda x : clean_text(x))

In [29]:
input_df = input_rdd.toDF(['Score','Text'])
input_df.show(10)

+-----+--------------------+
|Score|                Text|
+-----+--------------------+
|    1|water water right...|
|    0|grow visit grandp...|
|    0|thought get good ...|
|    1|order product rea...|
|    1|long time fan cry...|
|    1|product past have...|
|    1|delic candi love ...|
|    1|faint heart best ...|
|    1|real fan bamboo s...|
|    1|gave differ flavo...|
+-----+--------------------+
only showing top 10 rows



In [30]:
input_df = input_df.dropna()
train_df, test_df = input_df.randomSplit([0.8, 0.2])

In [35]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline

In [None]:
tokenizer = Tokenizer(inputCol="Text",outputCol="words")
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="words", outputCol="features")
doc2vec_pipeline = Pipeline(stages=[tokenizer,w2v])
doc2vec_model = doc2vec_pipeline.fit(train_df)
train_df = doc2vec_model.transform(train_df)
test_df = doc2vec_model.transform(test_df)
print ("few rows from train df")
train_df.show(3)

In [None]:
def RandomForestCV(train_df,test_df):
    rf = RandomForestClassifier(labelCol="Score", featuresCol="features")
    pipeline = Pipeline(stages=[rf])
    paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [5, 10]).addGrid(rf.maxBins, [25, 31]) .addGrid(rf.minInfoGain, [0.01, 0.001])
    .addGrid(rf.numTrees, [20, 60]) .addGrid(rf.impurity, ["gini", "entropy"]) .build()
    evaluator = BinaryClassificationEvaluator(labelCol="Score")
    crossValidator = CrossValidator(estimator=pipeline,evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=10)
    cv = crossValidator.fit(train_df)
    best_model = cv.bestModel.stages[0]
    prediction = best_model.transform(test_df)
    metric = evaluator.evaluate(prediction)
    print ("The metric of test's accuracy= %g" % metric)    
RandomForestCV(train_df,test_df)

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score

train_features = train_df.select("features").collect()
train_lables = train_df.select("Score").collect()
test_features = test_df.select("features").collect()
test_labels = test_df.select("Score").collect()


X_train = np.asarray([v[0].toArray() for v in train_features])
Y_train = np.asarray([v[0] for v in train_lables])
X_test =  np.asarray([v[0].toArray() for v in test_features])
Y_test = np.asarray([v[0] for v in test_labels])

xgbClassifier = xgb.XGBClassifier(max_depth=10, seed=18238, objective='multi:softmax',num_class = 2)
model = xgbClassifier.fit(X_train, Y_train)
pred = model.predict(X_test)

auc_score = accuracy_score(Y_test,pred)
print ("The accuracy score for XGboost model : ",auc_score)
