### Analyse search terms on the e-commerce web server


##### In this assignment you will download the search term data set for the e-commerce web server and run analytic queries on it.


In [None]:
# Install spark

In [None]:
!pip install pyspark
!pip install findspark

In [None]:
import findspark
# Locate the Spark installation and make pyspark importable;
# this has to run before the pyspark imports in the next cell.
findspark.init()

In [None]:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
# Start session

In [None]:
# Reuse the running SparkContext if there is one. A bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" when this cell is executed twice.
sc = SparkContext.getOrCreate()

# Create (or fetch) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Echo the session object to confirm it was created
spark

In [None]:
# Download the search term dataset from the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

In [None]:
# !wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

In [None]:
# Load the csv into a spark dataframe

In [None]:
# Read the search-term CSV straight from its URL into a pandas DataFrame
SEARCH_TERMS_URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv"
search_df = pd.read_csv(SEARCH_TERMS_URL)

In [None]:
# Preview the first rows of the pandas frame
search_df.head()

In [None]:
# Convert the pandas frame into a Spark DataFrame (no explicit schema is passed,
# so Spark infers the column types from the data)
spark_df = spark.createDataFrame(search_df)

In [None]:
# Print the number of rows and columns
# Take a screenshot of the code and name it shape.jpg

In [None]:
# Shape of the Spark DataFrame: row count needs a Spark job, column count is metadata
rows, col = spark_df.count(), len(spark_df.columns)

print(f'Number of rows: {rows}')
print(f'Number of columns: {col}')

In [None]:
# Print the top 5 rows
# Take a screenshot of the code and name it top5rows.jpg

In [None]:
# Display the first 5 rows of the Spark DataFrame
spark_df.show(5)

In [None]:
# Find out the datatype of the column searchterm
# Take a screenshot of the code and name it datatype.jpg

In [None]:
# Print every column of the DataFrame together with its type
spark_df.printSchema()

In [None]:
# Look up only the searchterm field in the schema and show its Spark data type
spark_df.schema["searchterm"].dataType

In [None]:
# How many times was the term `gaming laptop` searched?
# Take a screenshot of the code and name it gaminglaptop.jpg

In [None]:
# Occurrences of every distinct search term, then only the row for 'gaming laptop'
count_df = spark_df.groupBy('searchterm').count()
count_df.where(count_df.searchterm == 'gaming laptop').show()

In [None]:
# Print the top 5 most frequently used search terms
# Take a screenshot of the code and name it top5terms.jpg

In [None]:
# Count each search term, order by that count (largest first) and show the top 5
(
    spark_df
    .groupBy('searchterm')
    .count()
    .orderBy('count', ascending=False)
    .show(5)
)

In [None]:
# The pretrained sales forecasting model is available at the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.gzip

In [None]:
# You need LinearRegressionModel to load the model
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler


In [None]:
import gzip

In [None]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.gzip

In [None]:
# Unpack the download with tar (x = extract, v = list files, z = gunzip, f = archive file);
# presumably this yields the sales_prediction.model path loaded further down — confirm
!tar -xvzf model.gzip

In [None]:
# Load the sales forecast model.
# Take a screenshot of the code and name it loadmodel.jpg

In [None]:
# Load the saved linear regression model from the sales_prediction.model path
# (presumably created by extracting model.gzip — confirm)
model = LinearRegressionModel.load('sales_prediction.model')

In [None]:
# Using the sales forecast model, predict the sales for the year of 2023.
# Take a screenshot of the code and name it as forecast.jpg

In [None]:
def predict(weight):
    """Print the forecast the loaded model produces for a single input value.

    Despite its name, ``weight`` is simply the one numeric feature handed to
    the model; this notebook calls the function with a year, e.g.
    ``predict(2023)``.

    Parameters
    ----------
    weight : int or float
        Feature value to forecast for.

    Returns
    -------
    None
        The ``prediction`` column is printed via ``DataFrame.show()``.
    """
    # One-row, one-column frame holding the requested feature value.
    input_df = spark.createDataFrame([(weight,)], ["weight"])

    # Pack the value into the vector column handed to the model
    # (assumes the saved model reads from a column named "features" — confirm).
    assembler = VectorAssembler(inputCols=["weight"], outputCol="features")
    features_df = assembler.transform(input_df).select("features")

    predictions = model.transform(features_df)
    predictions.select("prediction").show()

In [None]:
# Forecast for the year 2023; predict() prints the resulting prediction
predict(2023)