In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | done
[?25h  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425350 sha256=af01d452bc9f299e22ce3f41a111c179503e9cd4b6ca28065ab78a143eb9da2a
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Build (or reuse, if one already exists) a Spark session that runs locally
# with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("testing-NLP")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/05 22:43:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Keep a handle on the SparkContext that belongs to the `spark` session.
sc = spark.sparkContext
# Bare expression so the notebook displays the context as the cell output.
sc

In [5]:
# Sample data: three rows of (features, features2, label).
# `features` is the constant 4-element array [k, k, k, k] for k = 1, 2, 3;
# `features2` is always four ones and `label` is always 1.0.
data = [([float(k)] * 4, [1.0] * 4, 1.0) for k in (1, 2, 3)]

# Schema for the DataFrame: two nullable arrays of doubles plus a nullable
# double label.
schema = (
    StructType()
    .add("features", ArrayType(DoubleType()), True)
    .add("features2", ArrayType(DoubleType()), True)
    .add("label", DoubleType(), True)
)

# Materialise the rows as a DataFrame with the explicit schema and print it.
df = spark.createDataFrame(data, schema=schema)
df.show()

                                                                                

+--------------------+--------------------+-----+
|            features|           features2|label|
+--------------------+--------------------+-----+
|[1.0, 1.0, 1.0, 1.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|
|[2.0, 2.0, 2.0, 2.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|
|[3.0, 3.0, 3.0, 3.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|
+--------------------+--------------------+-----+



In [6]:
# Element-wise product of the two arrays: the two-argument lambda of
# transform() receives each element x of `features` together with its index i,
# and multiplies it by the element of `features2` at the same index.
elementwise_product = F.expr("transform(features, (x, i) -> x * features2[i])")
df_dot_product = df.withColumn("dot_product", elementwise_product)
df_dot_product.show()

+--------------------+--------------------+-----+--------------------+
|            features|           features2|label|         dot_product|
+--------------------+--------------------+-----+--------------------+
|[1.0, 1.0, 1.0, 1.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[1.0, 1.0, 1.0, 1.0]|
|[2.0, 2.0, 2.0, 2.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[2.0, 2.0, 2.0, 2.0]|
|[3.0, 3.0, 3.0, 3.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[3.0, 3.0, 3.0, 3.0]|
+--------------------+--------------------+-----+--------------------+



In [7]:
# Reduce the element-wise products to one number per row: aggregate() folds the
# `dot_product` array, starting from the double literal 0D and adding each
# element to the accumulator. The result is stored in a new column.
dot_product_total = F.expr('aggregate(dot_product, 0D, (acc, x) -> acc + x)')
df_dot_product = df_dot_product.withColumn("dot_product_sum", dot_product_total)
df_dot_product.show()

+--------------------+--------------------+-----+--------------------+---------------+
|            features|           features2|label|         dot_product|dot_product_sum|
+--------------------+--------------------+-----+--------------------+---------------+
|[1.0, 1.0, 1.0, 1.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[1.0, 1.0, 1.0, 1.0]|            4.0|
|[2.0, 2.0, 2.0, 2.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[2.0, 2.0, 2.0, 2.0]|            8.0|
|[3.0, 3.0, 3.0, 3.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[3.0, 3.0, 3.0, 3.0]|           12.0|
+--------------------+--------------------+-----+--------------------+---------------+



In [8]:
# Square every element of each array. These are the per-element terms of a
# squared Euclidean norm, not the magnitude itself: `mag_list` holds the
# squares of `features`, `mag_list2` the squares of `features2`.
df_dot_product = (
    df_dot_product
    .withColumn("mag_list", F.expr("transform(features, x -> x * x)"))
    .withColumn("mag_list2", F.expr("transform(features2, x -> x * x)"))
)
df_dot_product.show()

+--------------------+--------------------+-----+--------------------+---------------+--------------------+--------------------+
|            features|           features2|label|         dot_product|dot_product_sum|            mag_list|           mag_list2|
+--------------------+--------------------+-----+--------------------+---------------+--------------------+--------------------+
|[1.0, 1.0, 1.0, 1.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[1.0, 1.0, 1.0, 1.0]|            4.0|[1.0, 1.0, 1.0, 1.0]|[1.0, 1.0, 1.0, 1.0]|
|[2.0, 2.0, 2.0, 2.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[2.0, 2.0, 2.0, 2.0]|            8.0|[4.0, 4.0, 4.0, 4.0]|[1.0, 1.0, 1.0, 1.0]|
|[3.0, 3.0, 3.0, 3.0]|[1.0, 1.0, 1.0, 1.0]|  1.0|[3.0, 3.0, 3.0, 3.0]|           12.0|[9.0, 9.0, 9.0, 9.0]|[1.0, 1.0, 1.0, 1.0]|
+--------------------+--------------------+-----+--------------------+---------------+--------------------+--------------------+



In [9]:
# Drop the 'label' column, which none of the expressions in this cell read.
# A single drop is enough: DataFrame.drop is a no-op for a column that is not
# present, so repeating the call adds nothing.
#
# Then compute the Euclidean (L2) norm of each array: aggregate() sums the
# squared elements held in `mag_list` / `mag_list2` (starting from the double
# literal 0D) and sqrt() takes the square root of that sum.
df_dot_product = (
    df_dot_product
    .drop('label')
    .withColumn(
        "mag_list_sum",
        F.sqrt(F.expr('aggregate(mag_list, 0D, (acc, x) -> acc + x)')),
    )
    .withColumn(
        "mag_list_sum2",
        F.sqrt(F.expr('aggregate(mag_list2, 0D, (acc, x) -> acc + x)')),
    )
)
df_dot_product.show()

+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+------------+-------------+
|            features|           features2|         dot_product|dot_product_sum|            mag_list|           mag_list2|mag_list_sum|mag_list_sum2|
+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+------------+-------------+
|[1.0, 1.0, 1.0, 1.0]|[1.0, 1.0, 1.0, 1.0]|[1.0, 1.0, 1.0, 1.0]|            4.0|[1.0, 1.0, 1.0, 1.0]|[1.0, 1.0, 1.0, 1.0]|         2.0|          2.0|
|[2.0, 2.0, 2.0, 2.0]|[1.0, 1.0, 1.0, 1.0]|[2.0, 2.0, 2.0, 2.0]|            8.0|[4.0, 4.0, 4.0, 4.0]|[1.0, 1.0, 1.0, 1.0]|         4.0|          2.0|
|[3.0, 3.0, 3.0, 3.0]|[1.0, 1.0, 1.0, 1.0]|[3.0, 3.0, 3.0, 3.0]|           12.0|[9.0, 9.0, 9.0, 9.0]|[1.0, 1.0, 1.0, 1.0]|         6.0|          2.0|
+--------------------+--------------------+--------------------+---------------+--------------------

In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file
# found; sub-directory names yielded by os.walk are not needed and are ignored.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session