<a href="https://colab.research.google.com/github/AshkanSamavatian/AMD-Final-Project/blob/main/Algorithms_for_Massive_Data_Final_Project_(Ashkan_Samavatian)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Importing Necessary Libraries and Downloading the Data from Kaggle**

In [3]:
#Importing necessary libraries for the project
import os
import sys
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
#Importing Dataset from Kaggle website with my kaggle_username and kaggle_key
os.environ['KAGGLE_USERNAME'] = "ashkansam"      #For presenteing the project, I substituted my kaggle username with "*****"
os.environ['KAGGLE_KEY'] = "e7d90846728fb2b990e1cea1412d357e"          #For presenteing the project, I substituted my kaggle key with "*****"
!kaggle datasets download -d xhlulu/medal-emnlp

Downloading medal-emnlp.zip to /content
100% 6.82G/6.82G [03:42<00:00, 36.2MB/s]
100% 6.82G/6.82G [03:43<00:00, 32.8MB/s]


In [4]:
#Unzipping only the "full_data.csv" file
!unzip medal-emnlp.zip full_data.csv

Archive:  medal-emnlp.zip
  inflating: full_data.csv           


### **Extracting a subset for the project**

In [5]:
#Reading and storing the dataset (trimmed down version)
#Each data row is kept with probability sample_fraction; the header (row 0) is always kept.
#A seeded generator is created inside the cell so that every run draws the same rows.
sample_fraction = 0.1
sample_seed = 42
_sampling_rng = np.random.default_rng(sample_seed)
MeDAL_df = pd.read_csv(
    "full_data.csv",
    skiprows=lambda i: i > 0 and _sampling_rng.random() > sample_fraction,
)

In [6]:
#Extracting a random subset of 10,000 rows and resetting its index
#random_state pins the draw so the subset (and every result derived from it) is reproducible
MeDAL_subset_df = MeDAL_df.sample(n=10000, random_state=42).reset_index(drop=True)

In [7]:
#Monitoring the subset: previewing its first rows (TEXT, LOCATION and LABEL columns)
MeDAL_subset_df.head()

Unnamed: 0,TEXT,LOCATION,LABEL
0,stable isotope probing sip is a method used fo...,21|28|40|80,substrate|after|substrate|substrate
1,a genomic library of mycobacterium smegmatis d...,54,physical
2,genetic responsibility has emerged as a key no...,17,identification
3,honey bee apis mellifera workers are character...,10,behavior
4,in the years since it was first described CP r...,8|43|51|68,cardiopulmonary|history|sudden cardiac arrest|...


In [8]:
#Overviewing the subset: row count, column names, non-null counts and dtypes
MeDAL_subset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   TEXT      10000 non-null  object
 1   LOCATION  10000 non-null  object
 2   LABEL     10000 non-null  object
dtypes: object(3)
memory usage: 234.5+ KB


In [9]:
#Writing the subset to 'MeDAL_subset_df.csv' (without the index column) so PySpark can load it later
MeDAL_subset_df.to_csv(path_or_buf='MeDAL_subset_df.csv', index=False)

### **PySpark Setup**

In [10]:
#Setup Java, Downloading Spark, Extracting its files and Installing FindSpark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [11]:
#Setting the Environment Paths in one update:
#Java home, JVM options, Spark home, and the Python interpreter used by PySpark workers and driver
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "JAVA_OPTS": "-Xms512m -Xmx4g",
    "SPARK_HOME": "/content/spark-3.2.0-bin-hadoop3.2",
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

In [12]:
#Initializing FindSpark
#init() is called without arguments; presumably it locates Spark through the SPARK_HOME variable set in the previous cell — confirm
import findspark
findspark.init()

In [13]:
#Importing necessary libraries for PySpark
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import lower, regexp_replace, concat_ws, udf, col, size, sum as sql_sum, abs, row_number, monotonically_increasing_id
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, MinHashLSH, CountVectorizer
from pyspark.ml.linalg import SparseVector

In [14]:
#Starting a PySpark Session: local master using all cores, 10g for driver and executor,
#and a default parallelism of 100
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('AMDProject')
    .config("spark.driver.memory", "10g")
    .config("spark.executor.memory", "10g")
    .config("spark.default.parallelism", "100")
    .getOrCreate()
)

### **The Main Process on the subset**

In [15]:
#Loading the subset on PySpark
#The file was written by pandas, which escapes a quote inside a quoted field by doubling it ("")
#and may keep line breaks inside quoted fields. Spark's default escape character is a backslash,
#so quote/escape/multiLine are set explicitly to parse every record as pandas wrote it.
df = spark.read.csv(
    "MeDAL_subset_df.csv",
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
    multiLine=True,
)

In [16]:
#"TEXT" column preprocessing
#Dropping any previously derived columns first (a no-op on a freshly loaded frame) lets this cell
#be re-run without the transformers failing because their output columns already exist.
df = df.drop("tokenized_TEXT", "filtered_TEXT", "filtered_TEXT_str")
df = df.withColumn("TEXT", lower(col("TEXT")))  #Convert to lowercase
#Raw string: "\s" must reach the regex engine unchanged; in a normal string it is an invalid escape sequence
df = df.withColumn("TEXT", regexp_replace(col("TEXT"), r'[^a-zA-Z\s]', ' ')) #Replace every character that is not a letter or whitespace with a space

#Tokenizing the "TEXT" column
tokenizer = Tokenizer(inputCol="TEXT", outputCol="tokenized_TEXT")
df = tokenizer.transform(df)

#Removing the stop words
remover = StopWordsRemover(inputCol="tokenized_TEXT", outputCol="filtered_TEXT")
df = remover.transform(df)

#Concatenating the remaining words back together into one space-separated string
df = df.withColumn("filtered_TEXT_str", concat_ws(" ", col("filtered_TEXT")))

In [17]:
#Monitoring the subset on PySpark after preprocessing: original columns plus
#tokenized_TEXT, filtered_TEXT and filtered_TEXT_str
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                TEXT|            LOCATION|               LABEL|      tokenized_TEXT|       filtered_TEXT|   filtered_TEXT_str|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|stable isotope pr...|         21|28|40|80|substrate|after|s...|[stable, isotope,...|[stable, isotope,...|stable isotope pr...|
|a genomic library...|                  54|            physical|[a, genomic, libr...|[genomic, library...|genomic library m...|
|genetic responsib...|                  17|      identification|[genetic, respons...|[genetic, respons...|genetic responsib...|
|honey bee apis me...|                  10|            behavior|[honey, bee, apis...|[honey, bee, apis...|honey bee apis me...|
|in the years sinc...|          8|43|51|68|cardiopulmonary|h...|[in, the, years, ...|[years, since, fi..

In [18]:
#Overviewing the schema (column names, types and nullability) of the preprocessed PySpark data frame
df.printSchema()

root
 |-- TEXT: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- LABEL: string (nullable = true)
 |-- tokenized_TEXT: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_TEXT: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_TEXT_str: string (nullable = false)



In [19]:
#Checking again the number of rows in the subset (should still match the 10,000 sampled rows)
df.count()

10000

In [20]:
#Applying Shingling process on the subset

k = 10

def shingle_document(string, size=None):
    """Return every contiguous character shingle of a document, in order.

    Parameters
    ----------
    string : str or None
        The document text. None (a null value) yields an empty list.
    size : int, optional
        Shingle length in characters. When omitted, the notebook-level
        constant ``k`` is used, as before.

    Returns
    -------
    list of str
        The ``len(string) - size + 1`` overlapping substrings of length
        ``size`` (duplicates kept); empty when the text is shorter than ``size``.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size is None:
        size = k  # looked up at call time, so changing k still takes effect
    if size < 1:
        raise ValueError("shingle size must be at least 1")
    if string is None:
        return []
    return [string[i:i + size] for i in range(len(string) - size + 1)]

#Wrapping shingle_document as a Spark UDF that returns an array of strings
shingle_udf = udf(f=shingle_document, returnType=ArrayType(StringType()))

#Adding the "shingles" column computed from the cleaned text
df_shingled = df.withColumn("shingles", shingle_udf(col("filtered_TEXT_str")))

In [21]:
#Converting shingles to vectors for minhash
#The vocabulary is learned from the "shingles" column and each row gets a "features" vector
#NOTE(review): CountVectorizer is built with its default parameters; confirm the default vocabulary
#size is large enough for the number of distinct shingles, otherwise some shingles are presumably dropped
cv = CountVectorizer(inputCol="shingles", outputCol="features")
model = cv.fit(df_shingled)
df_vectorized = model.transform(df_shingled)

#Filtering out null vectors
df_vectorized = df_vectorized.filter(col('features').isNotNull())

#Filtering out zero vectors
def is_nonzero(v):
    """Tell whether vector ``v`` has at least one non-zero entry.

    A SparseVector is checked through its count of non-zero entries; any
    other vector is scanned element by element.
    """
    if not isinstance(v, SparseVector):
        # Dense case: look for the first component different from zero
        return any(component != 0 for component in v)
    return v.numNonzeros() > 0

#Apply the filtering using RDD and then convert back to data frame
#The existing schema is passed to toDF so that nothing has to be inferred from the rows again
#(inference would fail on an empty RDD, i.e. if every vector turned out to be zero)
df_vectorized = df_vectorized.rdd.filter(lambda row: is_nonzero(row['features'])).toDF(df_vectorized.schema)


#Creating a window specification without any partitioning, and order by the original columns
#NOTE(review): with no partitionBy the whole data frame is presumably processed in a single
#partition — acceptable for 10,000 rows, confirm before scaling up
window_spec = Window.orderBy(df_vectorized.columns)

#Adding a unique ID to the vectorized data frame
#1 is subtracted from row_number() so that the ids start at 0
df_vectorized = df_vectorized.withColumn("id", row_number().over(window_spec) - 1)

#Applying MinHashLSH
#Fitting a MinHashLSH model with 5 hash tables on the "features" vectors; its output column is "hashes"
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model_lsh = mh.fit(df_vectorized)

#Querying for Similar Items: self-join of the vectorized subset, collected to the driver as (id, id) rows
#The id filter (A.id < B.id) keeps each unordered pair once and removes self-matches
#NOTE(review): approxSimilarityJoin presumably interprets the threshold as a maximum Jaccard *distance*,
#so 0.8 would keep pairs with similarity above 0.2 — confirm this is the intended cut-off
threshold = 0.8
results = model_lsh.approxSimilarityJoin(df_vectorized, df_vectorized, threshold)\
                   .filter(col("datasetA.id") < col("datasetB.id"))\
                   .select("datasetA.id", "datasetB.id").collect()


In [22]:
#Checking the number of similar pairs returned by the similarity join
len(results)

10

In [23]:
#Printing the similar pairs as "(id_a, id_b)", one pair per line
for id_a, id_b in results:
    print(f"({id_a}, {id_b})")

(8699, 8700)
(4109, 8833)
(3439, 6250)
(4891, 7678)
(8827, 9055)
(4004, 4891)
(4004, 7678)
(6299, 9056)
(6293, 8895)
(1935, 8911)
