#### You can refer to the [commands](./commands.md) file for the list of commands utilized in this demo.

In [1]:
import pandas as pd
import os
from glob import glob

### Check your data folder to ensure availability of files to process

In [2]:
# Show which sample files are available as input for the pipeline.
sample_data_dir = f"/home/{os.environ.get('USER')}/setu/examples/sample_data"
os.listdir(sample_data_dir)

['sample_english_pdf.parquet', 'sample_english_crawl.json']

### Run the JSON2Parquet Stage to convert your data jsons into parquets

In [9]:
# Submit setu/run.py to Spark with the JSON2ParquetStage subcommand.
# Input: JSON files matching examples/sample_data/*.json; output path: examples/output/j2p_output.
# The JVM temp dir and Spark worker/local dirs are all pointed at /home/$USER/tmp/.
# NOTE(review): --master is hard-coded to one machine's hostname (YDEARYZEN) — change it for your own Spark master.
!SETU_DIR=/home/$USER/setu SETU_TMP_DIR=/home/$USER/tmp/ FILTER_DATA_ROOT=/home/$USER/setu/setu/data \
    spark-submit \
    --master spark://YDEARYZEN.:7077 \
    --deploy-mode client \
    --driver-java-options -Djava.io.tmpdir=/home/$USER/tmp/ \
    --conf "spark.driver.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf "spark.executor.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf spark.worker.dir="/home/$USER/tmp/" \
    --conf spark.local.dir="/home/$USER/tmp/" \
    --num-executors 4 \
    --executor-cores 2 \
    --executor-memory 3G \
    --driver-memory 6G \
    --archives "/home/$USER/setu/dataproc/envs/setu.zip" \
    --conf 'spark.executorEnv.PYTHONPATH=setu.zip' \
    --conf 'spark.executorEnv.FILTER_DATA_ROOT=setu.zip/data' \
    /home/$USER/setu/setu/run.py \
    --config /home/$USER/setu/configs/crawls/spark_english_config.json \
    --mode crawl \
    --run_local True \
    JSON2ParquetStage \
    --json_glob_path "/home/$USER/setu/examples/sample_data/*.json" \
    --language english \
    --j2p_samples_per_partition 1500 \
    --j2p_verbose False \
    --j2p_run_mode data \
    --j2p_parquet_output_path /home/$USER/setu/examples/output/j2p_output

24/03/07 12:33:56 WARN Utils: Your hostname, YDEARYZEN resolves to a loopback address: 127.0.1.1; using 172.31.108.238 instead (on interface eth0)
24/03/07 12:33:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/07 12:33:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Command used to run this script:  /home/shanks/setu/setu/run.py --config /home/shanks/setu/configs/crawls/spark_english_config.json --mode crawl --run_local True JSON2ParquetStage --json_glob_path /home/shanks/setu/examples/sample_data/*.json --language english --j2p_samples_per_partition 1500 --j2p_verbose False --j2p_run_mode data --j2p_parquet_output_path /home/shanks/setu/examples/output/j2p_output
shanks
------------------------------------------------ Setting Environment Variables --------------------------------------------------
------------------------------------------------ END --------------------------

### Check the JSON2Parquet Stage Output

In [4]:
json_output_path = f"/home/{os.environ.get('USER')}/setu/examples/output/j2p_output"

In [5]:
parquets = glob(f"{json_output_path}/*.parquet")

In [6]:
df = pd.read_parquet(parquets[0])

In [7]:
df.head()

Unnamed: 0,doc_id,url,source,language,text,timestamp
0,8bc919b9-c288-42fd-b503-d0f366e06a14,https://www.91mobiles.com/nokia-t10-lte-price-...,91mobiles,english,"<html lang=""en"">\n<head>\n \n <m...",10/07/23 10:54


### Run the Text Extraction Stage to extract text from the previous stage output

In [10]:
# Submit setu/run.py to Spark with the ExtractTextStage subcommand.
# Input: parquet files matching examples/output/j2p_output/*.parquet; output path: examples/output/te_output.
# NOTE(review): --master is hard-coded to one machine's hostname (YDEARYZEN) — change it for your own Spark master.
!SETU_DIR=/home/$USER/setu SETU_TMP_DIR=/home/$USER/tmp/ FILTER_DATA_ROOT=/home/$USER/setu/setu/data \
    spark-submit \
    --master spark://YDEARYZEN.:7077 \
    --deploy-mode client \
    --driver-java-options -Djava.io.tmpdir=/home/$USER/tmp/ \
    --conf "spark.driver.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf "spark.executor.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf spark.worker.dir="/home/$USER/tmp/" \
    --conf spark.local.dir="/home/$USER/tmp/" \
    --num-executors 4 \
    --executor-cores 2 \
    --executor-memory 3G \
    --driver-memory 6G \
    --archives "/home/$USER/setu/dataproc/envs/setu.zip" \
    --conf 'spark.executorEnv.PYTHONPATH=setu.zip' \
    --conf 'spark.executorEnv.FILTER_DATA_ROOT=setu.zip/data' \
    /home/$USER/setu/setu/run.py \
    --config /home/$USER/setu/configs/crawls/spark_english_config.json \
    --mode crawl \
    --run_local True \
    ExtractTextStage \
    --te_parquets_path "/home/$USER/setu/examples/output/j2p_output/*.parquet" \
    --te_samples_per_partition 1500 \
    --te_run_mode data \
    --te_output_path "/home/$USER/setu/examples/output/te_output"


24/03/07 12:36:02 WARN Utils: Your hostname, YDEARYZEN resolves to a loopback address: 127.0.1.1; using 172.31.108.238 instead (on interface eth0)
24/03/07 12:36:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/07 12:36:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Command used to run this script:  /home/shanks/setu/setu/run.py --config /home/shanks/setu/configs/crawls/spark_english_config.json --mode crawl --run_local True ExtractTextStage --te_parquets_path /home/shanks/setu/examples/output/j2p_output/*.parquet --te_samples_per_partition 1500 --te_run_mode data --te_output_path /home/shanks/setu/examples/output/te_output
shanks
------------------------------------------------ Setting Environment Variables --------------------------------------------------
------------------------------------------------ END --------------------------------------------------
----------------

### Check the TextExtract Stage Output

In [11]:
te_output_path = f"/home/{os.environ.get('USER')}/setu/examples/output/te_output"

In [12]:
parquets = glob(f"{te_output_path}/*.parquet")

In [13]:
df = pd.read_parquet(parquets[0])

In [14]:
df.head()

Unnamed: 0,doc_id,url,source,timestamp,language,successful_extraction,title,description,text,comments,...,categories,tags,fingerprint,id,license,body,commentsbody,raw_text,image,pagetype
0,8bc919b9-c288-42fd-b503-d0f366e06a14,https://www.91mobiles.com/nokia-t10-lte-price-...,91mobiles,10/07/23 10:54,english,True,Nokia T10 LTE,,"1. Trusted Brand\n2. Originally from Finland, ...",,...,[],[],,,,,,,,


### Run DocCleanStage on the Text Extraction Stage output

In [15]:
# Submit setu/run.py to Spark with the DocCleanStage subcommand.
# Input: parquet files matching examples/output/te_output/*.parquet.
# Output paths: examples/output/cleaned_docs/ (cleaned docs) and examples/output/symbol_filter/
# (presumably the symbol-heavy documents, since --save_symbol_heavy_docs is True — confirm against run.py).
# NOTE(review): --master is hard-coded to one machine's hostname (YDEARYZEN) — change it for your own Spark master.
!SETU_DIR=/home/$USER/setu SETU_TMP_DIR=/home/$USER/tmp/ FILTER_DATA_ROOT=/home/$USER/setu/setu/data \
    spark-submit \
    --master spark://YDEARYZEN.:7077 \
    --deploy-mode client \
    --driver-java-options -Djava.io.tmpdir=/home/$USER/tmp/ \
    --conf "spark.driver.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf "spark.executor.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf spark.worker.dir="/home/$USER/tmp/" \
    --conf spark.local.dir="/home/$USER/tmp/" \
    --num-executors 4 \
    --executor-cores 2 \
    --executor-memory 3G \
    --driver-memory 6G \
    --archives "/home/$USER/setu/dataproc/envs/setu.zip" \
    --conf 'spark.executorEnv.PYTHONPATH=setu.zip' \
    --conf 'spark.executorEnv.FILTER_DATA_ROOT=setu.zip/data' \
    /home/$USER/setu/setu/run.py \
    --config /home/$USER/setu/configs/crawls/spark_english_config.json \
    --mode crawl \
    --run_local True \
    DocCleanStage \
    --doc_df_parquets_path "/home/$USER/setu/examples/output/te_output/*.parquet" \
    --is_doc_df_path_batched False \
    --doc_clean_additional_cols_to_use "url,source,language" \
    --use_symbol_filter True \
    --doc_clean_samples_per_partition 1500 \
    --doc_clean_verbose False \
    --doc_clean_run_mode data \
    --save_symbol_heavy_docs True \
    --symbol_filter_output_path "/home/$USER/setu/examples/output/symbol_filter/" \
    --cleaned_doc_output_path "/home/$USER/setu/examples/output/cleaned_docs/"

24/03/07 12:38:50 WARN Utils: Your hostname, YDEARYZEN resolves to a loopback address: 127.0.1.1; using 172.31.108.238 instead (on interface eth0)
24/03/07 12:38:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/07 12:38:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Command used to run this script:  /home/shanks/setu/setu/run.py --config /home/shanks/setu/configs/crawls/spark_english_config.json --mode crawl --run_local True DocCleanStage --doc_df_parquets_path /home/shanks/setu/examples/output/te_output/*.parquet --is_doc_df_path_batched False --doc_clean_additional_cols_to_use url,source,language --use_symbol_filter True --doc_clean_samples_per_partition 1500 --doc_clean_verbose False --doc_clean_run_mode data --save_symbol_heavy_docs True --symbol_filter_output_path /home/shanks/setu/examples/output/symbol_filter/ --cleaned_doc_output_path /home/shanks/setu/examples/output/

### Check DocClean Stage Output

In [16]:
dc_output_path = f"/home/{os.environ.get('USER')}/setu/examples/output/cleaned_docs"

In [17]:
parquets = glob(f"{dc_output_path}/*.parquet")

In [18]:
df = pd.read_parquet(parquets[0])

In [19]:
df.head()

Unnamed: 0,doc_id,text,url,source,language,code_spans,uncleaned_chars_count,uncleaned_words_count,uncleaned_bytes,symbol_ratio,invalid_char_count,uncleaned_text
0,8bc919b9-c288-42fd-b503-d0f366e06a14,"2. Originally from Finland, Nokia ruled the mo...",https://www.91mobiles.com/nokia-t10-lte-price-...,91mobiles,english,,3181,366,3181,0.094624,301,"1. Trusted Brand\n2. Originally from Finland, ..."


In [22]:
# Print the cleaned text of the first document in full; the table preview truncates it.
first_doc = df.iloc[0]
print(first_doc["text"])

2. Originally from Finland, Nokia ruled the mobile industry for almost a decade.Now owned by and sold under Microsoft, it is usually known for devices that run on Windows, and is popular for its Lumia series.
Nokia T10 LTE price in India starts from Rs. 11,999. The lowest price of Nokia T10 LTE is Rs. 11,999 at flipkart.com. This is 3 GB RAM / 32 GB internal storage variant of Nokia T10 which is available in Ocean Blue colour.
|October 15, 2022 (Official)
|SIM Slot(s)
5G: Not Supported in India,
4G: Available Supported in India,
|8.0 inches (20.32 cm)
|Octa core (1.6 GHz, Dual core, Cortex A75 + 1.6 GHz, Hexa Core, Cortex A55)
4G Bands:
TD-LTE 2600(band 38) / 2300(band 40) / 2500(band 41)
FD-LTE 2100(band 1) / 1800(band 3) / 2600(band 7) / 900(band 8) / 700(band 28) / 850(band 5) / 800(band 20)
3G Bands:
2G Bands:
GPRS:
EDGE:
|Nokia T10 3 GB RAM 32 GB ROM 8 inch with 4G Tablet (Blue)
|NOKIA T10 Wi-Fi + LTE Android Tablet (8 Inch, 3GB RAM, 32GB ROM, Ocean Blue)



### Run the LIDStage

In [25]:
# Submit setu/run.py to Spark with the LIDStage subcommand.
# Input: parquet files matching examples/output/cleaned_docs/*.parquet; output path: examples/output/lid/.
# NOTE(review): --master is hard-coded to one machine's hostname (YDEARYZEN) — change it for your own Spark master.
!SETU_DIR=/home/$USER/setu SETU_TMP_DIR=/home/$USER/tmp/ FILTER_DATA_ROOT=/home/$USER/setu/setu/data \
    spark-submit \
    --master spark://YDEARYZEN.:7077 \
    --deploy-mode client \
    --driver-java-options -Djava.io.tmpdir=/home/$USER/tmp/ \
    --conf "spark.driver.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf "spark.executor.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf spark.worker.dir="/home/$USER/tmp/" \
    --conf spark.local.dir="/home/$USER/tmp/" \
    --num-executors 4 \
    --executor-cores 2 \
    --executor-memory 3G \
    --driver-memory 6G \
    --archives "/home/$USER/setu/dataproc/envs/setu.zip" \
    --conf 'spark.executorEnv.PYTHONPATH=setu.zip' \
    --conf 'spark.executorEnv.FILTER_DATA_ROOT=setu.zip/data' \
    /home/$USER/setu/setu/run.py \
    --config /home/$USER/setu/configs/crawls/spark_english_config.json \
    --mode crawl \
    --run_local True \
    LIDStage \
    --lid_df_parquets_path "/home/$USER/setu/examples/output/cleaned_docs/*.parquet" \
    --is_lid_df_path_batched False \
    --lid_additional_cols "url,source,language" \
    --lid_samples_per_partition 1500 \
    --lid_verbose False \
    --lid_run_mode data \
    --doc_lid_output_path "/home/$USER/setu/examples/output/lid/"

24/03/07 16:12:46 WARN Utils: Your hostname, YDEARYZEN resolves to a loopback address: 127.0.1.1; using 172.31.108.238 instead (on interface eth0)
24/03/07 16:12:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/07 16:12:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Command used to run this script:  /home/shanks/setu/setu/run.py --config /home/shanks/setu/configs/crawls/spark_english_config.json --mode crawl --run_local True LIDStage --lid_df_parquets_path /home/shanks/setu/examples/output/cleaned_docs/*.parquet --is_lid_df_path_batched False --lid_additional_cols url,source,language --lid_samples_per_partition 1500 --lid_verbose False --lid_run_mode data --doc_lid_output_path /home/shanks/setu/examples/output/lid/
shanks
------------------------------------------------ Setting Environment Variables --------------------------------------------------
---------------------------

### Check LIDStage Output 

In [26]:
lid_output_path = f"/home/{os.environ.get('USER')}/setu/examples/output/lid/doc_lang_partition=english"

In [27]:
parquets = glob(f"{lid_output_path}/*.parquet")

In [28]:
df = pd.read_parquet(parquets[0])

In [29]:
df.head()

Unnamed: 0,doc_id,url,source,language,text,doc_lang,doc_lang_iso,indiclid_code,indiclid_script,indiclid_lang,nllb_lang,cld3_lang,indiclid_logit,nllb_probability,cld3_probability
0,8bc919b9-c288-42fd-b503-d0f366e06a14,https://www.91mobiles.com/nokia-t10-lte-price-...,91mobiles,english,"2. Originally from Finland, Nokia ruled the mo...",english,eng,eng_Latn,latin,english,english,english,1.000024,0.952707,0.99995


### Run the Analysis Stages

In [30]:
# Submit setu/run.py to Spark with the AnalysisStage subcommand.
# Input: parquet files matching examples/output/lid/*/*.parquet (one level of subdirectories under lid/).
# Output paths: examples/output/line_stats/, examples/output/doc_stats/ and examples/output/analysis/.
# NOTE(review): --master is hard-coded to one machine's hostname (YDEARYZEN) — change it for your own Spark master.
!SETU_DIR=/home/$USER/setu SETU_TMP_DIR=/home/$USER/tmp/ FILTER_DATA_ROOT=/home/$USER/setu/setu/data \
    spark-submit \
    --master spark://YDEARYZEN.:7077 \
    --deploy-mode client \
    --driver-java-options -Djava.io.tmpdir=/home/$USER/tmp/ \
    --conf "spark.driver.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf "spark.executor.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf spark.worker.dir="/home/$USER/tmp/" \
    --conf spark.local.dir="/home/$USER/tmp/" \
    --num-executors 4 \
    --executor-cores 2 \
    --executor-memory 3G \
    --driver-memory 6G \
    --archives "/home/$USER/setu/dataproc/envs/setu.zip" \
    --conf 'spark.executorEnv.PYTHONPATH=setu.zip' \
    --conf 'spark.executorEnv.FILTER_DATA_ROOT=setu.zip/data' \
    /home/$USER/setu/setu/run.py \
    --config /home/$USER/setu/configs/crawls/spark_english_config.json \
    --mode crawl \
    --run_local True \
    AnalysisStage \
    --analysis_df_parquets_path "/home/$USER/setu/examples/output/lid/*/*.parquet" \
    --is_analysis_df_path_batched False \
    --analysis_additional_cols_to_use "url,source,language,doc_lang,doc_lang_iso" \
    --analysis_samples_per_partition 1500 \
    --analysis_verbose False \
    --analysis_run_mode stage \
    --line_stats_output_path "/home/$USER/setu/examples/output/line_stats/" \
    --doc_stats_output_path "/home/$USER/setu/examples/output/doc_stats/" \
    --analysis_output_path "/home/$USER/setu/examples/output/analysis/"

24/03/07 17:31:27 WARN Utils: Your hostname, YDEARYZEN resolves to a loopback address: 127.0.1.1; using 172.31.108.238 instead (on interface eth0)
24/03/07 17:31:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/07 17:31:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Command used to run this script:  /home/shanks/setu/setu/run.py --config /home/shanks/setu/configs/crawls/spark_english_config.json --mode crawl --run_local True AnalysisStage --analysis_df_parquets_path /home/shanks/setu/examples/output/lid/*/*.parquet --is_analysis_df_path_batched False --analysis_additional_cols_to_use url,source,language,doc_lang,doc_lang_iso --analysis_samples_per_partition 1500 --analysis_verbose False --analysis_run_mode stage --line_stats_output_path /home/shanks/setu/examples/output/line_stats/ --doc_stats_output_path /home/shanks/setu/examples/output/doc_stats/ --analysis_output_path /hom

### View Analysis Output

In [58]:
al_output_path = f"/home/{os.environ.get('USER')}/setu/examples/output/doc_stats/doc_lang_partition=english"

In [59]:
parquets = glob(f"{al_output_path}/*.parquet")

In [60]:
parquets

['/home/shanks/setu/examples/output/doc_stats/doc_lang_partition=english/part-00000-5eb4fd30-e028-4dfc-b325-2ad5cf8c12d6.c000.snappy.parquet']

In [61]:
df = pd.read_parquet(parquets[0])

In [62]:
df.head()

Unnamed: 0,doc_id,bytes,words_count,char_count,lines_count,mean_line_length,min_line_length,max_line_length,nsfw_words_count,non_li_char_count,char_ngram_repetition_score,10_gram_characters_repetition_score,word_ngram_repetition_score,5_gram_words_repetition_score,doc_lang
0,8bc919b9-c288-42fd-b503-d0f366e06a14,975,184,975,20,9.2,1,37,0,117,"[(10_gram_characters_repetition_score, 0.06418...",0.064182,"[(5_gram_words_repetition_score, None)]",,english


### Run the Flagging and Filtering Stage

In [63]:
# Submit setu/run.py to Spark with the FlaggingAndFilteringStage subcommand.
# Input: parquet files matching examples/output/doc_stats/*/*.parquet.
# Output paths: examples/output/filtered_doc_stats/ and examples/output/nsfw/
# (presumably the NSFW documents, since --save_nsfw_data is True — confirm against run.py).
# NOTE(review): --master is hard-coded to one machine's hostname (YDEARYZEN) — change it for your own Spark master.
!SETU_DIR=/home/$USER/setu SETU_TMP_DIR=/home/$USER/tmp/ FILTER_DATA_ROOT=/home/$USER/setu/setu/data \
    spark-submit \
    --master spark://YDEARYZEN.:7077 \
    --deploy-mode client \
    --driver-java-options -Djava.io.tmpdir=/home/$USER/tmp/ \
    --conf "spark.driver.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf "spark.executor.extraJavaOptions=-Djava.io.tmpdir=/home/$USER/tmp/" \
    --conf spark.worker.dir="/home/$USER/tmp/" \
    --conf spark.local.dir="/home/$USER/tmp/" \
    --num-executors 4 \
    --executor-cores 2 \
    --executor-memory 3G \
    --driver-memory 6G \
    --archives "/home/$USER/setu/dataproc/envs/setu.zip" \
    --conf 'spark.executorEnv.PYTHONPATH=setu.zip' \
    --conf 'spark.executorEnv.FILTER_DATA_ROOT=setu.zip/data' \
    /home/$USER/setu/setu/run.py \
    --config /home/$USER/setu/configs/crawls/spark_english_config.json \
    --mode crawl \
    --run_local True \
    FlaggingAndFilteringStage \
    --doc_stats_parquets_path "/home/$USER/setu/examples/output/doc_stats/*/*.parquet" \
    --is_doc_stats_path_batched False \
    --fnf_samples_per_partition 1500 \
    --fnf_verbose False \
    --fnf_run_mode stage \
    --save_nsfw_data True \
    --nsfw_output_path "/home/$USER/setu/examples/output/nsfw/" \
    --filtered_doc_stats_output_path "/home/$USER/setu/examples/output/filtered_doc_stats/"

24/03/07 17:43:52 WARN Utils: Your hostname, YDEARYZEN resolves to a loopback address: 127.0.1.1; using 172.31.108.238 instead (on interface eth0)
24/03/07 17:43:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/07 17:43:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Command used to run this script:  /home/shanks/setu/setu/run.py --config /home/shanks/setu/configs/crawls/spark_english_config.json --mode crawl --run_local True FlaggingAndFilteringStage --doc_stats_parquets_path /home/shanks/setu/examples/output/doc_stats/*/*.parquet --is_doc_stats_path_batched False --fnf_samples_per_partition 1500 --fnf_verbose False --fnf_run_mode stage --save_nsfw_data True --nsfw_output_path /home/shanks/setu/examples/output/nsfw/ --filtered_doc_stats_output_path /home/shanks/setu/examples/output/filtered_doc_stats/
shanks
------------------------------------------------ Setting Environment 

In [64]:
fd_output_path = f"/home/{os.environ.get('USER')}/setu/examples/output/filtered_doc_stats/"

In [65]:
parquets = glob(f"{fd_output_path}/*.parquet")

In [66]:
parquets

['/home/shanks/setu/examples/output/filtered_doc_stats/part-00000-12631a93-117a-4bf9-b2c5-8e21b47dab59-c000.snappy.parquet']

In [67]:
df = pd.read_parquet(parquets[0])

In [69]:
df

Unnamed: 0,doc_id,bytes,words_count,char_count,lines_count,mean_line_length,min_line_length,max_line_length,nsfw_words_count,non_li_char_count,10_gram_characters_repetition_score,5_gram_words_repetition_score,doc_lang,has_less_lines,is_short_lines_heavy,is_nsfw_heavy,is_non_li_heavy,has_char_repetition,has_word_repetition


Note: Since the example was run on a single document, there might be no output for the Flagging and Filtering Stage. Similarly, if there is no NSFW content present, the nsfw directory won't be created.

#### You can run the DocumentRemovalStage for removing documents based on the computed stats and removing duplicates