<a href="https://colab.research.google.com/github/BenjamminYang/NLP-tutorial/blob/master/Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark
from pyspark.sql import SparkSession


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=8ac502da1db81762597e36b33afc955635c4b55f825aa0f8f1799a9ec784d38e
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [3]:
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext

spark_test = SparkContext() 
config = spark_test.getConf()
config.set('spark.cores.max','4')
config.set('spark.executor.memory', '8G')
config.set('spark.driver.maxResultSize', '8g')
config.set('spark.kryoserializer.buffer.max', '512m')
config.set("spark.driver.cores", "4")

spark_test.stop()

In [4]:
spark_test = SparkContext(conf = config) 
sqlContext = SQLContext(spark_test)
print("Using Apache Spark Version", spark_test.version)

Using Apache Spark Version 3.3.2




In [5]:
#read csv
cb_file = "crunchbase_odm_orgs.csv"
cb_sdf = sqlContext.read.option("header", "true").option("delimiter", ",").option("inferSchema", "true").csv(cb_file)
cb_sdf.count()

1127735

In [6]:
cb_sdf.columns

['uuid',
 'name',
 'type',
 'primary_role',
 'cb_url',
 'domain',
 'homepage_url',
 'logo_url',
 'facebook_url',
 'twitter_url',
 'linkedin_url',
 'combined_stock_symbols',
 'city',
 'region',
 'country_code',
 'short_description']

In [9]:
from pyspark.sql.functions import col, size, split

# Filter companies with name that is only two words
two_word_companies = cb_sdf.filter(size(split(col("name"), " ")) == 2)

# Print count of such companies
print("Number of companies with name that is only two words:", two_word_companies.count())

# Show name and location (city, region, country_code) of such companies
two_word_companies.select("name", "city", "region", "country_code").show()


Number of companies with name that is only two words: 362534
+--------------------+----------------+----------+------------+
|                name|            city|    region|country_code|
+--------------------+----------------+----------+------------+
|         Time Warner|        New York|  New York|         USA|
|       Goldman Sachs|        New York|  New York|         USA|
|     Jingle Networks|        New York|  New York|         USA|
|Hearst Communicat...|        New York|  New York|         USA|
|    Ning Interactive|      Menlo Park|California|         USA|
| Prosper Marketplace|   San Francisco|California|         USA|
|       Tribune Media|         Chicago|  Illinois|         USA|
| Aggregate Knowledge|       San Mateo|California|         USA|
|        Zing Systems|   Mountain View|California|         USA|
|         Amie Street|Long Island City|  New York|         USA|
|          Legg Mason|       Baltimore|  Maryland|         USA|
|        Haute Secure|         Seattle|Wash

In [10]:
# Filter companies located in California
california_companies = cb_sdf.filter(cb_sdf.region == "California")

# Print count of such companies
print("Number of companies located in California:", california_companies.count())

# Show name and location (city, region, country_code) of such companies
california_companies.select("name", "city", "region", "country_code").show()


Number of companies located in California: 94871
+--------------------+--------------+----------+------------+
|                name|          city|    region|country_code|
+--------------------+--------------+----------+------------+
|                Zoho|    Pleasanton|California|         USA|
|            Facebook|    Menlo Park|California|         USA|
|               Accel|     Palo Alto|California|         USA|
|           Omnidrive|     Palo Alto|California|         USA|
|                Geni|West Hollywood|California|         USA|
|             Flektor|   Culver City|California|         USA|
|Fox Interactive M...| Beverly Hills|California|         USA|
|             Twitter| San Francisco|California|         USA|
|         StumbleUpon| San Francisco|California|         USA|
|              Scribd| San Francisco|California|         USA|
|             Slacker|     San Diego|California|         USA|
|                Lala|     Palo Alto|California|         USA|
|               Helio

In [11]:
from pyspark.sql.functions import when

# Add a "Blog" column with row entries set to 1 if the "domain" field contains "blogspot.com", and 0 otherwise
cb_sdf_with_blog = cb_sdf.withColumn("Blog", when(cb_sdf.domain.contains("blogspot.com"), 1).otherwise(0))

# Filter companies with Blog marked as 1
blog_companies = cb_sdf_with_blog.filter(cb_sdf_with_blog.Blog == 1)

# Show name, location (city, region, country_code), and "Blog" column for companies with Blog marked as 1
blog_companies.select("name", "city", "region", "country_code", "Blog").show()


+--------------------+-------------+------------+------------+----+
|                name|         city|      region|country_code|Blog|
+--------------------+-------------+------------+------------+----+
|     Sad Urdu Poetry|  San Antonio|       Texas|         USA|   1|
|      The Tech-Freak|    Sheffield|   Sheffield|         GBR|   1|
|           Ma.Gnolia|San Francisco|  California|         USA|   1|
|      Dynasty Online|         null|        null|        null|   1|
|            Hire-seo|         null|        null|        null|   1|
|          YelloYello|     Rijswijk|Zuid-Holland|         NLD|   1|
|       Youtubehiphop|    São Paulo|   Sao Paulo|         BRA|   1|
|     Payday advances|         null|        null|        null|   1|
|Blog Traffic Exch...|   Menlo Park|  California|         USA|   1|
|Sirius Forex Trad...|         null|        null|        null|   1|
|          Utilsforge|     Delaware|        Ohio|         USA|   1|
|      Discover India|    Faridabad|     Haryana

In [12]:
# Filter the DataFrame for the company with name "Sad Urdu Poetry"
sad_urdu_poetry = cb_sdf.filter(cb_sdf.name == "Sad Urdu Poetry")

# Show the row with name "Sad Urdu Poetry"
sad_urdu_poetry.show(truncate=False)


+------------------------------------+---------------+------------+------------+----------------------------------------------------------------------------------------------------------------+-----------------------------+--------------------------------------+--------------------------------------------------------------------------------------------------------------+---------------------------------------+-----------------------------+------------+----------------------+-----------+------+------------+---------------------------------------------------------------------------------+
|uuid                                |name           |type        |primary_role|cb_url                                                                                                          |domain                       |homepage_url                          |logo_url                                                                                                      |facebook_url                     

In [14]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType

# Define a Python function to check whether a string is a palindrome
def is_palindrome(s):
    if s is None:
        return False
    return s == s[::-1]

# Register the function as a Spark UDF
udf_is_palindrome = udf(is_palindrome, BooleanType())

# Use the UDF to filter the DataFrame for palindromic company names
palindromic_companies = cb_sdf.filter(udf_is_palindrome(cb_sdf.name))

# Print the count of palindromic companies and show the name and location columns
print("Count of palindromic companies:", palindromic_companies.count())
palindromic_companies.select("name", "city", "region", "country_code").show()


Count of palindromic companies: 808
+------+-------------+--------------+------------+
|  name|         city|        region|country_code|
+------+-------------+--------------+------------+
| KAYAK|     Stamford|   Connecticut|         USA|
| ooVoo|     New York|      New York|         USA|
| 63336|       London|       England|         GBR|
| TipiT|        Delft|  Zuid-Holland|         NLD|
| beweb|     Auckland|      Auckland|         NZL|
|   CSC| Falls Church|      Virginia|         USA|
|   CBC|       Ottawa|       Ontario|         CAN|
|   OQO|San Francisco|    California|         USA|
|   SAS|         Cary|North Carolina|         USA|
|   e4e|  Santa Clara|    California|         USA|
|   PHP|  Little Rock|      Arkansas|         USA|
|   ivi|       Moscow|   Moscow City|         RUS|
|  ADDA|    Bangalore|     Karnataka|         IND|
|izeezi|   Chippenham|     Wiltshire|         GBR|
| siXis|       Durham|North Carolina|         USA|
| STATS|      Chicago|      Illinois|         