In [1]:
import boto3

In [2]:
s3 = boto3.client("s3")

In [3]:
s3.download_file("blossom-data-engs", "all-us-stocks-tickers-company-info-logos.zip", "all-us-stocks-tickers-company-info-logos.zip")
s3.download_file("blossom-data-engs", "data-scientist-job-market-in-the-us.zip", "data-scientist-job-market-in-the-us.zip")

In [4]:
import pyspark
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import DataFrame

In [5]:
spark = SparkSession.builder.getOrCreate()

In [6]:
# create a new dataframe from csv file
# when inferSchema is True spark scans the file once to detect the schema
companies = spark.read.csv(
            "companies.csv", 
            header=True, inferSchema=True)
    
alldata = spark.read.csv(
            "alldata.csv", 
            header=True, inferSchema=True)

In [7]:
companies.select("industry", "description", "website").show()

+--------------------+--------------------+--------------------+
|            industry|         description|             website|
+--------------------+--------------------+--------------------+
|Medical Diagnosti...|Agilent Technolog...|http://www.agilen...|
|     Metals & Mining|Alcoa Corp is an ...|http://www.alcoa.com|
|    Asset Management|Altaba Inc is an ...|http://www.altaba...|
|Health Care Provi...|AAC Holdings Inc ...|http://www.americ...|
|                null|The investment se...|                null|
|            AADR.png|                null|           NYSE Arca|
|            Airlines|American Airlines...|   http://www.aa.com|
|    Asset Management|Altisource Asset ...|http://www.altiso...|
|    Insurance - Life|Atlantic American...|http://www.atlam.com|
|Consulting & Outs...|Aaron's Inc is a ...|http://www.aarons...|
|      Semiconductors|Applied Optoelect...|http://www.ao-inc...|
|  Building Materials|AAON Inc is a hea...| http://www.aaon.com|
|Retail - Apparel ...|Adv

In [8]:
companies.count()

7310

In [9]:
alldata.count()

184071

In [10]:
type(alldata)

pyspark.sql.dataframe.DataFrame

In [11]:
alldata.columns

['position', 'company', 'description', 'reviews', 'location']

In [27]:
# Renaming the Description column in alldata to alldataDesc 
alldata = alldata.withColumnRenamed('description', 'alldataDesc')

In [28]:
alldata.columns

['position', 'company', 'alldataDesc', 'reviews', 'location']

In [29]:
alldata.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|            position|             company|         alldataDesc|             reviews|            location|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Development Director|             ALS TDI|Development Director|                null|                null|
|ALS Therapy Devel...| the Development ...| generating aware...| prospects and do...|                 GA.|
|       Requirements:|                null|                null|                null|                null|
|Bachelor's Degree...| written and pres...| as well as the a...|         spreadsheet|            database|
|About ALS Therapy...|                null|                null|                null|                null|
|The ALS Therapy D...| the charity unde...|  based in Cambridge|                  MA| has served as on...|
|            To Apply|               

In [30]:
#aliasing the dataframe
ta = companies.alias('ta')
tb = alldata.alias('tb')

In [31]:
#joining two datasets (Inner join). 
#Filter out non-US companies from the other companies list

joint_table = ta.join(tb, tb['company'] == ta['company name'])

In [32]:
joint_table.columns

['ticker',
 'company name',
 'short name',
 'industry',
 'description',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3',
 'position',
 'company',
 'alldataDesc',
 'reviews',
 'location']

In [33]:
#showing info from both pages
joint_table.select("industry", "website", 'company', 'ta.description', "tb.position", "ta.exchange").show(10)

+--------------------+--------------------+---------------+--------------------+--------------------+---------+
|            industry|             website|        company|         description|            position| exchange|
+--------------------+--------------------+---------------+--------------------+--------------------+---------+
|http://www.invesc...|                null|         design|             PSJ.png|Experience follow...|     null|
| sale or distribu...|             PJP.png|    development|http://www.invesc...|Facilitates the d...|476136000|
| services and pro...|http://www.invesc...|    development| etc. It is non-d...|Facilitates the d...|NYSE Arca|
| they provide inv...|            SGOL.png| transportation|http://www.etfsec...|              Travel|886515000|
| sale or distribu...|             PJP.png|    development|http://www.invesc...|Apply health scie...|476136000|
| services and pro...|http://www.invesc...|    development| etc. It is non-d...|Apply health scie...|NYS

In [34]:
joint_table = joint_table.filter(ta.industry.isNotNull())

In [35]:
joint_table = joint_table.filter(ta.description.isNotNull())

In [79]:
joint_table = joint_table.filter(tb.location.isNotNull())
joint_table.count()

242

In [22]:
from pyspark.sql.functions import *

In [37]:
joint_table.columns

['ticker',
 'company name',
 'short name',
 'industry',
 'description',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3',
 'position',
 'company',
 'alldataDesc',
 'reviews',
 'location']

In [82]:
joint_table.select('location', F.split(joint_table['location'], ',')[0].alias('city')).show()

+--------------------+--------------------+
|            location|                city|
+--------------------+--------------------+
| and risk factors...| and risk factors...|
| and risk factors...| and risk factors...|
| implement and/or...| implement and/or...|
| and deployment o...| and deployment o...|
| and deployment o...| and deployment o...|
| re-designing cod...| re-designing cod...|
| re-designing inf...| re-designing inf...|
|        publications|        publications|
| and water sector...| and water sector...|
| and Use public t...| and Use public t...|
| land use &amp; m...| land use &amp; m...|
| FPGAs); leveragi...| FPGAs); leveragi...|
| and senior manag...| and senior manag...|
| FPGAs); leveragi...| FPGAs); leveragi...|
| and support deci...| and support deci...|
| Accounts Receiva...| Accounts Receiva...|
| Accounts Receiva...| Accounts Receiva...|
| Accounts Receiva...| Accounts Receiva...|
| re-designing inf...| re-designing inf...|
| and troubleshoot...| and troub

## ngram function to generate unigram and bigram

In [83]:
import numpy
from pyspark.ml.feature import NGram
from pyspark.ml.feature import NGram, Tokenizer

In [85]:
#Initializing the tokennizer
tokens = Tokenizer(inputCol = 'description', outputCol = 'token')
joint_table = tokens.transform(joint_table)

In [88]:
ngrams = NGram(n=1, inputCol = 'tokens', outputCol = 'ngram')
joint_table = ngrams.transform(joint_table)

In [46]:
joint_table.select(['ngrams', 'description']).limit(1).show()

+--------------------+--------------------+
|              ngrams|         description|
+--------------------+--------------------+
|[http://www.inves...|http://www.invesc...|
+--------------------+--------------------+



In [49]:
description = joint_table.select(['ngrams', 'description']).select(
    'description', F.explode('ngrams').alias('unigrams')).show()

+--------------------+--------------------+
|         description|            unigrams|
+--------------------+--------------------+
|http://www.invesc...|http://www.invesc...|
| etc. It is non-d...|                    |
| etc. It is non-d...|                etc.|
| etc. It is non-d...|                  it|
| etc. It is non-d...|                  is|
| etc. It is non-d...|   non-diversified."|
|             PSJ.png|             psj.png|
|http://www.invesc...|http://www.invesc...|
| etc. It is non-d...|                    |
| etc. It is non-d...|                etc.|
| etc. It is non-d...|                  it|
| etc. It is non-d...|                  is|
| etc. It is non-d...|   non-diversified."|
|             PSJ.png|             psj.png|
|             PSJ.png|             psj.png|
|             PSJ.png|             psj.png|
|            SILJ.png|            silj.png|
|http://www.etfsec...|http://www.etfsec...|
|             PSJ.png|             psj.png|
|             PSJ.png|          

In [57]:
description = joint_table.select(['ngrams', 'description']).select(
    'description', F.explode('ngrams').alias('unigrams'))

In [61]:
description.count()

884

In [62]:
description.show()

+--------------------+--------------------+
|         description|            unigrams|
+--------------------+--------------------+
|http://www.invesc...|http://www.invesc...|
| etc. It is non-d...|                    |
| etc. It is non-d...|                etc.|
| etc. It is non-d...|                  it|
| etc. It is non-d...|                  is|
| etc. It is non-d...|   non-diversified."|
|             PSJ.png|             psj.png|
|http://www.invesc...|http://www.invesc...|
| etc. It is non-d...|                    |
| etc. It is non-d...|                etc.|
| etc. It is non-d...|                  it|
| etc. It is non-d...|                  is|
| etc. It is non-d...|   non-diversified."|
|             PSJ.png|             psj.png|
|             PSJ.png|             psj.png|
|             PSJ.png|             psj.png|
|            SILJ.png|            silj.png|
|http://www.etfsec...|http://www.etfsec...|
|             PSJ.png|             psj.png|
|             PSJ.png|          

In [73]:
new = joint_table.select(
    'description', 
    F.explode('ngrams').alias('ngrams')
).groupBy(['desciption', 'ngram'])

In [None]:
count = joint_table.select(['unigrams', 'description']).select(
    'description', F.explode('unigrams').alias('unigrams')).count()

---

In [126]:
joint_table.select(['unigrams', 'description']).select(
    'city', F.explode('unigrams').alias('unigrams')).groupBy(['city', 'unigrams', 'count()']).show()

AnalysisException: "cannot resolve '`city`' given input columns: [unigrams, ta.description, unigrams];;\n'Project ['city, unigrams#1017]\n+- Generate explode(unigrams#467), false, [unigrams#1017]\n   +- Project [unigrams#467, description#14]\n      +- Project [ticker#10, company name#11, short name#12, industry#13, description#14, website#15, logo#16, ceo#17, exchange#18, market cap#19, sector#20, tag 1#21, tag 2#22, tag 3#23, position#48, company#49, alldataDesc#286, reviews#51, location#52, tokens#446, unigrams#467, ngrams#503, token#795, ngram#819, UDF(tokens#446) AS bigrams#852]\n         +- Project [ticker#10, company name#11, short name#12, industry#13, description#14, website#15, logo#16, ceo#17, exchange#18, market cap#19, sector#20, tag 1#21, tag 2#22, tag 3#23, position#48, company#49, alldataDesc#286, reviews#51, location#52, tokens#446, unigrams#467, ngrams#503, token#795, UDF(tokens#446) AS ngram#819]\n            +- Project [ticker#10, company name#11, short name#12, industry#13, description#14, website#15, logo#16, ceo#17, exchange#18, market cap#19, sector#20, tag 1#21, tag 2#22, tag 3#23, position#48, company#49, alldataDesc#286, reviews#51, location#52, tokens#446, unigrams#467, ngrams#503, UDF(description#14) AS token#795]\n               +- Filter isnotnull(location#52)\n                  +- Project [ticker#10, company name#11, short name#12, industry#13, description#14, website#15, logo#16, ceo#17, exchange#18, market cap#19, sector#20, tag 1#21, tag 2#22, tag 3#23, position#48, company#49, alldataDesc#286, reviews#51, location#52, tokens#446, unigrams#467, UDF(tokens#446) AS ngrams#503]\n                     +- Project [ticker#10, company name#11, short name#12, industry#13, description#14, website#15, logo#16, ceo#17, exchange#18, market cap#19, sector#20, tag 1#21, tag 2#22, tag 3#23, position#48, company#49, alldataDesc#286, reviews#51, location#52, tokens#446, UDF(tokens#446) AS unigrams#467]\n                        +- Project [ticker#10, company name#11, short name#12, industry#13, description#14, website#15, logo#16, ceo#17, exchange#18, market cap#19, sector#20, tag 1#21, tag 2#22, tag 3#23, position#48, company#49, alldataDesc#286, reviews#51, location#52, UDF(description#14) AS tokens#446]\n                           +- Filter isnotnull(location#52)\n                              +- Filter isnotnull(description#14)\n                                 +- Filter isnotnull(industry#13)\n                                    +- Join Inner, (company#49 = company name#11)\n                                       :- SubqueryAlias `ta`\n                                       :  +- Relation[ticker#10,company name#11,short name#12,industry#13,description#14,website#15,logo#16,ceo#17,exchange#18,market cap#19,sector#20,tag 1#21,tag 2#22,tag 3#23] csv\n                                       +- SubqueryAlias `tb`\n                                          +- Project [position#48, company#49, description#50 AS alldataDesc#286, reviews#51, location#52]\n                                             +- Relation[position#48,company#49,description#50,reviews#51,location#52] csv\n"

---

In [97]:
ngrams = NGram(n=2, inputCol = 'tokens', outputCol = 'bigrams')
joint_table = ngrams.transform(joint_table)

In [98]:
joint_table.select('bigrams').limit(1).take(1)

[Row(bigrams=[])]

In [100]:
new2 = joint_table.select('bigrams').limit(1).take(1)

In [111]:
location = joint_table.select(['bigrams', 'description']).select(
    'description', F.explode('bigrams').alias('bigrams'))

In [106]:
location.show()

+--------------------+--------------------+
|         description|             bigrams|
+--------------------+--------------------+
| etc. It is non-d...|                etc.|
| etc. It is non-d...|             etc. it|
| etc. It is non-d...|               it is|
| etc. It is non-d...|is non-diversified."|
| etc. It is non-d...|                etc.|
| etc. It is non-d...|             etc. it|
| etc. It is non-d...|               it is|
| etc. It is non-d...|is non-diversified."|
| leisure products...|             leisure|
| leisure products...|    leisure products|
| leisure products...|        products and|
| leisure products...|        and services|
| perpetual subord...|           perpetual|
| perpetual subord...|perpetual subordi...|
| perpetual subord...|   subordinated debt|
| perpetual subord...|            debt and|
| perpetual subord...|         and certain|
| perpetual subord...|     certain capital|
| perpetual subord...| capital securities.|
| perpetual subord...|      secu

In [None]:
industry = joint_table.select(['ngrams', 'industry']).select(
    'industry', F.explode('ngrams').alias('ngrams'))

In [112]:
joint_table.columns

['ticker',
 'company name',
 'short name',
 'industry',
 'description',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3',
 'position',
 'company',
 'alldataDesc',
 'reviews',
 'location',
 'tokens',
 'unigrams',
 'ngrams',
 'token',
 'ngram',
 'bigrams']

In [122]:
location = joint_table.select(['location', 'bigrams'])
location.count()

242

---

## plotting using matplotlib