# Creating spark context

This PySpark notebook applies Spark ML methods to Spark DataFrames and RDDs. Unfortunately, the function for loading the Google word2vec model is broken in PySpark and was fixed only for the Scala API of Spark. We therefore train a Word2Vec model on our own data and match the resulting vectors instead. The results are less precise than with the Google model, and the only Spark merging algorithms that could reproduce the results of our Python notebooks are MinHashLSH and BucketedRandomProjectionLSH (random bucketed projection). We tried both but obtained only very poor results. We make this notebook available anyway: it may save some time on preprocessing and vectorization, but it gives up the accuracy of the Google Word2Vec model. 

In [None]:
import os
import findspark
# Directory that contains the unpacked Spark distribution. Defaults to the
# current working directory; change it if Spark lives elsewhere.
your_path = os.getcwd()
# os.getcwd() has no trailing separator, so the parts must be joined rather
# than concatenated (concatenation yields e.g. "/home/userspark-2.4.3-...").
findspark.init(os.path.join(your_path, 'spark-2.4.3-bin-hadoop2.7'))

In [1]:
import pyspark 
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

from pyspark.sql import SQLContext
# SQLContext takes a SparkContext as its first argument and the session as
# its second, so derive the context first instead of passing the session
# where a context is expected.
sc = spark.sparkContext
sqlContext = SQLContext(sc, spark)

import pandas as pd 
import numpy as np

import pickle
import copy

from pyspark.sql import udf
from pyspark.sql.types import StructType,StringType,StructField,FloatType
from pyspark.sql.functions import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, NGram, Word2Vec, HashingTF, MinHashLSH



# Loading spark dataframes

### Course schema

In [2]:
# Column layout of the course table: four nullable string fields.
courseSchema = (
    StructType()
    .add('University', StringType(), True)
    .add("Program", StringType(), True)
    .add("Courses", StringType(), True)
    .add("text", StringType(), True)
)

# Read the CSV with pandas (its first column is used as the index), then
# turn the pandas frame into a Spark DataFrame with the schema above.
courses_pd = pd.read_csv("proto1/universities_full.csv", index_col=0)
courses = spark.createDataFrame(courses_pd, schema=courseSchema)

#courses = (spark.read
#    .schema(courseSchema)
#    .option("header", "true")
#    .option("mode", "DROPMALFORMED")
#    .csv("proto1/universities_full.csv"))
['University',"Program","Courses"]

['University', 'Program', 'Courses']

### Skill schema

In [3]:
# Column layout of the skill table: skill label plus its descriptive text,
# both nullable strings.
skillSchema = (
    StructType()
    .add("Skill", StringType(), True)
    .add("text", StringType(), True)
)

# Read the CSV with pandas (its first column is used as the index), then
# turn the pandas frame into a Spark DataFrame with the schema above.
skills_pd = pd.read_csv("proto1/skills_full.csv", index_col=0)
skills = spark.createDataFrame(skills_pd, schema=skillSchema)

#skills = (spark.read
#    .schema(skillSchema)
#    .option("header", "true")
#    .option("mode", "DROPMALFORMED")
#    .csv("proto1/skills_full.csv"))

### Occupation schema

In [4]:
# Column layout of the occupation table: occupation name and one associated
# skill per row, both nullable strings.
occupationSchema = (
    StructType()
    .add("Occupation", StringType(), True)
    .add("Skill", StringType(), True)
)

# Unlike the course and skill files, this CSV is read without an index column.
occupations_pd = pd.read_csv('proto1/occupations_full.csv')
occupations = spark.createDataFrame(occupations_pd, schema=occupationSchema)


#occupations = (spark.read
#    .schema(occupationSchema)
#    .option("header", "true")
#    .option("mode", "DROPMALFORMED")
#    .csv("proto1/occupations_full.csv"))

# Defining pipeline for string similarity

### Defining stopwords for pre-processing (faster than loading from nltk)

In [5]:
# Hard-coded English stop-word list passed to StopWordsRemover in the
# pipeline below; "pdf" is appended as an extra token to discard.
stopW = ['i','me','my', 'myself','we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its',
 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these',
 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm',
 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
 "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't","pdf"]

### Spark pipeline 

In [6]:
# Keep only the free-text column of each table and drop rows without text.
has_text = col('text').isNotNull()
course = courses.select("text").toDF("text").where(has_text)
skill = skills.select("text").toDF("text").where(has_text)

In [9]:
# Text-to-vector pipeline, fitted on the course texts only:
#   1. RegexTokenizer   - extracts every run of word characters as a token
#                         (gaps=False means the pattern matches tokens,
#                         not the separators between them)
#   2. StopWordsRemover - drops the tokens listed in stopW
#   3. Word2Vec         - learns 300-dimensional embeddings for tokens that
#                         occur at least twice and writes a 'vectors' column
# The pattern is a raw string so that "\w" is not an invalid escape sequence.
# The seed is fixed so that re-running the notebook fits the same embedding.
model = Pipeline(stages=[
    RegexTokenizer(gaps=False, pattern=r'\w+', inputCol='tokens_in' if False else 'text', outputCol='tokens'),
    StopWordsRemover(stopWords=stopW, inputCol='tokens', outputCol='tokens_sw'),
    Word2Vec(vectorSize=300, minCount=2, seed=42, inputCol='tokens_sw', outputCol='vectors')
]).fit(course)

### Generating columns defined above

In [10]:
# Run the fitted pipeline over both text tables; each result gains the
# 'tokens', 'tokens_sw' and 'vectors' columns defined by the pipeline stages.
course_hashed, skill_hashed = (
    model.transform(text_frame) for text_frame in (course, skill)
)

### Defining cosine similarity 

In [15]:
def cossim(v1, v2):
    """Return the absolute value of the cosine similarity of two vectors.

    Because the absolute value is taken, vectors pointing in opposite
    directions score 1.0 just like vectors pointing in the same direction.

    Parameters
    ----------
    v1, v2 : array-like
        One-dimensional numeric vectors of equal length.

    Returns
    -------
    numpy.float64
        ``|v1 . v2| / (||v1|| * ||v2||)``, or 0.0 when either vector has
        zero norm (the cosine is undefined there; previously this produced
        NaN together with a RuntimeWarning).
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # A zero vector has no direction, so treat it as similar to nothing.
        return np.float64(0.0)
    return np.absolute(np.dot(v1, v2) / norm_product)

Converting the transformed dataframes to RDDs and collecting their vectors on the driver

In [None]:
# Keep only the embedding column of each transformed frame, as RDDs of Rows.
test1 = course_hashed.select("vectors").rdd
test2 = skill_hashed.select("vectors").rdd
test1.cache()
test2.cache()

# Bring the vectors to the driver as plain Python lists of Rows.
test3 = test1.collect()
test4 = test2.collect()

# Take the lengths from the collected lists rather than from separate
# count() jobs: this avoids re-running the pipeline twice and guarantees the
# lengths agree with the lists they are used to index.
length_test1 = len(test3)
length_test2 = len(test4)

Pairwise matching on the driver with pandas/NumPy (inefficient: every course vector is compared with every skill vector)

In [18]:
# Minimum absolute cosine similarity for a (course, skill) pair to be kept.
SIMILARITY_THRESHOLD = 0.65
result_columns = ['University', "Program", "Courses", "Skills", 'Similarity']

# Unpack the collected Rows into NumPy arrays (one embedding per row).
course_vectors = [np.asarray(row[0].toArray(), dtype=float) for row in test3]
skill_vectors = [np.asarray(row[0].toArray(), dtype=float) for row in test4]

# Matches are gathered in a list and turned into a DataFrame once at the end;
# growing a DataFrame cell by cell with .loc is very slow.
matches = []
if course_vectors and skill_vectors:
    skill_matrix = np.vstack(skill_vectors)
    skill_norms = np.linalg.norm(skill_matrix, axis=1)
    # Zero vectors have no direction, so they can never match: drop them
    # up front but remember their original positions.
    usable_skills = np.flatnonzero(skill_norms != 0)
    skill_matrix = skill_matrix[usable_skills]
    skill_norms = skill_norms[usable_skills]

    for i, course_vector in enumerate(course_vectors):
        course_norm = np.linalg.norm(course_vector)
        if course_norm == 0 or usable_skills.size == 0:
            continue
        # Absolute cosine similarity of this course against all skills at once.
        scores = np.absolute(
            np.dot(skill_matrix, course_vector) / (skill_norms * course_norm)
        )
        hits = np.flatnonzero(scores > SIMILARITY_THRESHOLD)
        if hits.size == 0:
            continue
        # i and j are row positions in the collected lists, so look the
        # metadata up by position (iloc), not by index label.
        # NOTE(review): this assumes the isNotNull filter applied before the
        # pipeline removed no rows, so positions still line up with
        # courses_pd / skills_pd -- confirm against the data.
        course_info = courses_pd.iloc[i]
        for k in hits:
            j = usable_skills[k]
            matches.append({
                'University': course_info['University'],
                "Program": course_info["Program"],
                "Courses": course_info["Courses"],
                "Skills": skills_pd['Skill'].iloc[j],
                'Similarity': float(scores[k]),
            })

temp = pd.DataFrame(matches, columns=result_columns)
# Number of matches found (kept under the counter name the loop used before).
l = len(temp)

In [21]:
# Persist the matched (course, skill, similarity) rows, index included.
temp.to_csv(path_or_buf='NLP_results_proto1.csv')

In [22]:
# Read the saved matches back, using the first CSV column as the index.
test = pd.read_csv(filepath_or_buffer='NLP_results_proto1.csv', index_col=0)

In [24]:
# Show the last five matches as a quick sanity check of the saved file.
test.tail(n=5)

Unnamed: 0,University,Program,Courses,Skills,Similarity
149187,Free University of Brussels,Master of Economics of Globalisation and Europ...,Academic Writing,use specific writing techniques,0.660128
149188,Free University of Brussels,Master of Economics of Globalisation and Europ...,Academic Writing,report analysis results,0.697671
149189,Free University of Brussels,Master of Economics of Globalisation and Europ...,Academic Writing,study relevant writing,0.693779
149190,Free University of Brussels,Master of Economics of Globalisation and Europ...,Academic Writing,scientific research methodology,0.677442
149191,Free University of Brussels,Master of Economics of Globalisation and Europ...,Academic Writing,perform background research on writing subject,0.689483
