In [1]:
# Wall-clock reference point; later cells print time.time() - start as the elapsed seconds.
import time
start = time.time()

# Creating spark context

Similar to what we did in prototype 1, we match research article content to AI course content, but here we use Spark dataframes instead of pandas dataframes. In this script we use a mix of Spark dataframes and RDDs and, as in poc1, we train a word2vec model on our own data (`loadGoogleModel` is broken). We compute the cosine similarity for each combination of vector representations of the strings, and when the score is higher than a threshold we record the match. We then retrieve all the additional information from each dataset and join it into one full dataset. 
Given that we can't use Google's Word2Vec model here, the results are a bit less accurate. However, we gain time in the pre-processing steps and in the word2vec training and fitting. This also allows our NLP to be scaled to much larger datasets than the one we're currently working with. 

In [None]:
import findspark
import os

# Spark is expected to be unpacked inside the current working directory.
# os.getcwd() returns the directory without a trailing separator, so the
# path has to be joined rather than concatenated.
your_path = os.getcwd()
findspark.init(os.path.join(your_path, 'spark-2.4.3-bin-hadoop2.7'))

In [2]:
import pyspark 
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

from pyspark.sql import SQLContext
# SQLContext takes the SparkContext as its first argument (the session is the
# optional second one), so the context is extracted before building it.
sc = spark.sparkContext
sqlContext = SQLContext(sc, spark)

import pandas as pd
import numpy as np

import pickle
import copy

from pyspark.sql import udf
from pyspark.sql.types import StructType,StringType,StructField,IntegerType
from pyspark.sql.functions import col

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, NGram, Word2Vec, HashingTF, IDF, MinHashLSH

# Loading spark dataframes

### Sector schema

In [3]:
# Two nullable string columns: the sector label and the article text.
# The CSV's first column is consumed as the pandas index (index_col=0),
# so it is not part of the Spark schema.
sectorSchema = StructType([
    StructField("Sector", StringType(), True),
    StructField("text", StringType(), True),
])

# Read with pandas first, then hand the frame to Spark with the explicit schema.
sectors_pd = pd.read_csv('proto2/articles_full.csv', index_col=0)
sectors = spark.createDataFrame(sectors_pd, schema=sectorSchema)

#sectors = (spark.read
#    .schema(sectorSchema)
#    .option("header", "true")
#    .csv("proto2/articles_full.csv"))

### Course schema

In [4]:
# Two nullable string columns: the course title and the course description text.
# The CSV's first column is consumed as the pandas index (index_col=0),
# so it is not part of the Spark schema.
courseSchema = StructType([
    StructField("Courses", StringType(), True),
    StructField("text", StringType(), True),
])

# Read with pandas first, then hand the frame to Spark with the explicit schema.
courses_pd = pd.read_csv('proto2/courses_full.csv', index_col=0)
courses = spark.createDataFrame(courses_pd, schema=courseSchema)

#courses = (spark.read
#    .schema(courseSchema)
#    .option("header", "true")
#    .option("mode", "DROPMALFORMED")
#    .csv("proto2/courses_full.csv"))

# Defining pipeline for string similarity

### Defining stopwords for pre-processing (faster than loading from nltk)

In [5]:
# Hard-coded English stop words passed to StopWordsRemover in the pipelines below.
# NOTE(review): looks like NLTK's English list with "pdf" appended as a
# corpus-specific extra - confirm against the NLTK version used in prototype 1.
stopW = ['i','me','my', 'myself','we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its',
 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these',
 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm',
 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
 "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't","pdf"]

### Course pipeline 

In [None]:
# Only the description text is fed to the pipeline.
course = courses.select("text").toDF("text")

# Tokenise -> drop stop words -> bigrams -> Word2Vec document vectors.
# The pipeline is fitted on the course descriptions only.
model_course = Pipeline(stages=[
    # gaps=False: the pattern describes the tokens themselves, not the separators.
    # Raw string so that \w reaches the regex engine without an invalid-escape warning.
    RegexTokenizer(gaps = False, pattern = r'\w+', inputCol = 'text', outputCol = 'tokens'),
    StopWordsRemover(stopWords = stopW, inputCol = 'tokens', outputCol = 'tokens_sw'),
    NGram(n=2, inputCol="tokens_sw", outputCol="ngrams"),
    # Explicit seed so that repeated runs train from the same initialisation.
    Word2Vec(vectorSize = 300, minCount = 2, seed = 42, inputCol = 'tokens_sw',outputCol = 'vectors')
]).fit(course)

# Adds the tokens, tokens_sw, ngrams and vectors columns to the course texts.
course_hashed = model_course.transform(course)

In [19]:
# Run the article texts through the pipeline that was fitted on the course
# descriptions, so both corpora are embedded by the same Word2Vec model.
sector_text = sectors.select("text")
sector = sector_text.toDF("text")

sector_hashed = model_course.transform(sector)

In [20]:
def cossim(v1, v2):
    """Return the absolute cosine similarity between two vectors.

    The sign of the cosine is discarded, so opposite vectors score 1.0 just
    like identical ones.

    Parameters
    ----------
    v1, v2 : array-like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        A value in [0, 1]. If either vector has zero norm the similarity is
        undefined and 0.0 is returned instead of dividing by zero.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # A zero vector has no direction; treat it as "no similarity".
        return 0.0
    return np.absolute(np.dot(v1, v2) / norm_product)

In [22]:
# Preview the first five rows of each transformed frame (courses first, then articles).
for preview_frame in (course_hashed, sector_hashed):
    preview_frame.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|           tokens_sw|              ngrams|             vectors|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|This seminar trie...|[this, seminar, t...|[seminar, tries, ...|[seminar tries, t...|[0.00591259398497...|
|This seminar will...|[this, seminar, w...|[seminar, cover, ...|[seminar cover, c...|[0.00306318020793...|
|The seminar provi...|[the, seminar, pr...|[seminar, provide...|[seminar provides...|[0.01423976779253...|
|The DSS are inter...|[the, dss, are, i...|[dss, interactive...|[dss interactive,...|[0.00690956534051...|
|In this module, “...|[in, this, module...|[module, satisfic...|[module satisfici...|[0.01031608912197...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows

+-----------

In [21]:
# Bring the document vectors to the driver for the pairwise comparison.
# The RDDs are collected exactly once, so they are not cached.
test1 = course_hashed.select("vectors").rdd
test2 = sector_hashed.select("vectors").rdd
test3 = test1.collect()   # list of Rows, one course vector each
test4 = test2.collect()   # list of Rows, one article vector each

# Row counts come from the collected lists instead of two extra Spark count() jobs.
length_test1 = len(test3)
length_test2 = len(test4)

In [28]:
# Spot check: sector label of the row whose index label is 4.
sectors_pd.loc[4,'Sector']

'Legal and accounting activities'

In [None]:
# Minimum absolute cosine similarity for a course/article pair to count as a match.
SIMILARITY_THRESHOLD = 0.3

# Zero vectors cannot be compared; flag them once instead of recomputing the
# norms inside the nested loop.
course_has_vector = [np.linalg.norm(row[0]) != 0 for row in test3]
sector_has_vector = [np.linalg.norm(row[0]) != 0 for row in test4]

# Matches are gathered as plain records and turned into a DataFrame once at the
# end; writing row by row with .loc is slow and aligns the assigned Series on
# its labels, which leaves 'Article' empty when the source column is 'text'.
records = []
for i, course_row in enumerate(test3):
    if not course_has_vector[i]:
        continue
    for j, sector_row in enumerate(test4):
        if not sector_has_vector[j]:
            continue
        score = cossim(course_row[0], sector_row[0])
        if score > SIMILARITY_THRESHOLD:
            # i and j are positions in the collected lists, hence .iloc.
            # Assumes collect() returned rows in the same order as the pandas
            # frames the Spark frames were created from - TODO confirm.
            records.append({
                "Sector": sectors_pd["Sector"].iloc[j],
                "Article": sectors_pd["text"].iloc[j],
                "Courses": courses_pd["Courses"].iloc[i],
                "Similarity": score,
            })

temp = pd.DataFrame(records, columns = ["Sector", "Article", "Courses", "Similarity"])

In [32]:
# Write the Word2Vec matches to a CSV file in the current working directory.
temp.to_csv('NLP_results_proto2.csv')

### Sector pipeline

In [None]:
# Keep only the articles that actually have text.
# Note: this cell redefines `sector` and `sector_hashed` from the earlier cell.
sector = sectors.select("text").toDF("text").filter(col('text').isNotNull())

# Word2Vec trained on article strings
model_sector = Pipeline(stages=[
    # gaps=False: the pattern describes the tokens themselves, not the separators.
    # Raw string so that \w reaches the regex engine without an invalid-escape warning.
    RegexTokenizer(gaps = False, pattern = r'\w+', inputCol = 'text', outputCol = 'tokens'),
    StopWordsRemover(stopWords = stopW, inputCol = 'tokens', outputCol = 'tokens_sw'),
    NGram(n=2, inputCol="tokens_sw", outputCol="ngrams"),
    # Explicit seed so that repeated runs train from the same initialisation.
    Word2Vec(vectorSize = 100, minCount = 2, seed = 42, inputCol = 'tokens_sw',outputCol = 'vectors')
    # Disabled LSH stages (not part of the fitted pipeline):
    #HashingTF(inputCol="ngrams", outputCol="rawFeatures"),
    #IDF(inputCol="rawFeatures", outputCol="features"),
    #MinHashLSH(inputCol="rawFeatures", outputCol="lsh",seed=10)
]).fit(sector)

# Generating columns defined above
sector_hashed = model_sector.transform(sector)

### Generating dataframe with matches

In [None]:
# approxSimilarityJoin is a method of a fitted LSH model, so `model` has to be a
# pipeline whose last stage is MinHashLSH. It is built here from the bigram
# column that both Word2Vec pipelines already produce, so this cell no longer
# depends on a variable left over from an earlier kernel session.
# MinHashLSH cannot hash all-zero feature vectors, so texts that yield no
# bigrams are dropped first.
sector_lsh_input = sector_hashed.filter("size(ngrams) > 0")
course_lsh_input = course_hashed.filter("size(ngrams) > 0")

model = Pipeline(stages=[
    HashingTF(inputCol="ngrams", outputCol="rawFeatures"),
    MinHashLSH(inputCol="rawFeatures", outputCol="lsh", seed=10)
]).fit(sector_lsh_input)

# The distance column keeps its original name, although MinHashLSH reports a
# Jaccard distance; that distance never exceeds 1, so this threshold keeps
# every candidate pair the LSH buckets produce.
matches = model.stages[-1].approxSimilarityJoin(
    model.transform(sector_lsh_input),
    model.transform(course_lsh_input),
    threshold = 1000000000,
    distCol="EuclideanDistance")

In [None]:
# Print up to 100 matched pairs for inspection.
matches.show(100)

In [18]:
# Number of matched article/course pairs.
matches.count()

65

## Preparing the data for Graph construction 

### Extracting only text

In [9]:
# Keep only the raw text from each side of the join:
# datasetA holds the article rows, datasetB the course rows.
matches = matches.select(
    matches["datasetA"]["text"].alias("Articles"),
    matches["datasetB"]["text"].alias("AI_topics"),
)

### Matching course labels 

In [10]:
# Attach the course label to every match by joining on the course description text.
same_course_text = matches["AI_topics"] == courses["text"]
left_join = matches.join(courses, on=same_course_text, how='left')

# Release any cached copies of the join inputs.
# NOTE(review): no cache()/persist() call on these frames is visible in this notebook.
for frame in (matches, courses):
    frame.unpersist()

# Only the article text and the course label are needed downstream.
left_join = left_join.select("Articles", 'Courses')

### Matching sector labels

In [11]:
# Attach the sector label to every match by joining on the article text.
same_article_text = left_join["Articles"] == sectors["text"]
full_data = left_join.join(sectors, on=same_article_text, how="left")

# Release any cached copies of the join inputs.
# NOTE(review): no cache()/persist() call on these frames is visible in this notebook.
for frame in (left_join, sectors):
    frame.unpersist()

# The final dataset is just the (course, sector) label pairs.
full_data = full_data.select("Courses", "Sector")

### Cleaning the data

In [12]:
# Drop pairs where either label is missing after the left joins.
has_both_labels = col('Courses').isNotNull() & col('Sector').isNotNull()
full_data = full_data.filter(has_both_labels)

In [13]:
# Preview the first ten (course, sector) pairs.
full_data.show(10)

+--------------------+--------------------+
|             Courses|              Sector|
+--------------------+--------------------+
|DT2119 Speech and...|Accomodation and ...|
|DISTRIBUTED ALGOR...|Accomodation and ...|
|       Linear models|Accomodation and ...|
|Optimization for ...|Accomodation and ...|
|Project Managemen...|Accomodation and ...|
|Algebraic Methods...|Accomodation and ...|
|Information Security|Accomodation and ...|
|Computability and...|Accomodation and ...|
|Shape Modeling an...|Accomodation and ...|
|  Probability Theory|Accomodation and ...|
+--------------------+--------------------+
only showing top 10 rows



In [14]:
# Elapsed wall-clock seconds since `start` was set in the first cell.
end = time.time()
print(end-start)

724.6993787288666


### Pandas conversion?

This is probably the most time consuming process of all... If we could somehow avoid having to transfer data as pandas that would be great.

In [15]:
# Convert the Spark result to a pandas DataFrame (toPandas) for export.
test = full_data.toPandas()
#full_data.unpersist()

In [16]:
# Show the first rows of the pandas copy.
test.head()

Unnamed: 0,Courses,Sector
0,DT2119 Speech and Speaker Recognition,Accomodation and food service activities
1,DISTRIBUTED ALGORITHMS,Accomodation and food service activities
2,Linear models,Accomodation and food service activities
3,Optimization for Data Science,Accomodation and food service activities
4,Project Management and Risk Control,Accomodation and food service activities


In [17]:
# Write the (course, sector) pairs to a CSV file in the current working directory.
test.to_csv('pyspark__results_proto2.csv')

In [None]:
# Elapsed wall-clock seconds since `start`, now including the pandas conversion and CSV export.
end = time.time()
print(end-start)