In [1]:
import pymongo 
import pandas as pd 
import plotly as px 
import numpy as np 
import sys
import pymongo
from pymongo import MongoClient 
import re
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
from collections import Counter
sys.path.append("../DataPipe/")
from scraping.classes.DataBase.Mongo import *
from scraping.classes.Role import *

In [50]:
class Analysis_Processing:
    def __init__(self,db:Mongo,role:Role):
        self.db = db
        self.role = role

    def get_data(self,pipe,col) -> pd.DataFrame:
        query_cursor = self.db.db[col].aggregate(pipe)
        print(list(query_cursor))
        out = pd.DataFrame(query_cursor)
        if ( out.empty):
            raise Exception("DATA FRAME EMPTY")
        return out

    def cleanse_sentence(self,sentence:str):
        stop = stopwords.words('english')
        sentence_clean = sentence.replace("-", " ")
        sentence_clean = re.sub("[\n]", " ",sentence_clean)
        sentence_clean = re.sub("[.!?/\()-,:]", "",sentence_clean)
        sentence_clean = sentence_clean.lower()
        sentence_clean = " ".join([word for word in sentence_clean.split(" ") if word not in stop])
        return sentence_clean

    def word_count(self, text:pd.Series):
        return Counter(self.cleanse_sentence(" ".join(text)).split(" "))

    def text_count(self,query_df):
        urls = query_df.urls.unique()
        count_arr = np.empty(shape = (len(urls)),dtype = Counter)
        for i,url in enumerate(urls):
            count_arr[i] = self.word_count(query_df[query_df.urls == url].text)
        return count_arr 

    def per_role_analysis(self,keyword_path,insert = False,col = None,insert_key = None ):
        tech_count = {}
        tech_list = pd.read_csv(keyword_path ,index_col=[0]).iloc[:,0].str.lower()
        for dicts in counters:
            for word in dicts.keys():
                if word in list(tech_list): 
                    if word not in tech_count.keys():
                        tech_count.update({word:1})
                    else:
                        tech_count[word] += 1
        if insert == True: 
            requests = [InsertOne({f"{insert_key}":tech_count,'role':self.role.title})]
            self.db.db[col].bulk_write(requests)
            print('successful insertion')
        else: 
            return tech_count

    def do_analysis(self,keyword_path,col = None ):
        urls = []
        #load tech lits
        tech_list = list(pd.read_csv(keyword_path ,index_col=[0]).iloc[:,0].str.lower())
        #load all text 
        model_outs = self.db.db[col].find({},{"text":1, "_id":0,'urls':1 })
        #iterate through text, for items in list
        for text_dict in model_outs:
            found = []
            text = text_dict.get("text")
            text = self.cleanse_sentence(text)
            for word in text.split(" "):
                if word in tech_list:
                   found.append(word)
            urls.append({'url':text_dict.get("urls"), 'tech_list': found})
        return urls


            

    def strip_digits_from_corupus(self,text):
        subs = re.sub("[\d+][+-]", "",text)
        subs = re.sub("[’']", "",subs)
        return(subs)
        
    def bigram_analysis(self,df,thresh = 5,insert = False,col = "bigrams"):
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        corpus_list = [self.strip_digits_from_corupus(sentence) for sentence in df.text]
        corpus = ' '.join(corpus_list)
        finder = BigramCollocationFinder.from_words(corpus.lower().split(" "),window_size=2)
        finder.apply_freq_filter(thresh)
        bigram_results = finder.score_ngrams(bigram_measures.pmi)
        if insert == True: 
            requests =  [InsertOne({"bigram":x[0],"pmi":x[1],'role':self.role.title}) for x in bigram_results]
            self.db.db[col].write(requests)
            print("successful insertion")
        else: 
            return bigram_results 

    def store_analysis(self,db,data:dict):
        [{"bigram":x[0],"pmi":x[1]} for x in analysis.bigram_analysis(df = query_df)]
        requests = [InsertOne(x) for x in data]
        scrape_table.collection.bulk_write(requests)
        return None

In [52]:
client = MongoClient()
db = Mongo(client)
analysis = Analysis_Processing(db,Role("Data Science") )
inserts = analysis.do_analysis(r"C:\Users\Emiliano\Documents\Git\DataScienceReq\data\languages.txt" ,col = 'model_outputs')
# query_df = analysis.get_data()
# counters = analysis.text_count(query_df)
# path_tech = r"C:\Users\Emiliano\Documents\Git\DataScienceReq\data\languages.txt"
# tech_count = analysis.per_role_analysis(path_tech)
# path_packages = r'C:\Users\Emiliano\Documents\Git\DataScienceReq\data\DS_packages.txt'
# analysis.per_role_analysis(path_packages ,col = "packages",insert_key = "packages")

In [60]:
[x for x in inserts if x.get("tech_list")][1:3]

[{'url': 'https://ca.linkedin.com/jobs/view/data-scientist-at-people-machine-2778956194?refId=cdDjEuJS6WBpy01lwjdWjw%3D%3D&trackingId=gfPYR6gG2WESTVnOjY8srg%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card',
  'tech_list': ['python', 'r', 'sql']},
 {'url': 'https://ca.linkedin.com/jobs/view/data-applied-scientist-at-microsoft-2770518771?refId=cdDjEuJS6WBpy01lwjdWjw%3D%3D&trackingId=RdwqUgyKzXFjoh2WAbKzug%3D%3D&position=5&pageNum=0&trk=public_jobs_jserp-result_search-card',
  'tech_list': ['python', 'c', 'c', 'java', 'scala']}]

In [61]:
insert = [x for x in inserts if x.get("tech_list") ]
client.prod.techs.insert_many(insert)

<pymongo.results.InsertManyResult at 0x1b8e9dfe580>

In [None]:
# pipe = [{
#    '$lookup':
#      {
#        'from': "Scraped_Data",
#       'localField': "urls",
#       'foreignField':"url",
#       'as': "test"
#      }
# },
# {'$match': {"test.country": "US" }}
# ]

# query_df = analysis.get_data(pipe = pipe,col = 'model_outputs')

[{'_id': ObjectId('617d5d5979ce654f9090f711'), 'text': 'Using verbal and written communication skills to convey basic, routine factual information about day-to-day activities to others who are fully knowledgeable in the subject area.', 'urls': 'https://ca.linkedin.com/jobs/view/machine-learning-engineer-at-qualcomm-2771065339?refId=cdDjEuJS6WBpy01lwjdWjw%3D%3D&trackingId=5LRDikTHyQZbsiZGjSU3Pw%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card', 'role': 'Data Science', 'test': [{'_id': ObjectId('617c88a1fbfa44c2cc99e639'), 'title': 'Machine Learning ', 'url': 'https://ca.linkedin.com/jobs/view/machine-learning-engineer-at-qualcomm-2771065339?refId=cdDjEuJS6WBpy01lwjdWjw%3D%3D&trackingId=5LRDikTHyQZbsiZGjSU3Pw%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card', 'location': 'Markham, Ontario, Canada', 'company': 'Qualcomm', 'city': 'Markham', 'region': 'Ontario', 'country': 'US', 'role': 'machine learning engineer', 'date': '2021-10', 'description': "Co

Exception: DATA FRAME EMPTY