In [10]:
import pandas as pd

'''
Loading Gensim and nltk libraries
'''

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

In [15]:
df= pd.read_csv('Arabic-Original.csv', sep='|', header=None)

df.head()

Unnamed: 0,0,1,2
0,1,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
1,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
2,1,3,الرَّحْمَٰنِ الرَّحِيمِ
3,1,4,مَالِكِ يَوْمِ الدِّينِ
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ


In [16]:
df.columns

Int64Index([0, 1, 2], dtype='int64')

#### Data Preprocessing

We will perform the following steps:

**Tokenization**: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation. <br>
Words that have fewer than 3 characters are removed. <br>
All stopwords are removed. <br>
**Words are lemmatized** - words in third person are changed to first person and verbs in past and future tenses are changed into present. <br>
Words are stemmed - words are reduced to their root form.

In [26]:
'''
Loading Gensim and nltk libraries
'''

import gensim
from gensim.utils import simple_preprocess
# from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer, ISRIStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [13]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ABUTON\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and Lemmatize
def preprocess(text):
    import re
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in nltk.corpus.stopwords.words('arabic') and len(token) > 3:
#             a = lemmatize_steming(re.sub('[~]', '', token))
            result.append(lemmatize_stemming(token))
            
    return result

In [23]:
df.reset_index(inplace=True)

In [24]:
df.head()

Unnamed: 0,index,0,1,2
0,0,1,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
1,1,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
2,2,1,3,الرَّحْمَٰنِ الرَّحِيمِ
3,3,1,4,مَالِكِ يَوْمِ الدِّينِ
4,4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ


In [34]:
'''
Preview a document after preprocessing
'''

document_num = 4310
doc_sample = df[df['index'] == document_num].values[0][-1]

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['وَالَّذِينَ', 'إِذَا', 'أَصَابَهُمُ', 'الْبَغْيُ', 'هُمْ', 'يَنْتَصِرُونَ']


Tokenized and lemmatized document: 
[]


In [30]:
df[df['index'] == document_num].values[0][-1]

'وَالَّذِينَ إِذَا أَصَابَهُمُ الْبَغْيُ هُمْ يَنْتَصِرُونَ'

## Testing MechanicalSoup

In [35]:
import mechanicalsoup

browser = mechanicalsoup.StatefulBrowser()

url = "https://www.google.com/search?q=cat&sxsrf=ALeKk02OouVpPFDXjIrDvJmBUg6nnDeDAw:1610807126259&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj3nr-U1KDuAhW3QRUIHXhFBCgQ_AUoAXoECBQQAw&biw=1366&bih=695"
browser.open(url)

<Response [200]>

In [36]:
# get HTML
browser.get_current_page()

<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
<html lang="en-NG" xmlns="http://www.w3.org/1999/xhtml"><head><meta content="application/xhtml+xml; charset=utf-8" http-equiv="Content-Type"/><meta content="no-cache" name="Cache-Control"/><title>cat - Google Search</title><style>a{text-decoration:none;color:inherit}a:hover{text-decoration:underline}a img{border:0}body{font-family:Roboto,Helvetica,Arial,sans-serif;padding:8px;margin:0 auto;max-width:700px;min-width:240px;}.FbhRzb{border-left:thin solid #dadce0;border-right:thin solid #dadce0;border-top:thin solid #dadce0;height:40px;overflow:hidden}.n692Zd{margin-bottom:10px}.cvifge{height:40px;border-spacing:0}.QvGUP{height:40px;padding:0 8px 0 8px;vertical-align:top}.O4cRJf{height:40px;width:100%;padding:0;padding-right:16px}.O1ePr{height:40px;padding:0;vertical-align:top}.kgJEQe{height:36px;width:98px;vertical-align:top;margin-top:4px}.lXLRf{vertical-align:top}.MhzMZd{bord

In [37]:
# target the search input
browser.select_form()
browser.get_current_form().print_summary()

<input name="biw" type="hidden" value="1366"/>
<input name="bih" type="hidden" value="695"/>
<input name="ie" type="hidden" value="ISO-8859-1"/>
<input name="tbm" type="hidden" value="isch"/>
<input name="oq" type="hidden"/>
<input name="aqs" type="hidden"/>
<input class="MhzMZd" name="q" type="text" value="cat"/>
<input class="xB0fq" type="submit" value="Search"/>


In [38]:
# search for a term
search_term = 'cat'
browser['q'] = search_term

In [39]:
type(browser)

mechanicalsoup.stateful_browser.StatefulBrowser

In [40]:
# submit/ "click" search
browser.launch_browser()


In [41]:
response = browser.submit_selected()

In [42]:
print('new_url: ', browser.get_url())
print('response: \n', response.text[:500])

new_url:  https://www.google.com/search?biw=1366&bih=695&ie=ISO-8859-1&tbm=isch&oq=&aqs=&q=cat
response: 
 <!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd"><html xmlns="http://www.w3.org/1999/xhtml" lang="en-NG"><head><meta content="application/xhtml+xml; charset=UTF-8" http-equiv="Content-Type"/><meta content="no-cache" name="Cache-Control"/><title>cat - Google Search</title><style>a{text-decoration:none;color:inherit}a:hover{text-decoration:underline}a img{border:0}body{font-family:Roboto,Helvetica,Arial,sans-serif;padding:8px;margin:0 a


Navigate to the new pages and target all the images, output will be returned as URLs list

In [43]:
# open URL
new_url = browser.get_url()
browser.open(new_url)

<Response [200]>

In [44]:
# get HTML code
page = browser.get_current_page()
all_images = page.find_all('img')

In [46]:
type(all_images)

bs4.element.ResultSet

In [47]:
# target the source attributes of image
img_src = []
for image in all_images:
    image = image.get('src')
    img_src.append(image)

In [48]:
len(img_src)

21

In [50]:
img_src[:10]

['/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTD9EBJQoQP6dwl7Vq_LaYshi7rJ0Gy7Lp1IT--g4t4mquU0E9ul-9Fj8f6H4o&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTBU9Fx1oykyNxQLIuB8KyUBNhC0rqjjCIpQB8sUkethHM_MpNtFqL-7kMhzw&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR1M6cnFrSeLZ4Mhu6G7M4_1hxuM1Svg-QJ2dnBYv5kM1Kg5GDzx91AZccX-Q&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS0VgOQNStRu2FMOu93paswZZzCBmmNcBz43URlXKhL3bqjNsYUb-gaB-dKOWs&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTV3-zBg2kpIl_oDZMC0gNdGplc5zMRQEI418ZJ26dnzSEQ4hRTyEDE71pJCA&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQpEPs6OzeZseb-LvnqZGty5j4lTd_Kv18gPvL87DhhlNSwtPmAT6diUT7eqJc&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR7UDPBCebpUM7tMLJEJwrU_6DW6OBmLaLBVSmWM8YXq7ChWw2fPbR4ZrKEiA&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTGP5Zmc9uybitaj

Fixing the Corrupted URLs

In [51]:
# py startswith function to remove all element not having HTTPS

# save cleaned links in img_src

img_src = [img for img in img_src if img.startswith('https')]

print(img_src)

['https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTD9EBJQoQP6dwl7Vq_LaYshi7rJ0Gy7Lp1IT--g4t4mquU0E9ul-9Fj8f6H4o&s', 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTBU9Fx1oykyNxQLIuB8KyUBNhC0rqjjCIpQB8sUkethHM_MpNtFqL-7kMhzw&s', 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR1M6cnFrSeLZ4Mhu6G7M4_1hxuM1Svg-QJ2dnBYv5kM1Kg5GDzx91AZccX-Q&s', 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS0VgOQNStRu2FMOu93paswZZzCBmmNcBz43URlXKhL3bqjNsYUb-gaB-dKOWs&s', 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTV3-zBg2kpIl_oDZMC0gNdGplc5zMRQEI418ZJ26dnzSEQ4hRTyEDE71pJCA&s', 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQpEPs6OzeZseb-LvnqZGty5j4lTd_Kv18gPvL87DhhlNSwtPmAT6diUT7eqJc&s', 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR7UDPBCebpUM7tMLJEJwrU_6DW6OBmLaLBVSmWM8YXq7ChWw2fPbR4ZrKEiA&s', 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTGP5Zmc9uybitajY-AsRj99o33w7qJMmN6RSoaD4BI4ZaM7qNqtd73bWL8LXw&s', 'https://encrypted-tbn0.gstatic.com

In [53]:
# create a local repo to store cat images

import os

path = os.getcwd()
path = os.path.join(path, search_term + "s")
print(path)

C:\Users\ABUTON\Desktop\ML_PATH\10Acad AfterMath\Modelling Topics\cats


In [54]:
# create the directory
os.mkdir(path)
# print path where cats images are saving at
path

'C:\\Users\\ABUTON\\Desktop\\ML_PATH\\10Acad AfterMath\\Modelling Topics\\cats'

In [55]:
# download images
import wget
counter = 0
for img in img_src:
    save_as = os.path.join(path, search_term +str(counter) + '.jpg')
    wget.download(img, save_as)
    counter += 1

100% [................................................................................] 3858 / 3858

In [58]:
url = 'https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=&locT=C&locId=2285502&jobType=&context=Jobs&sc.keyword=Data+Scientist+Intern&dropdown=0'

In [59]:
browser.open(url)

<Response [403]>

In [61]:
import requests
from bs4 import BeautifulSoup

In [175]:
def extract(page):
#     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'}
    url = f'https://www.indeed.co.uk/jobs?q=data+scientist&l=London,+Greater+London&start={page}'
    try:
        browser.open(url)
        soup = browser.get_current_page()
    except Exception as e:
        print(f'check your internet connection {e}')
    return soup

In [176]:
def transform(soup):
    divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_='company').text.strip()
#         rating = item.find('span', {'id': 'text'}).text.strip()
        location = item.find('span', class_ = 'location').text.strip()
        try:
            salary = item.find('span', 'salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', {'class': 'summary'}).text.strip().replace('\n', '')
        
        jobs = {
            'title':title,
            'company':company,
            'location': location,
            'salary':salary,
            'summary': summary
        }
        
        joblist.append(jobs)
    return

In [185]:
joblist = []

for i in range(0, 100, 10):
    print(f'Getting jobs post {i}')
    soup = extract(i)
    transform(soup)

Getting jobs post 0
Getting jobs post 10
Getting jobs post 20
Getting jobs post 30
Getting jobs post 40
Getting jobs post 50
Getting jobs post 60
Getting jobs post 70
Getting jobs post 80
Getting jobs post 90


In [186]:
len(joblist)

100

In [187]:
import pandas as pd

df = pd.DataFrame(joblist)
df.head()

Unnamed: 0,title,company,location,salary,summary
0,Data Scientist,infarm,London WC2N 5DU,,Support other teams to constantly improve our ...
1,Graduate Program - Associate Data Scientist,RELX Group,London,,"Create data dictionaries, manage complex entit..."
2,Data Scientist,NHS Midlands and Lancashire Commissioning Supp...,London SE1 6LH,"£24,907 - £30,615 a year",Good knowledge of R. This is essential to mani...
3,Junior Data Scientist,Novafutur,London,,Being a data scientist would require the abili...
4,Data Scientist,Deliveroo,London,,Data scientists at Deliveroo report into our d...


In [189]:
df['location'].nunique()

19

In [190]:
df['salary'].isna().sum()

0

In [191]:
df['salary'].value_counts()

                              76
£31,350 - £32,315 a year       2
£35,000 - £65,000 a year       1
£24,907 - £30,615 a year       1
£53,168 - £62,001 a year       1
£85,000 - £95,000 a year       1
£400 - £500 a week             1
£17,000 a year                 1
£28,996 a year                 1
£75,000 - £80,000 a year       1
£75,000 - £95,000 a year       1
£40,000 - £45,000 a year       1
£55,000 - £65,000 a year       1
£46,000 - £57,000 a year       1
£100,000 - £125,000 a year     1
£38,000 - £50,000 a year       1
£54,700 - £60,635 a year       1
£31,000 - £41,000 a year       1
£450 - £500 a day              1
£43,000 a year                 1
£500 - £600 a day              1
£45,000 - £60,000 a year       1
£49,000 - £80,000 a year       1
£450 - £550 a day              1
Name: salary, dtype: int64

In [200]:
lower_bound_salary = [x.split(' ')[0].strip().replace('£','').replace(',', '') for x in df['salary']]
# upper_bound_salary = [x.split('-')[1].split('')[0].strip().replace('£','').replace(',', '') for x in df['salary']]

In [204]:
a = '£31,350 - £32,315 a year'
b = '£28,996 a year '
if '-' in b:
    print('yes')
else:
    v = b.split(' ')[0]
    print(v)

£28,996


In [225]:
upper_bound_salary, lower = [], []
for x in df['salary']:
    if '-' in x:
        a = x.split(' ')[0].strip().replace('£','').replace(',', '')
        b = x.split('-')[1].split(' ')[1].strip().replace('£','').replace(',', '')
        upper_bound_salary.append(b)
        lower.append(a)
    else:
        b = x.split(' ')[0].strip().replace('£','').replace(',', '')
        a = x.split(' ')[0].strip().replace('£','').replace(',', '')
        upper_bound_salary.append(b)
        lower.append(a)

In [226]:
for i in range(20):
    print(lower[i], upper_bound_salary[i])

# a = '£31,350 - £32,315 a year'
# if '-' in a:
#     print(a.split(' ')[0])
#     print(a.split('-')[1].split(' ')[1])
# else:
#     print(a.split('-')[1].split(' ')[0])

 
 
24907 30615
 
 
 
53168 62001
 
 
43000 43000
 
 
 
85000 95000
75000 95000
 
450 500
 
 
 


In [221]:
a.split('-')[1].

' £32,315 a year'