### Loading Required Libraries

In [1]:
import os
from sqlite3 import connect

import pandas as pd

from modules import PreProcessing, functions

# Re-import the project modules so that edits made to them on disk are
# picked up even if this notebook's kernel was already running.

from importlib import reload
reload(PreProcessing)
reload(functions)

# Root directory of the project: the chromedriver binary, course_info.csv and
# the reviews/ folder are all resolved relative to it.
# Set the PROJECT_509_DIR environment variable to run the notebook on another
# machine; the original location is kept as the default.
project_dir = os.environ.get('PROJECT_509_DIR', '/Users/nima/repository/Project_509')

### Section 1 - Data Collection

In [2]:
# Browser session driven by the chromedriver binary kept in the project folder,
# wrapped by the scraper object that the collection loop below reads pages with.
driver = PreProcessing.ChromeDriver(driver_path=f'{project_dir}/chromedriver')
scrapper = PreProcessing.WebScraper(driver)

# Throttle for the collection loop: configured with a 3 to 6 second range so
# that page retrievals are spaced out by a random pause.
retrieval_control = functions.RetrievalControl(
    min_value=3,
    max_value=6,
)

In [3]:
# Courses whose reviews should be collected, one identifier per course.

course_names = [
    'ai-for-everyone',
    'excel-essentials',
    'introduction-cybersecurity-cyber-attacks',
]

In [4]:
# For every course: open its main page, record the course metadata, then walk
# through its review pages and append each page's reviews to a per-course file.
for course_name in course_names:
    course = PreProcessing.Course(course_name)
    
    # presumably assigning driver.url navigates the browser to that page --
    # TODO confirm against PreProcessing.ChromeDriver
    driver.url = course.main_page_link
    
    course.title = scrapper.extract_course_info('title')
    
    # Click on "view all" to expand the info section and get the full about text
    # NOTE(review): '_1wziqqgx' looks like a site-generated class name and may
    # change; the recorded run printed "There was no button to click."
    driver.click_element(by_class_name='_1wziqqgx')
    course.about_text = scrapper.extract_course_info('about_text')
        
    course.available_review_pages = scrapper.extract_course_info('available_review_pages')
    
    # If only a subset of the reviews needs to be retrieved, set the pages to
    # retrieve explicitly, for example only get reviews from
    # page 1, 5 and 10 instead of all pages:
    # course.available_review_pages = [1, 5, 10]
    
    course.save_course_info_to_file(file_location=project_dir)

    # Bare attribute access, not a call: the recorded output shows a
    # "Wait N seconds!" message at this point, so `wait` is presumably a
    # property that pauses -- TODO confirm in functions.RetrievalControl
    retrieval_control.wait
    
    for page_num in course.available_review_pages:

        driver.url = course.page_link(page_num)
        reviews_in_page = scrapper.extract_reviews_from_current_page()
    
        # Reviews are written page by page into the reviews/ folder; the
        # recorded output reports a "<course_name>.csv" file per course
        course.save_reviews_to_file(
                this_page_reviews=reviews_in_page, 
                directory=project_dir + '/reviews/')
        
        # pause again before requesting the next review page
        retrieval_control.wait

There was no button to click.
Information about ai-for-everyone with title of Back to AI For Everyone was added.

Wait 6.84 seconds!
ai-for-everyone.csv now contains reviews from page 1

Wait 3.21 seconds!
ai-for-everyone.csv now contains reviews from page 2

Wait 6.38 seconds!
ai-for-everyone.csv now contains reviews from page 3

Wait 4.42 seconds!
ai-for-everyone.csv now contains reviews from page 4

Wait 3.13 seconds!
ai-for-everyone.csv now contains reviews from page 5

Wait 5.24 seconds!
ai-for-everyone.csv now contains reviews from page 6

Wait 4.74 seconds!
ai-for-everyone.csv now contains reviews from page 7

Wait 5.45 seconds!
ai-for-everyone.csv now contains reviews from page 8

Wait 3.92 seconds!
ai-for-everyone.csv now contains reviews from page 9

Wait 4.07 seconds!
ai-for-everyone.csv now contains reviews from page 10

Wait 4.32 seconds!
ai-for-everyone.csv now contains reviews from page 11

Wait 6.10 seconds!
ai-for-everyone.csv now contains reviews from page 12

Wait 5.

### Section 2 - Gather all data into a database
Storing everything in a database makes it possible to query the data with SQL statements <br>
and to share it with others as a single file.

In [5]:
# Combine the scraped course information and all per-course review files into
# one SQLite database file.

# Target database file (relative to the notebook's working directory).
databse_file = 'database.db'

course_info = pd.read_csv(project_dir + '/course_info.csv')

# Only pick up CSV files (the folder may also contain entries such as
# .DS_Store) and sort them so that the generated review ids are reproducible.
review_files = sorted(
    name for name in os.listdir(project_dir + '/reviews/') if name.endswith('.csv')
)

# Read every file first and concatenate once at the end; concatenating inside
# the loop re-copies all previously loaded rows on every iteration.
review_frames = []

for file_path in review_files:
    course_review = pd.read_csv(project_dir + '/reviews/' + file_path, encoding='unicode_escape')
    # the file name without its extension identifies the course
    course_name = os.path.splitext(file_path)[0]
    course_review['course_name'] = course_name
    review_frames.append(course_review)

# fall back to an empty frame when no review file was found
all_reviews = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

# drop any columns that contain Unnamed in all_reviews
all_reviews = all_reviews.drop(columns=[col for col in all_reviews.columns if 'Unnamed' in col])


# change index name of all_reviews to id
# this will be used to juxtapose
# the original text of a review vs. the text after being cleaned
all_reviews.index.name = 'id'

# Build the tables in an in-memory database first; it is only copied to disk
# below when the target file does not exist yet.
# `conn` is intentionally left open in case later cells query it.
conn = connect(':memory:')
all_reviews.to_sql('reviews', conn)
course_info.to_sql('course', conn)


if os.path.exists(databse_file):
    # never overwrite an existing database file
    print('Database already exists!\n',
          'change %s to something else.\n' % databse_file,
          'This is to avoid loosing data from before!')
else:
    main = connect(databse_file)
    try:
        conn.backup(main)
        main.commit()
    finally:
        # release the file handle even if the backup fails
        main.close()

If the data has already been converted to a database, <br>
the block above can be skipped (apart from defining `databse_file`) and the database loaded directly.

In [8]:
# Sanity check: open the database file and make sure the reviews table
# can be queried.
main = connect(databse_file)

# First five reviews that more than 20 readers marked as helpful.
query = (
    "\n"
    "SELECT *\n"
    "FROM reviews \n"
    "WHERE helpful > 20 \n"
    "LIMIT 5\n"
)

pd.read_sql_query(query, main)

Unnamed: 0,id,course_name,review_date,reviewer,star,helpful,text
0,0,ai-for-everyone,2019-03-09,Arun,5.0,91.0,I got a comprehensive overview of what AI is a...
1,1,ai-for-everyone,2019-11-05,Abhilash R N,5.0,77.0,Highly recommended for anyone wanting to start...
2,2,ai-for-everyone,2019-03-25,Marcus B,5.0,73.0,"Brilliantly delivered, contains all the most i..."
3,3,ai-for-everyone,2019-05-11,H.y. C,5.0,65.0,I now are equipped with right attitude to AI p...
4,4,ai-for-everyone,2019-06-16,Aleena A,5.0,60.0,I got a comprehensive overview of what AI is a...
