In [123]:
import datetime
import re

import numpy as np
import pandas as pd
from pandas import json_normalize

import requests

In [2]:
# Maps each school slug (inserted into the SwitchUp API URL path) to the numeric
# id that get_school_info sends as the `bootcampId` query parameter.
school_dic = {
    "ironhack" : 10828,
    "app-academy" : 10525,
    "springboard" : 11035,
    "le-wagon" : 10868,
    "general-assembly" : 10761,
    "hackwagon-academy" : 10792,
    "udacity" : 11118,
    "thinkful" : 11098,
    "nyc-data-science-academy" : 10925
}

In [3]:
# Regexes used for removing markup tags and the decimal part of a year string

# Matches one markup tag: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')

# Matches a literal "." followed by any single character (e.g. the ".0" in "2020.0").
# NOTE(review): the pattern is unanchored and consumes only one character after
# the dot, so "2020.25" would become "20205" -- confirm inputs only ever end in ".0".
YEAR_RE = re.compile(r'\..')

In [4]:
def remove_tags(x):
    """Return the string ``x`` with every ``TAG_RE`` match deleted."""
    without_tags = TAG_RE.sub(repl='', string=x)
    return without_tags

In [6]:
def get_comments_school(school, timeout=30):
    """
    Request the review list of one school from the SwitchUp API.

    Parameters
    ----------
    school : str
        School slug appended to the ``/bootcamps/`` path of the request URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """

    # URL is built dynamically from the school slug so competitors can be scraped too.
    # NOTE(review): `page=3` is hard-coded together with `perPage=10000` -- confirm
    # this is the page that is actually wanted.
    url = ("https://www.switchup.org/chimera/v1/school-review-list?mainTemplate=school-review-list&path=%2Fbootcamps%2F"
           + school
           + "&isDataTarget=false&page=3&perPage=10000&simpleHtml=true&truncationLength=250")

    response = requests.get(url, timeout=timeout)

    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()

    return response.json()

In [5]:
def comments_to_dataframe(json_file, school):
    """
    Turn one school's review API response into a DataFrame.

    The records under ``json_file["content"]["reviews"]`` become the rows.
    Two columns are appended: ``review_body`` (the ``body`` column passed
    through ``remove_tags``) and ``school`` (the given school name).
    """
    raw_reviews = json_file["content"]["reviews"]

    return pd.DataFrame(raw_reviews).assign(
        review_body=lambda frame: frame["body"].apply(remove_tags),
        school=school,
    )

### Retrieve data and convert it to Pandas DataFrame

In [7]:
# Scrape the reviews of every school in school_dic and stack them into one frame.
comments = pd.concat(
    [comments_to_dataframe(get_comments_school(slug), slug) for slug in school_dic]
)

In [10]:
def get_school_info(school, school_id, timeout=30):
    """
    Request the descriptive data of one school from the SwitchUp API.

    Parameters
    ----------
    school : str
        School slug appended to the ``/bootcamps/`` path of the request URL.
    school_id : int or str
        Value sent as the ``bootcampId`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    url = ("https://www.switchup.org/chimera/v1/bootcamp-data?mainTemplate=bootcamp-data%2Fdescription&path=%2Fbootcamps%2F"
           + str(school) + "&isDataTarget=false&bootcampId="
           + str(school_id)
           + "&logoTag=logo&truncationLength=250&readMoreOmission=...&readMoreText=Read%20More&readLessText=Read%20Less")

    response = requests.get(url, timeout=timeout)

    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()

    return response.json()

In [11]:
def info_to_dataframe(data, school, school_id):
    """
    Split one school's info payload into four DataFrames.

    Reads ``courses``, ``locations``, ``meritBadges``, ``webaddr``,
    ``description`` and ``logoUrl`` from ``data["content"]`` and appends a
    ``school`` and a ``school_id`` column to every resulting frame.

    Returns
    -------
    tuple
        ``(locations_df, courses_df, badges_df, school_df)``.
    """
    content = data["content"]

    # One row per entry of the course list, in a single "courses" column.
    courses_df = pd.DataFrame(content["courses"], columns=["courses"])

    # Nested location records are flattened into one column per leaf field.
    locations_df = json_normalize(content["locations"])

    badges_df = pd.DataFrame(content["meritBadges"])

    # Single-row frame with the school-level attributes: the three values are
    # laid out as a labelled column and then transposed into one row.
    school_values = [content["webaddr"], content["description"], content["logoUrl"]]
    school_df = pd.DataFrame(school_values, index=["website", "description", "LogoUrl"]).T

    # Tag every frame with the school name first, then its id.
    frames = (locations_df, courses_df, badges_df, school_df)
    for frame in frames:
        frame["school"] = school
        frame["school_id"] = school_id

    return frames

### Extract school info

In [12]:
# Fetch and split the info payload of every school listed in school_dic.
school_infos = [
    info_to_dataframe(get_school_info(slug, slug_id), slug, slug_id)
    for slug, slug_id in school_dic.items()
]

### Convert extracted info into DataFrames

In [15]:
# Stack the per-school frames into one DataFrame per kind.
# Each entry of school_infos is a (locations, courses, badges, school) tuple,
# so zip(*school_infos) regroups the frames by kind. Iterating over the whole
# list (instead of a hard-coded range(8)) keeps every scraped school, including
# the ninth entry of school_dic.
locations, courses, badges, schools = (
    pd.concat(frames) for frames in zip(*school_infos)
)

In [16]:
def correct_year(x):
    """Return the string ``x`` with every ``YEAR_RE`` match deleted."""
    year_text = YEAR_RE.sub(repl='', string=x)
    return year_text

In [17]:
# Cleaning extracted comments

def clean_comments(comments):
    """
    Return a cleaned version of the scraped reviews DataFrame.

    Steps, in order: cast the score columns to float and ``id`` to object,
    parse ``queryDate`` as datetimes, normalise ``graduatingYear`` to a year
    string via ``correct_year``, drop the columns that are not analysed,
    rename ``review_body`` to ``Review`` and replace missing values (and
    empty ``jobTitle`` strings) with ``"not available"``.

    Parameters
    ----------
    comments : pandas.DataFrame
        Raw reviews; must contain every column referenced in this function.

    Returns
    -------
    pandas.DataFrame
        The cleaned reviews.

    Raises
    ------
    KeyError
        If one of the expected columns is missing (for example when the
        function is applied to an already cleaned frame).
    """

    # Change column types. graduatingYear is handled separately below: casting
    # it with astype(str) would turn missing values into the literal string
    # "nan", which the fillna() step could no longer recognise as missing.
    convert_dict = {"overallScore": float,
                    "curriculum": float,
                    "jobSupport": float,
                    "id": object}

    comments = comments.astype(convert_dict)

    comments["queryDate"] = pd.to_datetime(comments["queryDate"])

    # Stringify each present year and apply the year regex; na_action="ignore"
    # leaves missing years missing so they become "not available" further down.
    comments["graduatingYear"] = comments["graduatingYear"].map(
        lambda year: correct_year(str(year)), na_action="ignore"
    )

    # Drop unnecessary columns and rename the cleaned review text.
    drop_cols = ["user", "body", "createdAt", "comments", "hostProgramName"]
    comments = (
        comments
        .drop(drop_cols, axis=1)
        .rename(columns={"review_body": "Review"})
    )

    # Substitute nulls - fill missing values
    comments = comments.fillna("not available")

    # Empty job titles are treated like missing ones.
    comments["jobTitle"] = comments["jobTitle"].where(comments["jobTitle"] != "", "not available")

    return comments

In [18]:
# Cleaning locations DataFrame

def clean_locations(locations):
    
    # Change column types
    convert_dict = {"id": object,
                    "city.id": object,
                    "country.id": object,
                    "state.id": object}
    
    locations = locations.astype(convert_dict)
    
    # Fill null values
    locations = locations.fillna("not available")
    
    return locations

In [19]:
# Cleaning badges

def clean_badge_school(badges):
    """
    Strip markup tags from the ``description`` column.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    tag_free_description = badges["description"].apply(remove_tags)
    badges["description"] = tag_free_description

    return badges

### Apply cleaning functions

In [22]:
# Replace each raw frame with its cleaned version and a fresh 0..n-1 index.
# NOTE(review): clean_comments drops columns, so running this cell a second
# time on the already cleaned `comments` raises; re-run from the scraping cells.

comments = clean_comments(comments).reset_index(drop=True)

locations = clean_locations(locations).reset_index(drop=True)

badges = clean_badge_school(badges).reset_index(drop=True)

# `schools` also has a `description` column, so the badge cleaner is reused for it.
schools = clean_badge_school(schools).reset_index(drop=True)

## Print head of each DataFrame

In [23]:
locations.head()

Unnamed: 0,id,description,country.id,country.name,country.abbrev,city.id,city.name,city.keyword,state.id,state.name,state.abbrev,state.keyword,school,school_id
0,15901,"Berlin, Germany",57,Germany,DE,31156,Berlin,berlin,not available,not available,not available,not available,ironhack,10828
1,16022,"Mexico City, Mexico",29,Mexico,MX,31175,Mexico City,mexico-city,not available,not available,not available,not available,ironhack,10828
2,16086,"Amsterdam, Netherlands",59,Netherlands,NL,31168,Amsterdam,amsterdam,not available,not available,not available,not available,ironhack,10828
3,16088,"Sao Paulo, Brazil",42,Brazil,BR,31121,Sao Paulo,sao-paulo,not available,not available,not available,not available,ironhack,10828
4,16109,"Paris, France",38,France,FR,31136,Paris,paris,not available,not available,not available,not available,ironhack,10828


In [24]:
comments.head()

Unnamed: 0,id,name,anonymous,graduatingYear,isAlumni,jobTitle,tagline,queryDate,program,overallScore,overall,curriculum,jobSupport,Review,school
0,270846,Anonymous,True,2020,False,unemployed,Disappointed,2020-11-25,Web Development Bootcamp,2.7,3.0,4,1,I participated to the remote Web Dev bootcamp....,ironhack
1,270766,salim ameur,False,2020,False,frontend developer,"""An amazing experience !""",2020-11-23,Web Development Bootcamp,5.0,5.0,5,5,IRONHACK will be the best choice if you want t...,ironhack
2,270471,michael moyers,False,2020,False,Product Designer,Exceeded my expectations and beyond!,2020-11-16,UX/UI Design Bootcamp,5.0,5.0,5,5,I could not be more impressed with my experien...,ironhack
3,270296,Anonymous,True,2020,False,not available,Very intense and high reward bootcamp,2020-11-13,Web Development Bootcamp,4.7,5.0,4,5,Ironhack bootcamp is a unique experience where...,ironhack
4,270295,VALENTINE CONQ,False,2020,False,Product Designer,"Great bootcamp, amazing humans.",2020-11-13,UX/UI Design Bootcamp,5.0,5.0,5,5,I attended the UX/UI bootcamp at Ironhack and ...,ironhack


In [25]:
badges.head()

Unnamed: 0,name,keyword,description,school,school_id
0,Available Online,available_online,School offers fully online courses,ironhack,10828
1,Verified Outcomes,verified_outcomes,School publishes a third-party verified outcom...,ironhack,10828
2,Flexible Classes,flexible_classes,School offers part-time and evening classes,ironhack,10828
3,Available Online,available_online,School offers fully online courses,app-academy,10525
4,Flexible Classes,flexible_classes,School offers part-time and evening classes,app-academy,10525


In [26]:
schools.head()

Unnamed: 0,website,description,LogoUrl,school,school_id
0,www.ironhack.com/en,Ironhack is a global tech school with 9 campus...,https://d92mrp7hetgfk.cloudfront.net/images/si...,ironhack,10828
1,appacademy.io,App Academy is a coding school that offers onl...,https://d92mrp7hetgfk.cloudfront.net/images/si...,app-academy,10525
2,www.springboard.com/?utm_source=switchup&utm_m...,Springboard is an online school for learning 2...,https://d92mrp7hetgfk.cloudfront.net/images/si...,springboard,11035
3,www.lewagon.com,Le Wagon runs immersive coding bootcamps in 40...,https://d92mrp7hetgfk.cloudfront.net/images/si...,le-wagon,10868
4,generalassemb.ly/?utm_medium=affiliate-lead-gl...,General Assembly is creating a global communit...,https://d92mrp7hetgfk.cloudfront.net/images/si...,general-assembly,10761


In [27]:
courses.head()

Unnamed: 0,courses,school,school_id
0,Data Analytics Bootcamp,ironhack,10828
1,Data Analytics Part-Time,ironhack,10828
2,UX/UI Design Bootcamp,ironhack,10828
3,UX/UI Design Part-Time,ironhack,10828
4,Web Development Bootcamp,ironhack,10828


# Questions

---


* Which programs have the most reviews?
* In which areas does Le Wagon operate where Ironhack does not?
* Which reviews contain a given keyword (search the Review column)?

### Top 10 most commented programs

In [34]:
# Count the reviews per program and show the ten programs with the most reviews.
(
    comments.groupby(["program"])["Review"]
    .count()
    .to_frame()
    .sort_values(by="Review", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Review
program,Unnamed: 1_level_1
not available,1068
FullStack program - 35+ locations,1038
Software Engineer Track: In-Person,731
,633
Full-time Web Development Bootcamp,362
DATA SCIENCE,356
FullStack program,307
Data Science,262
Web Development Course - Full-Time,211
Data Science Career Track,202


In [36]:
list(comments["school"].unique())

['ironhack',
 'app-academy',
 'springboard',
 'le-wagon',
 'general-assembly',
 'hackwagon-academy',
 'udacity',
 'thinkful',
 'nyc-data-science-academy']

In [37]:
locations.head()

Unnamed: 0,id,description,country.id,country.name,country.abbrev,city.id,city.name,city.keyword,state.id,state.name,state.abbrev,state.keyword,school,school_id
0,15901,"Berlin, Germany",57,Germany,DE,31156,Berlin,berlin,not available,not available,not available,not available,ironhack,10828
1,16022,"Mexico City, Mexico",29,Mexico,MX,31175,Mexico City,mexico-city,not available,not available,not available,not available,ironhack,10828
2,16086,"Amsterdam, Netherlands",59,Netherlands,NL,31168,Amsterdam,amsterdam,not available,not available,not available,not available,ironhack,10828
3,16088,"Sao Paulo, Brazil",42,Brazil,BR,31121,Sao Paulo,sao-paulo,not available,not available,not available,not available,ironhack,10828
4,16109,"Paris, France",38,France,FR,31136,Paris,paris,not available,not available,not available,not available,ironhack,10828


In [39]:
# Location rows that belong to Le Wagon.
le_wagon = locations.loc[locations["school"].eq("le-wagon")]

In [40]:
# Location rows that belong to Ironhack.
iron_hack = locations.loc[locations["school"].eq("ironhack")]

In [108]:
# Country -> city lookup for Ironhack, built directly from the group keys.
# The keys are (country, city) pairs in sorted order; dict() keeps a single
# entry per country, so for a country with several cities only the last city
# in that order is retained. An empty selection yields an empty dict.
iron_hack_mapping = dict(
    iron_hack.groupby(["country.name", "city.name"]).size().index.tolist()
)

In [109]:
# Country -> city lookup for Le Wagon, built directly from the group keys.
# The keys are (country, city) pairs in sorted order; dict() keeps a single
# entry per country, so for a country with several cities only the last city
# in that order is retained. An empty selection yields an empty dict.
le_wagon_mapping = dict(
    le_wagon.groupby(["country.name", "city.name"]).size().index.tolist()
)

In [115]:
# Countries (with the single city stored for each in le_wagon_mapping) where
# Le Wagon operates but Ironhack does not.

{
    country: le_wagon_mapping[country]
    for country in le_wagon_mapping.keys() - iron_hack_mapping.keys()
}

{'Norway': 'Oslo',
 'Japan': 'Tokyo',
 'Chile': 'Santiago',
 'Canada': 'Toronto',
 'Argentina': 'Buenos Aires',
 'South Korea': 'Seoul',
 'Turkey': 'Istanbul',
 'Morocco': 'Casablanca',
 'Singapore': 'Singapore',
 'Belgium': 'Brussels',
 'Denmark': 'Copenhagen',
 'United Arab Emirates': 'Dubai',
 'Indonesia': 'Bali',
 'Australia': 'Melbourne',
 'Switzerland': 'Lausanne',
 'England': 'London',
 'Italy': 'Rome',
 'Sweden': 'Stockholm',
 'China': 'Shenzhen',
 'Israel': 'Tel Aviv'}

### Countries where Le Wagon operates but Ironhack does not

In [45]:
# Country names present for Le Wagon and absent for Ironhack.
set(le_wagon["country.name"].unique()).difference(iron_hack["country.name"].unique())

{'Argentina',
 'Australia',
 'Belgium',
 'Canada',
 'Chile',
 'China',
 'Denmark',
 'England',
 'Indonesia',
 'Israel',
 'Italy',
 'Japan',
 'Morocco',
 'Norway',
 'Singapore',
 'South Korea',
 'Sweden',
 'Switzerland',
 'Turkey',
 'United Arab Emirates'}

### Cities where Le Wagon operates but Ironhack does not

In [46]:
# City names present for Le Wagon and absent for Ironhack.
# Names are stripped first so that stray whitespace (e.g. "Rio de Janeiro ")
# cannot make an otherwise identical city look unique to one school.
set(le_wagon["city.name"].str.strip().unique()) - set(iron_hack["city.name"].str.strip().unique())

{'Bali',
 'Belo Horizonte',
 'Bordeaux',
 'Brussels',
 'Buenos Aires',
 'Casablanca',
 'Chengdu',
 'Copenhagen',
 'Dubai',
 'Istanbul',
 'Kyoto',
 'Lausanne',
 'Lille',
 'London',
 'Lyon',
 'Marseille',
 'Melbourne',
 'Milan',
 'Montreal',
 'Munich',
 'Nantes',
 'Oslo',
 'Rennes',
 'Rio de Janeiro ',
 'Rome',
 'Santiago',
 'Seoul',
 'Shanghai',
 'Shenzhen',
 'Singapore',
 'Stockholm',
 'Tel Aviv',
 'Tokyo',
 'Toronto'}

# Algorithm to find keywords in the Review column

---

1) Define the keywords you want to find in the Review column, e.g. "fine", "worst" or "good" — the choice depends on what you want to analyse.

2) use Pandas .str.contains(keyword) to find if your keyword is in each row of Review column

3) If your keyword is inside Review column (here I mean each row) then mark this row as 1, otherwise mark it as 0

4) Make some analysis on the new column you created in step 3

In [119]:
# Keyword looked up in the Review column by the following cells.
KEY_WORD = "good"

In [121]:
# Preview of the match: True where the review text contains KEY_WORD.
# By default str.contains treats KEY_WORD as a case-sensitive regular expression.
comments["Review"].str.contains(KEY_WORD)

0        True
1        True
2        True
3        True
4       False
        ...  
7915    False
7916    False
7917    False
7918    False
7919    False
Name: Review, Length: 7920, dtype: bool

In [125]:
# Flag reviews that mention KEY_WORD: 1 if the text contains it, otherwise 0.
# regex=False matches the keyword literally (so characters such as "+" or "("
# are safe), and na=False makes a missing review count as "no match" instead
# of being treated as truthy by np.where.
# NOTE(review): the match is case-sensitive, so "Good" is not counted for
# KEY_WORD = "good" -- pass case=False if that is not intended.
comments["contains_my_keyword"] = np.where(
    comments["Review"].str.contains(KEY_WORD, regex=False, na=False), 1, 0
)

In [127]:
comments.head(5)

Unnamed: 0,id,name,anonymous,graduatingYear,isAlumni,jobTitle,tagline,queryDate,program,overallScore,overall,curriculum,jobSupport,Review,school,contains_my_keyword
0,270846,Anonymous,True,2020,False,unemployed,Disappointed,2020-11-25,Web Development Bootcamp,2.7,3.0,4,1,I participated to the remote Web Dev bootcamp....,ironhack,1
1,270766,salim ameur,False,2020,False,frontend developer,"""An amazing experience !""",2020-11-23,Web Development Bootcamp,5.0,5.0,5,5,IRONHACK will be the best choice if you want t...,ironhack,1
2,270471,michael moyers,False,2020,False,Product Designer,Exceeded my expectations and beyond!,2020-11-16,UX/UI Design Bootcamp,5.0,5.0,5,5,I could not be more impressed with my experien...,ironhack,1
3,270296,Anonymous,True,2020,False,not available,Very intense and high reward bootcamp,2020-11-13,Web Development Bootcamp,4.7,5.0,4,5,Ironhack bootcamp is a unique experience where...,ironhack,1
4,270295,VALENTINE CONQ,False,2020,False,Product Designer,"Great bootcamp, amazing humans.",2020-11-13,UX/UI Design Bootcamp,5.0,5.0,5,5,I attended the UX/UI bootcamp at Ironhack and ...,ironhack,0


In [None]:
# Keywords counted as positive. The plural name matches NEGATIVE_KEY_WORDS;
# the original singular name is kept as an alias of the same list.
POSITIVE_KEY_WORDS = ["good", "fine", "excellent", "brilliant"]
POSITIVE_KEY_WORD = POSITIVE_KEY_WORDS

In [None]:
# Keywords counted as negative -- presumably for the same Review keyword
# search; not used by any cell visible here yet.
NEGATIVE_KEY_WORDS = ["bad", "worst"]