# Part 1 - Combine Course Info with Requirements


## 1. Read the course schedule into a DataFrame

In [855]:
import pandas as pd
from bs4 import BeautifulSoup
import re


#read in course_schedule
with open("course_Schedule.html", "r", encoding="utf-8") as f:
    body = f.read()
dom= BeautifulSoup(body, "lxml")
dom.head()

#select div class=faculty 
type(dom.select('.faculty'))

for faculty in dom.select(".faculty"):
    row_items = faculty.select("li.row")

courses=[]
for i in range(len(row_items)):
    number_section=list(filter(lambda x: x!="" ,row_items[i].select("span")[0].getText().replace('\u200b', '').split(" ")))[0].strip().split("\n")
    course_name = " ".join([line.strip() for line in row_items[i].select("span")[1].text.strip().split("\n") if line.strip()])
    instructor=row_items[i].select("span")[2].getText().strip()
    time=row_items[i].select("span")[3].getText()
    #if more than two number_sections
    
    if len(number_section) > 1:
        #for each one, we make a copy of the data and append to courses
        for number_sec in number_section:
            course_info = [number_sec, course_name,instructor,time]
            courses.append(course_info)
    else:
        courses.append([number_section, course_name, instructor, time])

course_schedule_df=pd.DataFrame(courses,columns=["Number-Section","Name","Instructor","Time"])
course_schedule_df[['Number','Section']]=course_schedule_df["Number-Section"].str.extract(r'(.*)-(\d{3})')
course_schedule_df

Unnamed: 0,Number-Section,Name,Instructor,Time,Number,Section
0,CSCI-GA.1170-001,Fundamental Algorithms,Chee Yap,W 4:55-6:55PM,CSCI-GA.1170,001
1,DS-GA.1170-001,Fundamental Algorithms,Chee Yap,W 4:55-6:55PM,DS-GA.1170,001
2,CSCI-GA.1170-002,Fundamental Algorithms Recitation,Bingwei Zhang,R 5:55-6:45PM,CSCI-GA.1170,002
3,DS-GA.1170-002,Fundamental Algorithms Recitation,Bingwei Zhang,R 5:55-6:45PM,DS-GA.1170,002
4,CSCI-GA.1170-003,Fundamental Algorithms Recitation,Bingwei Zhang,F 4:55-5:45PM,CSCI-GA.1170,003
...,...,...,...,...,...,...
347,(16906),Special Topics: Agile Software Development and...,Amos Bloomberg,MW 11:00-12:15PM,,
348,CSCI-UA.0480-075,Special Topics: Introduction to Deep Learning,Alfredo Canziani,TR 12:30-1:45PM,CSCI-UA.0480,075
349,(23956),Special Topics: Introduction to Deep Learning,Alfredo Canziani,TR 12:30-1:45PM,,
350,CSCI-UA.0490-001,Special Topics in Programming Languages,Benjamin Goldberg,MW 3:30-4:45PM,CSCI-UA.0490,001


In [856]:
display(course_schedule_df.info())
display(course_schedule_df.head(5))
display(course_schedule_df.tail(5))
display(course_schedule_df.sample(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352 entries, 0 to 351
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Number-Section  352 non-null    object
 1   Name            352 non-null    object
 2   Instructor      352 non-null    object
 3   Time            352 non-null    object
 4   Number          183 non-null    object
 5   Section         183 non-null    object
dtypes: object(6)
memory usage: 16.6+ KB


None

Unnamed: 0,Number-Section,Name,Instructor,Time,Number,Section
0,CSCI-GA.1170-001,Fundamental Algorithms,Chee Yap,W 4:55-6:55PM,CSCI-GA.1170,1
1,DS-GA.1170-001,Fundamental Algorithms,Chee Yap,W 4:55-6:55PM,DS-GA.1170,1
2,CSCI-GA.1170-002,Fundamental Algorithms Recitation,Bingwei Zhang,R 5:55-6:45PM,CSCI-GA.1170,2
3,DS-GA.1170-002,Fundamental Algorithms Recitation,Bingwei Zhang,R 5:55-6:45PM,DS-GA.1170,2
4,CSCI-GA.1170-003,Fundamental Algorithms Recitation,Bingwei Zhang,F 4:55-5:45PM,CSCI-GA.1170,3


Unnamed: 0,Number-Section,Name,Instructor,Time,Number,Section
347,(16906),Special Topics: Agile Software Development and...,Amos Bloomberg,MW 11:00-12:15PM,,
348,CSCI-UA.0480-075,Special Topics: Introduction to Deep Learning,Alfredo Canziani,TR 12:30-1:45PM,CSCI-UA.0480,75.0
349,(23956),Special Topics: Introduction to Deep Learning,Alfredo Canziani,TR 12:30-1:45PM,,
350,CSCI-UA.0490-001,Special Topics in Programming Languages,Benjamin Goldberg,MW 3:30-4:45PM,CSCI-UA.0490,1.0
351,(23952),Special Topics in Programming Languages,Benjamin Goldberg,MW 3:30-4:45PM,,


Unnamed: 0,Number-Section,Name,Instructor,Time,Number,Section
46,CSCI-GA.2572-002,Deep Learning Lab,TBA,W 3:45-4:35PM,CSCI-GA.2572,2.0
260,CSCI-UA.0201-002,Computer Systems Organization - Recitation,Anway Agte,F 9:30-10:45AM,CSCI-UA.0201,2.0
293,(16890),Basic Algorithms,Rotem Oshman,MW 3:30-4:45PM,,
59,(5174),DevOps and Agile Methodologies,John Rofrano,W 4:55-6:55PM,,
201,(16845),Intro To Computer Science,Tobias Blickhan,MW 9:30-10:45AM,,


## 2. Read the course catalog into a DataFrame

In [859]:
with open("course_catalog.html", "r", encoding="utf-8") as f:
    body = f.read()
dom= BeautifulSoup(body, "lxml")
courses=dom.select(".courses-listing")[0].select("li")

catalogue=[]
for i in range(len(courses)):
    number=re.search(r".+.\d{4}",courses[i].select("p")[0].getText().strip())[0]
    points=re.search(r"(.+) Points",courses[i].select("p")[1].getText())[1].strip()
    prereqs=courses[i].select("p")[2].getText()[15:]
    catalogue.append([number,prereqs,points])

In [861]:
course_catalogue_df=pd.DataFrame(catalogue,columns=["Number","Prereqs","Points"])

display(course_catalogue_df.info())
display(course_catalogue_df.head(5))
display(course_catalogue_df.tail(5))
display(course_catalogue_df.sample(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Number   96 non-null     object
 1   Prereqs  96 non-null     object
 2   Points   96 non-null     object
dtypes: object(3)
memory usage: 2.4+ KB


None

Unnamed: 0,Number,Prereqs,Points
0,CSCI-GA.1170,At least one year of experience with a high-le...,3
1,CSCI-GA.1180,,3
2,CSCI-GA.2110,Students taking this class should already have...,3
3,CSCI-GA.2112,Multivariate calculus and linear algebra. Some...,3
4,CSCI-GA.2130,"CSCI-GA 1170, CSCI-GA 2110, and CSCI-GA 2250.",3


Unnamed: 0,Number,Prereqs,Points
91,CSCI-UA.0897,Restricted to declared computer science majors...,1 - 4
92,CSCI-UA.0898,Restricted to declared computer science majors...,1 - 4
93,CSCI-UA.0997,Permission of the department. Does not satisfy...,1 - 4
94,CSCI-UA.0998,Permission of the department. Does not satisfy...,1 - 4
95,FRSEM-UA.0597,"Some programming experience in Python, Java, J...",4


Unnamed: 0,Number,Prereqs,Points
7,CSCI-GA.2250,,3
37,CSCI-GA.2830,,3
86,CSCI-UA.0479,Data Structures (CSCI-UA 102). Students that c...,4
14,CSCI-GA.2420,Corequisite: linear algebra.,3
78,CSCI-UA.0469,Computer Systems Organization (CSCI-UA 201).,4


## 3. Put together both DataFrames


In [863]:
# merge catalogue info onto scheduled courses
merged_df = pd.merge(course_schedule_df, course_catalogue_df, on="Number", how="left")
final_df = merged_df[["Number", "Name", "Instructor", "Time", "Prereqs", "Points"]]
final_df

Unnamed: 0,Number,Name,Instructor,Time,Prereqs,Points
0,CSCI-GA.1170,Fundamental Algorithms,Chee Yap,W 4:55-6:55PM,At least one year of experience with a high-le...,3
1,DS-GA.1170,Fundamental Algorithms,Chee Yap,W 4:55-6:55PM,,
2,CSCI-GA.1170,Fundamental Algorithms Recitation,Bingwei Zhang,R 5:55-6:45PM,At least one year of experience with a high-le...,3
3,DS-GA.1170,Fundamental Algorithms Recitation,Bingwei Zhang,R 5:55-6:45PM,,
4,CSCI-GA.1170,Fundamental Algorithms Recitation,Bingwei Zhang,F 4:55-5:45PM,At least one year of experience with a high-le...,3
...,...,...,...,...,...,...
347,,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 11:00-12:15PM,,
348,CSCI-UA.0480,Special Topics: Introduction to Deep Learning,Alfredo Canziani,TR 12:30-1:45PM,Topics determine prerequisites.,4
349,,Special Topics: Introduction to Deep Learning,Alfredo Canziani,TR 12:30-1:45PM,,
350,CSCI-UA.0490,Special Topics in Programming Languages,Benjamin Goldberg,MW 3:30-4:45PM,Topics determine prerequisites.,4


## 4. Conclusion




#### **Issues**
1. Invisible Zero-Width Spaces
   - Some course numbers (e.g., CSCI-GA.1170-​001) contained hidden spaces (\u200b)-> removed these spaces (i.e. used .replace('\u200b', ''))

2. Courses with Multiple Numbers 
   - Courses like CSCI-GA.1170-001 and DS-GA.1170-001 (5115) shared the same section but had different number sections -> split these into separate rows to preserve all course numbers 


#### **Behavior of how='left' Merge**  
- Keep every row from the course_schedule (left dataframe)  
- Add matching data from the course catalogue (right dataframe) where possible and leave NaN where no match exists  


# Part 2 - Using an API


## 1. Retrieve the data, and examine it.


In [866]:
import requests
import pandas as pd

url= "http://linserv1.cims.nyu.edu:10000/films?_page=1"
# url = "https://ghibliapi.vercel.app/films"

response = requests.get(url)
films = response.json()



- Interested in the director and rt_score keys to get the report
- When I modify the url, it returns a separate set of movies. Going past page 3 reveals nothing - so there are  3 pages of movie info


## 2. Load the data into a DataFrame


In [890]:
df = pd.DataFrame(films)

for i in range(2,4):
    url= f"http://linserv1.cims.nyu.edu:10000/films?_page={i}"
    response = requests.get(url)
    films = response.json()
    temp_df=pd.DataFrame(films)
    df=pd.concat([df, temp_df])

df=df.reset_index()
df

Unnamed: 0,index,id,title,original_title,original_title_romanised,description,director,producer,release_date,running_time,rt_score,people,species,locations,vehicles,url
0,0,d868e6ec-c44a-405b-8fa6-f7f0f8cfb500,The Red Turtle,レッドタートル ある島の物語,Reddotātoru aru shima no monogatari,A man set adrift by a storm wakes up on a beac...,Michaël Dudok de Wit,"Toshio Suzuki, Isao Takahata, Vincent Maraval,...",2016,80,93,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/],[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/d868e6ec...
1,0,dc2e6bd1-8156-4886-adff-b39e6043af0c,Spirited Away,千と千尋の神隠し,Sen to Chihiro no kamikakushi,Spirited Away is an Oscar winning Japanese ani...,Hayao Miyazaki,Toshio Suzuki,2001,124,97,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/dc2e6bd1...
2,1,90b72513-afd4-4570-84de-a56c312fdf81,The Cat Returns,猫の恩返し,Neko no ongaeshi,"Haru, a schoolgirl bored by her ordinary routi...",Hiroyuki Morita,Toshio Suzuki,2002,75,89,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/90b72513...
3,2,cd3d059c-09f4-4ff3-8d63-bc765a5184fa,Howl's Moving Castle,ハウルの動く城,Hauru no ugoku shiro,"When Sophie, a shy young woman, is cursed with...",Hayao Miyazaki,Toshio Suzuki,2004,119,87,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/cd3d059c...
4,3,112c1e67-726f-40b1-ac17-6974127bb9b9,Tales from Earthsea,ゲド戦記,Gedo senki,Something bizarre has come over the land. The ...,Gorō Miyazaki,Toshio Suzuki,2006,116,41,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/112c1e67...
5,4,758bf02e-3122-46e0-884e-67cf83df1786,Ponyo,崖の上のポニョ,Gake no ue no Ponyo,"The son of a sailor, 5-year old Sosuke lives a...",Hayao Miyazaki,Toshio Suzuki,2008,100,92,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/758bf02e...
6,5,2de9426b-914a-4a06-a3a0-5e6d9d3886f6,Arrietty,借りぐらしのアリエッティ,Karigurashi no Arietti,14-year-old Arrietty and the rest of the Clock...,Hiromasa Yonebayashi,Toshio Suzuki,2010,94,95,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/2de9426b...
7,6,45db04e4-304a-4933-9823-33f389e8d74d,From Up on Poppy Hill,コクリコ坂から,Kokuriko zaka kara,The story is set in 1963 in Yokohama. Kokuriko...,Gorō Miyazaki,Toshio Suzuki,2011,91,83,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/45db04e4...
8,7,67405111-37a5-438f-81cc-4666af60c800,The Wind Rises,風立ちぬ,Kaze tachinu,A lifelong love of flight inspires Japanese av...,Hayao Miyazaki,Toshio Suzuki,2013,126,89,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/67405111...
9,8,578ae244-7750-4d9f-867b-f3cd3d6fecf4,The Tale of the Princess Kaguya,かぐや姫の物語,Kaguya-Hime no Monogatari,A bamboo cutter named Sanuki no Miyatsuko disc...,Isao Takahata,Yoshiaki Nishimura,2013,137,100,[https://ghibliapi.herokuapp.com/people/],[https://ghibliapi.herokuapp.com/species/af391...,[https://ghibliapi.herokuapp.com/locations/],[https://ghibliapi.herokuapp.com/vehicles/],https://ghibliapi.herokuapp.com/films/578ae244...


## 3. Report


In [888]:
df["rt_score"] = pd.to_numeric(df["rt_score"], errors="coerce")
avg_rt_score=df[["director","rt_score"]].groupby("director")["rt_score"].mean()

director_counts=df["director"].value_counts()

report=pd.merge(avg_rt_score,director_counts,on="director")
report=report.sort_values(by="rt_score",ascending=False)
report

Unnamed: 0_level_0,rt_score,count
director,Unnamed: 1_level_1,Unnamed: 2_level_1
Isao Takahata,100.0,1
Hiromasa Yonebayashi,93.5,2
Michaël Dudok de Wit,93.0,2
Hayao Miyazaki,91.25,4
Hiroyuki Morita,89.0,1
Gorō Miyazaki,62.0,2
