## Setup & Imports

In [1]:
import pandas as pd
import glob
import os
import re

# For feature engineering
from sklearn.preprocessing import LabelEncoder

## Load the Data

In [2]:
# Path to csv data folder
path = '..\data\cvs_data'

# Get all CSV files
all_files = glob.glob(os.path.join(path, "*.csv"))

# Combine all files into one DataFrame
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Info on the DataFrame
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4007 entries, 0 to 4006
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Section     4007 non-null   object 
 1   Number      4007 non-null   int64  
 2   Mode        4007 non-null   object 
 3   Title       4007 non-null   object 
 4   Satifies    1615 non-null   object 
 5   Unit        4007 non-null   float64
 6   Type        4007 non-null   object 
 7   Days        4007 non-null   object 
 8   Times       4007 non-null   object 
 9   Instructor  4007 non-null   object 
 10  Location    3945 non-null   object 
 11  Dates       4007 non-null   object 
 12  Seats       4007 non-null   int64  
 13  Year        4007 non-null   int64  
 14  Semester    4007 non-null   object 
dtypes: float64(1), int64(3), object(11)
memory usage: 469.7+ KB


Unnamed: 0,Section,Number,Mode,Title,Satifies,Unit,Type,Days,Times,Instructor,Location,Dates,Seats,Year,Semester
0,BIOL 10 (Section 01),40529,In Person,The Living World,GE: B2,3.0,LEC,TR,09:00AM-10:15AM,Allison Harness,SCI164,08/19/22-12/06/22,59,2022,Fall
1,BIOL 10 (Section 03),40060,In Person,The Living World,GE: B2,3.0,LEC,MW,10:30AM-11:45AM,Phillip Hawkins,SCI164,08/19/22-12/06/22,42,2022,Fall
2,BIOL 10 (Section 04),47603,Fully Online,The Living World,GE: B2,3.0,LEC,TBA,TBA,Phillip Hawkins,ONLINE,08/19/22-12/06/22,6,2022,Fall
3,BIOL 10 (Section 99),41828,Fully Online,The Living World,GE: B2,3.0,LEC,TBA,TBA,Mary Poffenroth,ONLINE,08/19/22-12/06/22,1,2022,Fall
4,CHEM 1A (Section 01),40081,In Person,General Chemistry,GE: B1+B3,5.0,LEC,MWF,09:30AM-10:20AM,Resa Kelly,SCI142,08/19/22-12/06/22,0,2022,Fall


# Feature Selection Notes

### Dropped Features
Course Number, Title, Satisfies, Unit, Location, Dates, Seats  
â†’ Not relevant for modeling and can be removed.

### Kept and Important Features
- **Section**: Used to extract the course name  
- **Mode**: Shows if the course is In Person, Fully Online, or Hybrid  
- **Type**: Shows if the course is a Lecture, Seminar, or Lab  
- **Days**: Needed for student scheduling  
- **Times**: Needed for student scheduling  
- **Year**: Used to give more weight to recent years  
- **Semester**: Important because some courses are only offered in certain semesters


In [3]:
# Extract the Course name from Section
df['Course'] = df['Section'].str.extract(r"^([A-Z]+\s*\d+)")

# Only inlcude useful features
cols = ['Course', 'Mode', 'Type', 'Times', 'Year', 'Semester', 'Instructor']
df = df[cols]
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4007 entries, 0 to 4006
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Course      4007 non-null   object
 1   Mode        4007 non-null   object
 2   Type        4007 non-null   object
 3   Times       4007 non-null   object
 4   Year        4007 non-null   int64 
 5   Semester    4007 non-null   object
 6   Instructor  4007 non-null   object
dtypes: int64(1), object(6)
memory usage: 219.3+ KB


Unnamed: 0,Course,Mode,Type,Times,Year,Semester,Instructor
0,BIOL 10,In Person,LEC,09:00AM-10:15AM,2022,Fall,Allison Harness
1,BIOL 10,In Person,LEC,10:30AM-11:45AM,2022,Fall,Phillip Hawkins
2,BIOL 10,Fully Online,LEC,TBA,2022,Fall,Phillip Hawkins
3,BIOL 10,Fully Online,LEC,TBA,2022,Fall,Mary Poffenroth
4,CHEM 1,In Person,LEC,09:30AM-10:20AM,2022,Fall,Resa Kelly


### Feature Engineering: Mode

In [4]:
print(df['Mode'].unique())
print()
print(df['Mode'].value_counts())

# Manually label encoding for mode
# Tree-based models don't care about numeric order, so we don't need one-hot encoding
mode_map = {
    'In Person': 0,
    'Hybrid': 1,
    'Fully Online': 2
}
df['Mode'] = df['Mode'].map(mode_map)
df.head()

['In Person' 'Fully Online' 'Hybrid']

Mode
In Person       3544
Fully Online     366
Hybrid            97
Name: count, dtype: int64


Unnamed: 0,Course,Mode,Type,Times,Year,Semester,Instructor
0,BIOL 10,0,LEC,09:00AM-10:15AM,2022,Fall,Allison Harness
1,BIOL 10,0,LEC,10:30AM-11:45AM,2022,Fall,Phillip Hawkins
2,BIOL 10,2,LEC,TBA,2022,Fall,Phillip Hawkins
3,BIOL 10,2,LEC,TBA,2022,Fall,Mary Poffenroth
4,CHEM 1,0,LEC,09:30AM-10:20AM,2022,Fall,Resa Kelly


### Feature Engineering: Type

In [5]:
print(df['Type'].unique())
print()
print(df['Type'].value_counts())

type_map = {
    'LAB': 0,
    'LEC': 1,
    'SEM': 2
}
df['Type'] = df['Type'].map(type_map)
df.head()

['LEC' 'SEM' 'LAB']

Type
LAB    1650
LEC    1474
SEM     883
Name: count, dtype: int64


Unnamed: 0,Course,Mode,Type,Times,Year,Semester,Instructor
0,BIOL 10,0,1,09:00AM-10:15AM,2022,Fall,Allison Harness
1,BIOL 10,0,1,10:30AM-11:45AM,2022,Fall,Phillip Hawkins
2,BIOL 10,2,1,TBA,2022,Fall,Phillip Hawkins
3,BIOL 10,2,1,TBA,2022,Fall,Mary Poffenroth
4,CHEM 1,0,1,09:30AM-10:20AM,2022,Fall,Resa Kelly


### Feature Engineering: Year & Semester

In [6]:
print(df['Year'].unique())
print()
print(df['Year'].value_counts())

print(df['Semester'].unique())
print()
print(df['Semester'].value_counts())


[2022 2023 2024 2025]

Year
2024    1191
2022    1127
2023    1115
2025     574
Name: count, dtype: int64
['Fall' 'Spring']

Semester
Spring    2160
Fall      1847
Name: count, dtype: int64


# Decision Tree