In [26]:
import pandas as pd
import numpy as np

url = "https://raw.githubusercontent.com/7HE-LUCKY-FISH/major_map/main/data/cvs_data/Spring-2025.csv"
df = pd.read_csv(url)          # or read_json/read_table as needed
df.head()


Unnamed: 0,Section,Number,Mode,Title,Satifies,Unit,Type,Days,Times,Instructor,Location,Dates,Seats,Year,Semester
0,BIOL 10 (Section 01),22045,In Person,The Living World,GE: 5B,3,LEC,TR,01:30PM-02:45PM,Allison Harness,DH135,01/23/25-05/12/25,35,2025,Spring
1,BIOL 10 (Section 02),26892,In Person,The Living World,GE: 5B,3,LEC,TR,03:00PM-04:15PM,Allison Harness,DH135,01/23/25-05/12/25,49,2025,Spring
2,BIOL 10 (Section 81),23567,Fully Online,The Living World,GE: 5B,3,LEC,TBA,TBA,Allison Harness,ONLINE,01/23/25-05/12/25,6,2025,Spring
3,BIOL 10 (Section 99),21337,Fully Online,The Living World,GE: 5B,3,LEC,TBA,TBA,Mary Poffenroth,ONLINE,01/23/25-05/12/25,5,2025,Spring
4,CHEM 1A (Section 01),24936,In Person,General Chemistry,GE: 5A+5C,5,LEC,MWF,09:30AM-10:20AM,Melody Esfandiari,SCI142,01/23/25-05/12/25,0,2025,Spring


In [27]:
import io, re, zipfile, requests, glob
from pathlib import Path
import pandas as pd

owner = "7HE-LUCKY-FISH"
repo = "major_map"
branch = "main"

# 1) Download repo zip (public).
zip_url = f"https://codeload.github.com/{owner}/{repo}/zip/refs/heads/{branch}"
z = zipfile.ZipFile(io.BytesIO(requests.get(zip_url).content))
extract_dir = "/content"
z.extractall(extract_dir)

# 2) Read all CSVs under data/cvs_data
base = Path(extract_dir) / f"{repo}-{branch}" / "data" / "cvs_data"
files = sorted(glob.glob(str(base / "*.csv")))

dfs = []
for fp in files:
    name = Path(fp).name  # e.g., "Spring-2025.csv"
    # tolerant CSV parser; tweak sep/quotechar if needed
    df = pd.read_csv(fp, engine="python")
    #append the dataframe to the list
    dfs.append(df)

#combine all the dataframes into one
all_df = pd.concat(dfs, ignore_index=True)

all_df.head(), all_df.shape, files[:3]

(                Section  Number          Mode              Title   Satifies  \
 0  BIOL 10 (Section 01)   40529     In Person   The Living World     GE: B2   
 1  BIOL 10 (Section 03)   40060     In Person   The Living World     GE: B2   
 2  BIOL 10 (Section 04)   47603  Fully Online   The Living World     GE: B2   
 3  BIOL 10 (Section 99)   41828  Fully Online   The Living World     GE: B2   
 4  CHEM 1A (Section 01)   40081     In Person  General Chemistry  GE: B1+B3   
 
    Unit Type Days            Times       Instructor Location  \
 0   3.0  LEC   TR  09:00AM-10:15AM  Allison Harness   SCI164   
 1   3.0  LEC   MW  10:30AM-11:45AM  Phillip Hawkins   SCI164   
 2   3.0  LEC  TBA              TBA  Phillip Hawkins   ONLINE   
 3   3.0  LEC  TBA              TBA  Mary Poffenroth   ONLINE   
 4   5.0  LEC  MWF  09:30AM-10:20AM       Resa Kelly   SCI142   
 
                Dates  Seats  Year Semester  
 0  08/19/22-12/06/22     59  2022     Fall  
 1  08/19/22-12/06/22     42  2022

In [28]:
#Extract the Course name from Section
print(list(all_df.columns))

all_df['Course'] = all_df['Section'].str.extract(r"^([A-Z]+\s*\d+)",expand=False)

#Only inlcude useful features
cols = ['Course', 'Mode', 'Type', 'Days', 'Times', 'Instructor', 'Year', 'Semester']
all_df = all_df[cols]
all_df['Scheduled'] = 1

all_df.info()
all_df.head()



['Section', 'Number', 'Mode', 'Title', 'Satifies', 'Unit', 'Type', 'Days', 'Times', 'Instructor', 'Location', 'Dates', 'Seats', 'Year', 'Semester']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4007 entries, 0 to 4006
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Course      4007 non-null   object
 1   Mode        4007 non-null   object
 2   Type        4007 non-null   object
 3   Days        4007 non-null   object
 4   Times       4007 non-null   object
 5   Instructor  4007 non-null   object
 6   Year        4007 non-null   int64 
 7   Semester    4007 non-null   object
 8   Scheduled   4007 non-null   int64 
dtypes: int64(2), object(7)
memory usage: 281.9+ KB


Unnamed: 0,Course,Mode,Type,Days,Times,Instructor,Year,Semester,Scheduled
0,BIOL 10,In Person,LEC,TR,09:00AM-10:15AM,Allison Harness,2022,Fall,1
1,BIOL 10,In Person,LEC,MW,10:30AM-11:45AM,Phillip Hawkins,2022,Fall,1
2,BIOL 10,Fully Online,LEC,TBA,TBA,Phillip Hawkins,2022,Fall,1
3,BIOL 10,Fully Online,LEC,TBA,TBA,Mary Poffenroth,2022,Fall,1
4,CHEM 1,In Person,LEC,MWF,09:30AM-10:20AM,Resa Kelly,2022,Fall,1


In [32]:
df = all_df[['Course','Instructor','Mode','Days','Times','Year','Semester']].dropna(subset=['Course'])

# Canonical time-slot id
def norm_slot(d,t):
    #\s+ is a regex meaning all whitespace (newlines, space, tabs)
    return f"{re.sub('\\s+','',str(d))}|{str(t).strip()}"
df['Slot'] = [norm_slot(d,t) for d,t in zip(df['Days'], df['Times'])]

# Choose your split first (example train â‰¤ 2023)
#train = df.copy()

# Positives (unique sections at the granularity you care about)
key_cols = ['Course','Instructor','Slot','Mode','Year','Semester']
pos = df[key_cols].drop_duplicates().assign(Scheduled=1)

# Contexts to rank within
ctx = pos[['Course','Year','Semester']].drop_duplicates()

# Feasible sets
active_inst = pos[['Instructor','Year','Semester']].drop_duplicates()                # instructors active that term
slots_by_course = pos[['Course','Slot']].drop_duplicates()                           # slots course has used
modes_by_course = pos[['Course','Mode']].drop_duplicates()                           # modes course has used

# Cross-join contexts with feasible sets
cand = (ctx.merge(active_inst, on=['Year','Semester'])
          .merge(slots_by_course, on='Course')
          .merge(modes_by_course, on='Course'))

# Label: left-anti join -> negatives
cand = cand.merge(pos[key_cols], on=key_cols, how='left', indicator=True)
cand['Scheduled'] = (cand['_merge'] == 'both').astype(int)
cand = cand.drop(columns=['_merge'])

# Optional: downsample negatives to K per context to keep balance
K = 5
cand = (cand
        .assign(r=np.random.RandomState(42).rand(len(cand)))
        .sort_values('r')
        .drop(columns='r')
        .groupby(['Course','Year','Semester'], group_keys=False)
        .apply(lambda g: pd.concat([g[g['Scheduled']==1],
                                    g[g['Scheduled']==0].head(K*max(1, g['Scheduled'].sum()))])))



  .apply(lambda g: pd.concat([g[g['Scheduled']==1],


In [40]:
out_csv   = "/content/candidates_train.csv"
cand.to_csv(out_csv, index=False)

In [35]:
from sklearn.linear_model import LogisticRegression

# === 1) Split by time (example cutoffs) ===
train = cand[cand["Year"] <= 2023].copy()
val   = cand[cand["Year"] == 2024].copy()
test  = cand[cand["Year"] == 2025].copy()  # adjust to your data

cat_cols = ["Course", "Instructor", "Slot", "Mode", "Semester"]
num_cols = ["Year"]

# 2) Clean missing values
for df_split in (train, val, test):
    for c in cat_cols:
        df_split[c] = df_split[c].fillna("Missing")
    # Ensure 'Year' is numeric; errors='coerce' would introduce NaNs if not numeric, but it's already int type.
    # No need to convert if already numeric and no missing values are expected.
    df_split["Year"] = pd.to_numeric(df_split["Year"], errors="coerce")

y_train = train["Scheduled"].astype(int).values
y_val   = val["Scheduled"].astype(int).values
y_test  = test["Scheduled"].astype(int).values # Add y_test

# === 2) One-hot encode (train-first, then align others) ===
X_train_cat = pd.get_dummies(train[cat_cols], dummy_na=False)

# Add numeric columns as-is. Ensure indices align for concatenation.
X_train = pd.concat([X_train_cat, train[num_cols]], axis=1)

X_val_cat   = pd.get_dummies(val[cat_cols],   dummy_na=False)
# Make validation columns match training columns (unseen cats -> 0s)
X_val_cat = X_val_cat.reindex(columns=X_train_cat.columns, fill_value=0)
X_val   = pd.concat([X_val_cat,   val[num_cols]],   axis=1)

X_test_cat = pd.get_dummies(test[cat_cols], dummy_na=False) # Prepare X_test
X_test_cat = X_test_cat.reindex(columns=X_train_cat.columns, fill_value=0) # Align X_test columns
X_test = pd.concat([X_test_cat, test[num_cols]], axis=1) # Concatenate X_test

# (Optional) keep the column list so you can transform test the same way later
ohe_columns = X_train.columns.tolist()

# === 3) Train Logistic Regression ===
clf = LogisticRegression(max_iter=1000, class_weight="balanced")
clf.fit(X_train, y_train)

# Quick check
val_proba = clf.predict_proba(X_val)[:, 1]
print("Validation proba sample:", val_proba[:5])

Validation proba sample: [0.51180756 0.61530585 0.3606864  0.56120888 0.29968028]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
print(X_train.isna().sum().sum() )        # total NaNs
print(X_train.isna().sum().sort_values())
print(X_train.shape)

print(clf.score(X_test,y_test))
print(clf.score(X_val,y_val))

0
Slot_W|10:00AM-12:50PM    0
Slot_W|10:00AM-12:45PM    0
Slot_W|09:00AM-11:50AM    0
Slot_W|09:00AM-11:45AM    0
Slot_W|08:30AM-11:20AM    0
                         ..
Mode_In Person            0
Semester_Fall             0
Semester_Spring           0
Year                      0
Course_BIOL 10            0
Length: 830, dtype: int64
(12180, 830)
0.6757944557133199
0.6899008674101611


Do we need to switch to regression?

Classification
- Random set --> Model,
- Model has 80% accuracy, then we could have unscheduled classes, which we don't want


Regression
- Random set --> model
- We get a score for each possible class. Only schedule those with high confidence, but we can still get unscheduled classes
- However, we have a confidence score though.