In [2]:
# Cell 1
import pandas as pd
import csv
import re
from typing import List, Dict, Any

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

csv_path = r"C:\Users\ianma\OneDrive\Escritorio\Project_Mining\Old_Dataset.csv"
csv_path


'C:\\Users\\ianma\\OneDrive\\Escritorio\\Project_Mining\\Old_Dataset.csv'

In [4]:
# Cell 2
encodings_to_try = ["utf-8-sig", "utf-8", "cp1252", "latin-1"]
detected_encoding = None
raw_sample = None

for enc in encodings_to_try:
    try:
        with open(csv_path, "r", encoding=enc, errors="strict") as f:
            raw_sample = f.read(4096)
        detected_encoding = enc
        break
    except Exception:
        continue

if detected_encoding is None:
    # Fall back that should not crash
    detected_encoding = "latin-1"
    with open(csv_path, "r", encoding=detected_encoding, errors="replace") as f:
        raw_sample = f.read(4096)

print("Detected (or fallback) encoding:", detected_encoding)

# Delimiter sniff
dialect = csv.Sniffer().sniff(raw_sample, delimiters=[",",";","\t","|"])
detected_delimiter = dialect.delimiter
print("Detected delimiter:", repr(detected_delimiter))

# Show first 5 raw lines for sanity-check
with open(csv_path, "r", encoding=detected_encoding, errors="replace") as f:
    for i in range(5):
        print(f.readline().rstrip("\n"))


Detected (or fallback) encoding: cp1252
Detected delimiter: ';'
STUDIENR;UDDANNELSE;KURSKODE;KURSTXT;BEDOMMELSE;SKALA;ECTS;UDPROVNING;CENSUR;BEDOMMELSESDATO
STNR000001;Materiale- og procesteknologi, cand.polyt.;2402;02402 Introduktion til statistik;4;7-trinsskala;5;Skriftlig;ekstern censur;19-12-2018
STNR000001;Materiale- og procesteknologi, cand.polyt.;28213;28213 Polymerteknologi;7;7-trinsskala;5;Afløsningsopgave;ekstern censur;30-12-2019
STNR000001;Materiale- og procesteknologi, cand.polyt.;41632;41632 Robust design af produkter og mekanismer;7;7-trinsskala;5;Afløsningsopgave;intern censur;07-06-2019
STNR000001;Materiale- og procesteknologi, cand.polyt.;41656;41656 Materialer i avancerede anvendelser og produkter;10;7-trinsskala;10;Skriftlig;ekstern censur;27-05-2020


In [7]:
# Cell 3
df = pd.read_csv(
    r"C:\Users\ianma\OneDrive\Escritorio\Project_Mining\Old_Dataset.csv",
    sep=";",
    encoding="cp1252",
    dtype=str,
    engine="python",
    quotechar='"',
    on_bad_lines="skip",   # change to "warn" if you want to see which lines get skipped
)
print(df.shape)
df.head()


(327248, 10)


Unnamed: 0,STUDIENR,UDDANNELSE,KURSKODE,KURSTXT,BEDOMMELSE,SKALA,ECTS,UDPROVNING,CENSUR,BEDOMMELSESDATO
0,STNR000001,"Materiale- og procesteknologi, cand.polyt.",2402,02402 Introduktion til statistik,4,7-trinsskala,5,Skriftlig,ekstern censur,19-12-2018
1,STNR000001,"Materiale- og procesteknologi, cand.polyt.",28213,28213 Polymerteknologi,7,7-trinsskala,5,Afløsningsopgave,ekstern censur,30-12-2019
2,STNR000001,"Materiale- og procesteknologi, cand.polyt.",41632,41632 Robust design af produkter og mekanismer,7,7-trinsskala,5,Afløsningsopgave,intern censur,07-06-2019
3,STNR000001,"Materiale- og procesteknologi, cand.polyt.",41656,41656 Materialer i avancerede anvendelser og p...,10,7-trinsskala,10,Skriftlig,ekstern censur,27-05-2020
4,STNR000001,"Materiale- og procesteknologi, cand.polyt.",41661,41661 Metallære,10,7-trinsskala,5,Afløsningsopgave,intern censur,18-12-2018


In [8]:
# Cell 4
df.columns = [c.strip() for c in df.columns]

expected_cols = [
    'STUDIENR', 'UDDANNELSE', 'KURSKODE', 'KURSTXT', 'BEDOMMELSE',
    'SKALA', 'ECTS', 'UDPROVNING', 'CENSUR', 'BEDOMMELSESDATO'
]
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Strip whitespace
for c in expected_cols:
    if df[c].dtype == object:
        df[c] = df[c].astype(str).str.strip()

# Handle ECTS with possible comma decimals (e.g., "7,5")
df['ECTS'] = pd.to_numeric(df['ECTS'].str.replace(",", ".", regex=False), errors="coerce")

# Parse Danish-style dates (day-first)
df['BEDOMMELSESDATO'] = pd.to_datetime(
    df['BEDOMMELSESDATO'], errors="coerce", dayfirst=True, infer_datetime_format=True
)

df.head()


  df['BEDOMMELSESDATO'] = pd.to_datetime(


Unnamed: 0,STUDIENR,UDDANNELSE,KURSKODE,KURSTXT,BEDOMMELSE,SKALA,ECTS,UDPROVNING,CENSUR,BEDOMMELSESDATO
0,STNR000001,"Materiale- og procesteknologi, cand.polyt.",2402,02402 Introduktion til statistik,4,7-trinsskala,5.0,Skriftlig,ekstern censur,2018-12-19
1,STNR000001,"Materiale- og procesteknologi, cand.polyt.",28213,28213 Polymerteknologi,7,7-trinsskala,5.0,Afløsningsopgave,ekstern censur,2019-12-30
2,STNR000001,"Materiale- og procesteknologi, cand.polyt.",41632,41632 Robust design af produkter og mekanismer,7,7-trinsskala,5.0,Afløsningsopgave,intern censur,2019-06-07
3,STNR000001,"Materiale- og procesteknologi, cand.polyt.",41656,41656 Materialer i avancerede anvendelser og p...,10,7-trinsskala,10.0,Skriftlig,ekstern censur,2020-05-27
4,STNR000001,"Materiale- og procesteknologi, cand.polyt.",41661,41661 Metallære,10,7-trinsskala,5.0,Afløsningsopgave,intern censur,2018-12-18


In [13]:
# Cell 5 (English version)
def semester_from_date(ts: pd.Timestamp) -> str:
    if pd.isna(ts):
        return "Unknown"
    y, m = ts.year, ts.month
    if m in (2,3,4,5,6):
        return f"Spring {y}"
    if m in (8,9,10,11,12):
        return f"Autumn {y}"
    if m == 1:
        return f"Autumn {y-1}"   # January -> previous Autumn
    if m == 7:
        return f"Summer {y}"     # optional summer term
    return "Unknown"

df['Semester'] = df['BEDOMMELSESDATO'].apply(semester_from_date)
df[['BEDOMMELSESDATO','Semester']].head(10)


Unnamed: 0,BEDOMMELSESDATO,Semester
0,2018-12-19,Autumn 2018
1,2019-12-30,Autumn 2019
2,2019-06-07,Spring 2019
3,2020-05-27,Spring 2020
4,2018-12-18,Autumn 2018
5,2019-01-25,Autumn 2018
6,2019-06-24,Spring 2019
7,2019-07-08,Summer 2019
8,2019-01-28,Autumn 2018
9,2020-01-28,Autumn 2019


In [18]:
# Cell 6 (English version)
import re

# Define season order for sorting
SEASON_ORDER = {"Spring": 1, "Summer": 2, "Autumn": 3, "Unknown": 99}

def semester_sort_key(sem_text: str):
    """
    Turn a semester label like 'Spring 2020' into (year, order)
    so that sorting by this key gives chronological order.
    """
    if not isinstance(sem_text, str) or sem_text.strip() == "":
        return (9999, 99)

    sem_text = sem_text.strip()
    # Matches 'Spring 2020', 'Summer 2019', 'Autumn 2021'
    m = re.search(r'(Spring|Summer|Autumn)\s+([12]\d{3})', sem_text, flags=re.IGNORECASE)
    if m:
        season = m.group(1).capitalize()
        year = int(m.group(2))
        return (year, SEASON_ORDER.get(season, 50))

    # Fallback: just a year
    m = re.search(r'([12]\d{3})', sem_text)
    if m:
        return (int(m.group(1)), 50)

    return (9999, 99)


def row_sort_key(row):
    """
    Prefer sorting by BEDOMMELSESDATO (exact date);
    if missing, fall back to the parsed semester key.
    """
    dt = row['BEDOMMELSESDATO']
    if pd.notna(dt):
        return (dt.year, dt.month, 0)
    y, o = semester_sort_key(row['Semester'])
    return (y, o, 1)


# Apply sorting and drop helper column afterwards
df_sorted = df.assign(_key=df.apply(row_sort_key, axis=1)) \
              .sort_values('_key') \
              .drop(columns=['_key'])

df_sorted.head(10)



Unnamed: 0,STUDIENR,UDDANNELSE,KURSKODE,KURSTXT,BEDOMMELSE,SKALA,ECTS,UDPROVNING,CENSUR,BEDOMMELSESDATO,Semester
125706,STNR009750,"Elektroteknologi, ing.prof.bach.",PRA31-30,Ingeniørpraktik,BE,bestået/ikke bestået,30.0,Rapport,intern censur,2017-07-19,Summer 2017
227068,STNR017589,"Miljøvidenskab, ing.bach.",12003,12003 Miljøteknisk feltkursus,12,7-trinsskala,5.0,Skriftlig,ekstern censur,2017-07-06,Summer 2017
300199,STNR022380,"Eksport og teknologi, ing.prof.bach.",62163,62163 Grundlæggende mekanik,EM,7-trinsskala,5.0,Skriftlig,intern censur,2017-07-20,Summer 2017
300200,STNR022381,"Eksport og teknologi, ing.prof.bach.",62163,62163 Grundlæggende mekanik,EM,7-trinsskala,5.0,Skriftlig,intern censur,2017-07-20,Summer 2017
300201,STNR022382,"Eksport og teknologi, ing.prof.bach.",62163,62163 Grundlæggende mekanik,0,7-trinsskala,5.0,Skriftlig,intern censur,2017-07-20,Summer 2017
264071,STNR019922,"Miljøvidenskab, ing.bach.",42340,42340 Bæredygtighed i ingeniørløsninger,10,7-trinsskala,5.0,Skriftlig,intern censur,2017-07-18,Summer 2017
88234,STNR005801,"Sundhedsteknologi, ing.prof.bach.",62521,62521 Design-build projekt,7,7-trinsskala,10.0,Skriftlig,ekstern censur,2017-07-03,Summer 2017
35884,STNR002364,"Softwareteknologi, ing.prof.bach.",S62-02-1,Specialkursus ved Institut for Ingeniørteknolo...,BE,7-trinsskala,2.5,Projekt,intern censur,2017-07-02,Summer 2017
118917,STNR009194,"Konstruktion og mekanik, cand.polyt.",41315,41315 Anvendt CFD,BE,bestået/ikke bestået,5.0,Skriftlig,intern censur,2017-07-27,Summer 2017
88274,STNR005803,"Sundhedsteknologi, ing.prof.bach.",62521,62521 Design-build projekt,12,7-trinsskala,10.0,Skriftlig,ekstern censur,2017-07-03,Summer 2017


In [19]:
# Cell 7
cols_for_course = [
    'KURSKODE','KURSTXT','BEDOMMELSE','SKALA','ECTS',
    'UDPROVNING','CENSUR','BEDOMMELSESDATO','UDDANNELSE'
]

def build_student_semesters(sub_df: pd.DataFrame):
    # assumes df_sorted is already chronologically sorted
    blocks = []
    ordered_labels = pd.unique(sub_df['Semester'])  # preserves order from df_sorted
    for sem in ordered_labels:
        chunk = sub_df[sub_df['Semester'] == sem]
        courses = chunk[cols_for_course].to_dict(orient='records')
        blocks.append({"Semester": sem, "Courses": courses})
    return blocks

# sort chronologically first (reuse your df_sorted from Cell 6)
students_semester_courses = []
for studienr, g in df_sorted.groupby('STUDIENR', sort=False):
    semesters = build_student_semesters(g)
    students_semester_courses.append([studienr, *semesters])

# Quick peek at the first student
students_semester_courses[0] if students_semester_courses else "No data"


['STNR009750',
 {'Semester': 'Summer 2017',
  'Courses': [{'KURSKODE': 'PRA31-30',
    'KURSTXT': 'Ingeniørpraktik',
    'BEDOMMELSE': 'BE',
    'SKALA': 'bestået/ikke bestået',
    'ECTS': 30.0,
    'UDPROVNING': 'Rapport',
    'CENSUR': 'intern censur',
    'BEDOMMELSESDATO': Timestamp('2017-07-19 00:00:00'),
    'UDDANNELSE': 'Elektroteknologi, ing.prof.bach.'}]},
 {'Semester': 'Autumn 2017',
  'Courses': [{'KURSKODE': '62738',
    'KURSTXT': '62738 Digital signalbehandling',
    'BEDOMMELSE': '2',
    'SKALA': '7-trinsskala',
    'ECTS': 10.0,
    'UDPROVNING': 'Afløsningsopgave',
    'CENSUR': 'ekstern censur',
    'BEDOMMELSESDATO': Timestamp('2017-08-30 00:00:00'),
    'UDDANNELSE': 'Elektroteknologi, ing.prof.bach.'},
   {'KURSKODE': 'DE31-20',
    'KURSTXT': 'Institut for Elektroteknologi',
    'BEDOMMELSE': '7',
    'SKALA': '7-trinsskala',
    'ECTS': 20.0,
    'UDPROVNING': 'Projekt',
    'CENSUR': 'ekstern censur',
    'BEDOMMELSESDATO': Timestamp('2017-12-21 00:00:00'),
 

In [21]:
# Cell 8a: tidy view = one row per student-course (already sorted)
tidy = df_sorted.copy()
tidy.head()


Unnamed: 0,STUDIENR,UDDANNELSE,KURSKODE,KURSTXT,BEDOMMELSE,SKALA,ECTS,UDPROVNING,CENSUR,BEDOMMELSESDATO,Semester
125706,STNR009750,"Elektroteknologi, ing.prof.bach.",PRA31-30,Ingeniørpraktik,BE,bestået/ikke bestået,30.0,Rapport,intern censur,2017-07-19,Summer 2017
227068,STNR017589,"Miljøvidenskab, ing.bach.",12003,12003 Miljøteknisk feltkursus,12,7-trinsskala,5.0,Skriftlig,ekstern censur,2017-07-06,Summer 2017
300199,STNR022380,"Eksport og teknologi, ing.prof.bach.",62163,62163 Grundlæggende mekanik,EM,7-trinsskala,5.0,Skriftlig,intern censur,2017-07-20,Summer 2017
300200,STNR022381,"Eksport og teknologi, ing.prof.bach.",62163,62163 Grundlæggende mekanik,EM,7-trinsskala,5.0,Skriftlig,intern censur,2017-07-20,Summer 2017
300201,STNR022382,"Eksport og teknologi, ing.prof.bach.",62163,62163 Grundlæggende mekanik,0,7-trinsskala,5.0,Skriftlig,intern censur,2017-07-20,Summer 2017


In [22]:
# Cell 8b: how many courses each student took per semester
courses_per_semester = (tidy
    .groupby(['STUDIENR','Semester'], dropna=False)
    .size()
    .rename('n_courses')
    .reset_index()
    .sort_values(['STUDIENR','Semester'])
)
courses_per_semester.head(20)


Unnamed: 0,STUDIENR,Semester,n_courses
0,STNR000001,Autumn 2017,1
1,STNR000001,Autumn 2018,5
2,STNR000001,Autumn 2019,4
3,STNR000001,Autumn 2020,1
4,STNR000001,Spring 2018,2
5,STNR000001,Spring 2019,3
6,STNR000001,Spring 2020,2
7,STNR000001,Spring 2021,1
8,STNR000001,Summer 2017,1
9,STNR000001,Summer 2019,1


In [23]:
# Cell 11
import numpy as np

def normalize_text(x):
    return "" if pd.isna(x) else str(x).strip()

def classify_pass_fail(row):
    scale = normalize_text(row['SKALA']).casefold()
    grade = normalize_text(row['BEDOMMELSE']).strip()

    # Common non-pass tokens in DK datasets
    non_pass_tokens = {"ib", "ikke bestået", "em", "im", "syg", "0", "00", "-3"}

    # 1) Pass/Fail scale (bestået/ikke bestået)
    if "bestået" in scale:  # e.g., "bestået/ikke bestået"
        g = grade.casefold()
        if g.startswith("be") or g == "bestået":
            return True, False, np.nan  # passed, failed, grade_num
        if g in non_pass_tokens:
            return False, True, np.nan
        # Unknown token on this scale -> treat as fail-safe unknown
        return False, False, np.nan

    # 2) 7-trinsskala (numeric)
    if "7-trin" in scale or "7 trin" in scale or "7trin" in scale:
        # Map strings like "02","00","-3" to numbers
        gtxt = grade.replace(",", ".")
        try:
            gnum = float(gtxt)
        except ValueError:
            # Sometimes "BE" sneaks in on 7-trinsskala rows -> treat as pass
            if grade.casefold().startswith("be"):
                return True, False, np.nan
            # Otherwise unknown
            return False, False, np.nan

        # Passing threshold on 7-trinsskala is >= 02
        # NOTE: Datasets often store "02" as "2"
        passed = gnum >= 2
        failed = gnum in (-3, 0) or gnum < 2
        return passed, failed, gnum

    # 3) Anything else -> unknown
    return False, False, np.nan

df_work = df_sorted.copy()
df_work[['passed','failed','grade_num']] = df_work.apply(
    lambda r: pd.Series(classify_pass_fail(r)),
    axis=1
)

df_work[['STUDIENR','KURSKODE','BEDOMMELSE','SKALA','grade_num','passed','failed','Semester']].head(10)


Unnamed: 0,STUDIENR,KURSKODE,BEDOMMELSE,SKALA,grade_num,passed,failed,Semester
125706,STNR009750,PRA31-30,BE,bestået/ikke bestået,,True,False,Summer 2017
227068,STNR017589,12003,12,7-trinsskala,12.0,True,False,Summer 2017
300199,STNR022380,62163,EM,7-trinsskala,,False,False,Summer 2017
300200,STNR022381,62163,EM,7-trinsskala,,False,False,Summer 2017
300201,STNR022382,62163,0,7-trinsskala,0.0,False,True,Summer 2017
264071,STNR019922,42340,10,7-trinsskala,10.0,True,False,Summer 2017
88234,STNR005801,62521,7,7-trinsskala,7.0,True,False,Summer 2017
35884,STNR002364,S62-02-1,BE,7-trinsskala,,True,False,Summer 2017
118917,STNR009194,41315,BE,bestået/ikke bestået,,True,False,Summer 2017
88274,STNR005803,62521,12,7-trinsskala,12.0,True,False,Summer 2017


In [24]:
# Cell 12
# Sort for deterministic attempt numbering
date_col = 'BEDOMMELLESESDATO' if 'BEDOMMELLESESDATO' in df_work.columns else 'BEDOMMELSESDATO'
df_work = df_work.sort_values([ 'STUDIENR', 'KURSKODE', date_col ])

# Attempt number per (student, course)
df_work['attempt_no'] = df_work.groupby(['STUDIENR','KURSKODE']).cumcount() + 1
df_work['n_attempts'] = df_work.groupby(['STUDIENR','KURSKODE'])['KURSKODE'].transform('size')
df_repeats = df_work[df_work['n_attempts'] > 1].copy()

# Quick peek at repeats
df_repeats[['STUDIENR','KURSKODE','KURSTXT','Semester','BEDOMMELSE','SKALA','attempt_no','n_attempts']].head(20)


Unnamed: 0,STUDIENR,KURSKODE,KURSTXT,Semester,BEDOMMELSE,SKALA,attempt_no,n_attempts
22,STNR000003,1005,01005 Matematik 1,Spring 2018,IB,7-trinsskala,1,2
23,STNR000003,1005,01005 Matematik 1,Autumn 2018,7,7-trinsskala,2,2
32,STNR000003,26400,26400 Organisk kemi 1,Spring 2018,0,7-trinsskala,1,2
31,STNR000003,26400,26400 Organisk kemi 1,Autumn 2018,4,7-trinsskala,2,2
53,STNR000004,1005,01005 Matematik 1,Spring 2018,IB,7-trinsskala,1,2
54,STNR000004,1005,01005 Matematik 1,Spring 2019,10,7-trinsskala,2,2
57,STNR000004,2402,02402 Introduktion til statistik,Autumn 2020,IG,7-trinsskala,1,2
56,STNR000004,2402,02402 Introduktion til statistik,Spring 2021,7,7-trinsskala,2,2
142,STNR000007,11115,11115 Bygningsenergi og -installationer - Inte...,Autumn 2019,EM,7-trinsskala,1,2
143,STNR000007,11115,11115 Bygningsenergi og -installationer - Inte...,Autumn 2020,7,7-trinsskala,2,2


In [28]:
# Cell 13 (final): full semester-course summary (all courses, not only failed)

# --- 1️⃣ Build student-level status per course per semester ---
# A student counts as "failed" if they had at least one failed attempt
# and no passed attempt for that same course & semester.
student_sem_status = (
    df_work
    .groupby(['Semester', 'KURSKODE', 'KURSTXT', 'STUDIENR'], dropna=False)
    .agg(
        attempts=('KURSKODE', 'size'),
        any_pass=('passed', 'max'),
        any_fail=('failed', 'max')
    )
    .reset_index()
)

student_sem_status['student_failed'] = student_sem_status['any_fail'] & (~student_sem_status['any_pass'])
student_sem_status['student_passed'] = student_sem_status['any_pass']

# --- 2️⃣ Aggregate to course–semester level (students, fails, passes) ---
student_summary = (
    student_sem_status
    .groupby(['Semester', 'KURSKODE', 'KURSTXT'], dropna=False)
    .agg(
        n_students=('STUDIENR', 'nunique'),
        n_students_failed=('student_failed', 'sum'),
        n_students_passed=('student_passed', 'sum')
    )
    .reset_index()
)

# --- 3️⃣ Add attempt-level information (total attempts & failed attempts) ---
attempt_summary = (
    df_work
    .groupby(['Semester', 'KURSKODE', 'KURSTXT'], dropna=False)
    .agg(
        n_attempts=('KURSKODE', 'size'),
        n_failed_attempts=('failed', 'sum')
    )
    .reset_index()
)

# --- 4️⃣ Merge both summaries ---
course_semester_stats = (
    student_summary
    .merge(attempt_summary, on=['Semester','KURSKODE','KURSTXT'], how='left')
)

# --- 5️⃣ Compute failure rate (% of students who failed that term) ---
course_semester_stats['fail_rate_students'] = (
    course_semester_stats['n_students_failed'] / course_semester_stats['n_students']
).round(4)

# --- 6️⃣ Rank courses by number of failed students within each semester ---
course_semester_stats['rank_in_semester'] = (
    course_semester_stats.groupby('Semester')['n_students_failed']
    .rank(method='first', ascending=False)
)

# --- 7️⃣ Optional: top N per semester (set N = None for all) ---
N = None   # set to an integer like 15 if you only want the top 15 each term
if N:
    course_semester_stats = (
        course_semester_stats[course_semester_stats['rank_in_semester'] <= N]
        .sort_values(['Semester','n_students_failed'], ascending=[True, False])
        .reset_index(drop=True)
    )
else:
    course_semester_stats = (
        course_semester_stats
        .sort_values(['Semester','n_students_failed'], ascending=[True, False])
        .reset_index(drop=True)
    )

# Display the full table (first 50 rows for sanity)
course_semester_stats.head(50)



Unnamed: 0,Semester,KURSKODE,KURSTXT,n_students,n_students_failed,n_students_passed,n_attempts,n_failed_attempts,fail_rate_students,rank_in_semester
0,Autumn 2017,1901,01901 Basismat - Indledende matematik for dipl...,885,169,625,942,186,0.191,1.0
1,Autumn 2017,62337,62337 Geoteknik og fundering,161,56,80,163,57,0.3478,2.0
2,Autumn 2017,1920,01920 Basismat 2 - Videregående matematik for ...,218,42,153,218,42,0.1927,3.0
3,Autumn 2017,2631,02631 Introduktion til programmering og databe...,176,40,135,178,42,0.2273,4.0
4,Autumn 2017,62155,62155 Matematik basis 1,59,37,21,61,38,0.6271,5.0
5,Autumn 2017,1035,01035 Matematik 2,312,36,249,321,41,0.1154,6.0
6,Autumn 2017,62145,62145 Produkters dokumentation,94,36,53,96,38,0.383,7.0
7,Autumn 2017,62675,62675 Mekanik 1,108,36,44,114,37,0.3333,8.0
8,Autumn 2017,62157,62157 Matematik basis 3,59,31,28,59,31,0.5254,9.0
9,Autumn 2017,2450,02450 Introduktion til machine learning og dat...,343,29,295,346,30,0.0845,10.0


In [30]:
# Cell 15
import os

# Define export path (same folder as your original CSV)
base_dir = os.path.dirname(r"C:\Users\ianma\OneDrive\Escritorio\Project_Mining\Old_Dataset.csv")
cleaned_csv = os.path.join(base_dir, "Old_Dataset_CLEANED.csv")

# Save it — this includes your added columns:
# ['Semester', 'passed', 'failed', 'grade_num', 'attempt_no', 'n_attempts', ...]
df_work.to_csv(cleaned_csv, index=False, encoding='utf-8')

print(f"✅ Cleaned dataset saved successfully at:\n{cleaned_csv}")


✅ Cleaned dataset saved successfully at:
C:\Users\ianma\OneDrive\Escritorio\Project_Mining\Old_Dataset_CLEANED.csv
