In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import recommendations
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv("../../datasets/ANHUI/processed_data.csv")

In [3]:
df

Unnamed: 0,Student Number,Course Title,Course Credit,Grades,Course Semester,GPA,Completed Credits,Semester GPA,Semester Credit,Department Code
0,0,Ideological and Moral Cultivation and Legal Fo...,2.5,4,1,3.102564,19.5,3.102564,19.5,BLG
1,0,Success: Career Planning,1.0,3,1,3.102564,19.5,3.102564,19.5,BLG
2,0,Introduction to Computer Science,2.0,1,1,3.102564,19.5,3.102564,19.5,BLG
3,0,Advanced Mathematics A(1),5.5,4,1,3.102564,19.5,3.102564,19.5,BLG
4,0,College English A(1),4.0,3,1,3.102564,19.5,3.102564,19.5,BLG
...,...,...,...,...,...,...,...,...,...,...
19095,381,J2EE Framework,4.5,3,6,2.790076,131.0,2.800000,20.0,BLG
19096,381,Intellectual Property and Software Protection,1.0,4,7,2.838129,139.0,3.625000,8.0,BLG
19097,381,Human-Computer Interaction Technology,2.0,5,7,2.838129,139.0,3.625000,8.0,BLG
19098,381,Software Development and Testing Training,3.0,3,7,2.838129,139.0,3.625000,8.0,BLG


In [5]:
df = pd.concat([df, pd.get_dummies(df['Department Code'], prefix='Department Code')], axis=1)
df.drop(['Department Code'], axis=1, inplace=True)

In [6]:
df

Unnamed: 0,Student Number,Course Title,Course Credit,Grades,Course Semester,GPA,Completed Credits,Semester GPA,Semester Credit,Department Code_BLG
0,0,Ideological and Moral Cultivation and Legal Fo...,2.5,4,1,3.102564,19.5,3.102564,19.5,True
1,0,Success: Career Planning,1.0,3,1,3.102564,19.5,3.102564,19.5,True
2,0,Introduction to Computer Science,2.0,1,1,3.102564,19.5,3.102564,19.5,True
3,0,Advanced Mathematics A(1),5.5,4,1,3.102564,19.5,3.102564,19.5,True
4,0,College English A(1),4.0,3,1,3.102564,19.5,3.102564,19.5,True
...,...,...,...,...,...,...,...,...,...,...
19095,381,J2EE Framework,4.5,3,6,2.790076,131.0,2.800000,20.0,True
19096,381,Intellectual Property and Software Protection,1.0,4,7,2.838129,139.0,3.625000,8.0,True
19097,381,Human-Computer Interaction Technology,2.0,5,7,2.838129,139.0,3.625000,8.0,True
19098,381,Software Development and Testing Training,3.0,3,7,2.838129,139.0,3.625000,8.0,True


In [7]:
course_credits = {}
for row_idx in df.index:
    course_title = df.iloc[row_idx, 1]
    credit = df.iloc[row_idx, 2]    
    course_credits[course_title] = credit

In [8]:
def get_semester_data(semester_num):
    semester_data = {}   # semester data in shape {student_number: {course_title: grade, ...}, ...}
    
    # extracting the instances with the given semester_num from the main dataFrame
    dataset = df[df.iloc[:, 4] == semester_num]
    dataset.index = range(len(dataset)) 

    # filling the semester_data dictionary
    for row_idx in dataset.index:
        student_number = dataset.iloc[row_idx, 0]
        course_title = dataset.iloc[row_idx, 1]
        grade = dataset.iloc[row_idx, 3]
        
        semester_data.setdefault(student_number, {})
        semester_data[student_number][course_title] = grade
    
    return semester_data

In [9]:
def get_avg_gpa(train_semester, student):
    courses = train_semester[student]
    total_credit = 0
    weights = 0
    for course in courses:
        total_credit += course_credits[course]
        weights += courses[course] * course_credits[course]
    
    return weights / total_credit

In [10]:
def get_grade_stats(semester_data, student):
    grade_list = []
    
    for course in semester_data[student]:
        grade = semester_data[student][course]
        grade_list.append(grade)
    
    mean = np.mean(grade_list)
    std_dev = np.std(grade_list)
    
    return mean, std_dev

In [11]:
def fit_cluster(train_sems, num_clusters, training_data, cluster_model):

    train_dataset = pd.DataFrame(columns=df.columns)
    
    # extracting instances from the dataset which should be in training data
    for sem in train_sems:
        train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)
 
    cluster_features = train_dataset[['GPA', 'Completed Credits'] + list(train_dataset.columns[9:])]
    
    # fitting a clustering model based on GPA, Completed Credits and Departments
    fitted_cluster_model = cluster_model(n_clusters=num_clusters).fit(cluster_features)
    cluster_labels = fitted_cluster_model.labels_  
    
    cluster_dataset = {}   # splitting the train dataset into sub-dicts based on their predicted cluster label
    
    # assigning each students' data to their predicted clusters
    for i in range(len(cluster_labels)):
        cluster_dataset.setdefault(cluster_labels[i], {})
        student_number = train_dataset.iloc[i, 0]
        cluster_dataset[cluster_labels[i]][student_number] = training_data[student_number]
    
    return cluster_dataset, fitted_cluster_model

In [12]:
def cluster_test_data(fitted_cluster_model, semester_num):
    # extracting all instances with the given semester_num from the main dataFrame
    test_dataset = df[df.iloc[:, 4] == semester_num]
    test_dataset.index = range(len(test_dataset))
    
    # predicting the cluster labels of test data using a cluster model fitted on the train data so far
    cluster_features = test_dataset[['GPA', 'Completed Credits'] + list(test_dataset.columns[9:])]
    cluster_labels = fitted_cluster_model.predict(cluster_features) # predict the test data 
    
    # getting the semester data of available students in test semester
    semester_data = get_semester_data(semester_num)
    
    cluster_dataset = {}   # splitting the test dataset into sub-dicts based on their predicted cluster label
    
    # assigning each students' data to their predicted clusters
    for i in range(len(cluster_labels)):
        cluster_dataset.setdefault(cluster_labels[i], {})
        student_number = test_dataset.iloc[i, 0]
        cluster_dataset[cluster_labels[i]][student_number] = semester_data[student_number]
        
    return cluster_dataset    

In [None]:
def get_errors(train_semester, test_semester, sim, item_based):
    # print(f"train_semester: {train_semester}")
    average_gpa = {}
    y_true = []
    y_pred = []
    gpa = {}
    
    for student in train_semester:
        # print(f'student: {student}')
        gpa[student] = get_avg_gpa(train_semester, student)
    
    item_sims = recommendations.calculateSimilarItems(train_semester)
    # predicting recommended courses for each student in training data
    for student in train_semester:
        # print(f'{student} student')
        recommended_courses = {}
        
        if item_based:
            recs = recommendations.getRecommendedItems(train_semester, item_sims, student)
        else:
            # print('somthing')
            recs = recommendations.getRecommendations(train_semester, student, sim, dgpa=True, gpa=gpa, delta=0.7)
        
        # print(f'recs {recs}')

        for rec_grade, rec_course in recs:
            # print(f'recs: {recs}')
            recommended_courses.setdefault(rec_course, rec_grade)
            
        average_gpa.setdefault(student, get_avg_gpa(train_semester, student))
        
        # skipping students from training data who do not have not taken courses in test data
        if student not in test_semester:
            continue
        
        mean, std_dev = get_grade_stats(train_semester, student)

       # checking for students' test data records in recommended courses
        for course_title in test_semester[student]:
            # print(f"Course: {course_title}, Recommended Courses: {recommended_courses}")
            if course_title in recommended_courses:   # considering the predicted grade if course is available
                rec_grade = recommended_courses[course_title]
                if rec_grade < mean - (2 * std_dev) or rec_grade > mean + (2 * std_dev):
                    continue
                y_pred.append(rec_grade)
            else:   # considering the average GPA if course is not available in recommended courses
                rec_grade = average_gpa[student]
                if rec_grade < mean - (2 * std_dev) or rec_grade > mean + (2 * std_dev):
                    continue
                y_pred.append(rec_grade)
            y_true.append(test_semester[student][course_title])
            
    assert len(y_true) == len(y_pred)   
    return y_true, y_pred

In [14]:
def predict(sim, cluster_model, item_based=False):
    predictions = {} # storing error scores in a dict with shape: 
                      # {num_clusters (k=2,3,...,7): 
                        # {num_training_semesters (N=1,2,...,7): {'y_true': [], 'y_pred': []}, ...},
                      #...}
    
    sorted_semesters = sorted(set(df.iloc[:, 4]))   # sorting semesters in a time series manner
    for num_clusters in range(10, 31, 5):
        predictions.setdefault(str(num_clusters), {})
        train_semester = {}   # {student_number: {course_title: grade, ...}, ...}
        for sem_idx in range(1, len(sorted_semesters)): 
            predictions[str(num_clusters)].setdefault(str(sem_idx), {'y_true': [], 'y_pred': []})
            
            # combining all previous semesters in dataset and consider it as training semester
            new_semester = get_semester_data(sorted_semesters[sem_idx-1])
            for student in new_semester:
                if student in train_semester:   # combine data if a student already exist
                    train_semester[student].update(new_semester[student])
                else:   # create a new key-value pair for students with no record
                    train_semester[student] = new_semester[student]
            
            training_semesters_name = sorted_semesters[:sem_idx]   # names of all training semesters
            print(training_semesters_name)
            
            # getting the cluster model fitted on training data and each clusters' training data
            train_cluster_data, fitted_cluster_model = fit_cluster(training_semesters_name, num_clusters, train_semester, cluster_model)
            
            # getting the clustered test data
            test_semester_name = sorted_semesters[sem_idx]
            test_cluster_data = cluster_test_data(fitted_cluster_model, test_semester_name)
            
            # fitting each cluster label with a similarity metric, and measure the error between the same
            # cluster labels in training and test data
            for cluster_label in train_cluster_data:
                if cluster_label not in test_cluster_data:
                    continue
                y_true, y_pred = get_errors(train_cluster_data[cluster_label], test_cluster_data[cluster_label], sim, item_based)
                predictions[str(num_clusters)][str(sem_idx)]['y_true'] += y_true
                predictions[str(num_clusters)][str(sem_idx)]['y_pred'] += y_pred
        
    return predictions

In [47]:
model_predictions = {}

### User-based Collaborative Filtering

In [50]:
predictions = predict(recommendations.sim_distance, KMeans)
model_predictions['Euclidean Distance'] = predictions

[1]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4, 5]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4, 5, 6]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4, 5]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4, 5, 6]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4, 5]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4, 5, 6]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


[1, 2, 3, 4]


  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 4] == sem]], ignore_index=True)


KeyboardInterrupt: 

In [None]:
predictions = predict(recommendations.sim_jaccard, KMeans)
model_predictions['Jaccard Index'] = predictions

In [None]:
predictions = predict(recommendations.sim_pearson, KMeans)
model_predictions['Pearson Correlation'] = predictions

In [23]:
with open('student_base_clustering_and_userbased_collaborative_filtering.json', 'w') as fw:
    json.dump(model_predictions, fw, default=str)  

### Item-based Collaborative Filtering

In [24]:
model_predictions = {}

In [None]:
predictions = predict(recommendations.sim_distance, KMeans, item_based=True)
model_predictions['Euclidean Distance'] = predictions

In [None]:
predictions = predict(recommendations.sim_jaccard, KMeans, item_based=True)
model_predictions['Jaccard Index'] = predictions

In [None]:
predictions = predict(recommendations.sim_pearson, KMeans, item_based=True)
model_predictions['Pearson Correlation'] = predictions

In [29]:
with open('student_base_clustering_and_itembased_collaborative_filtering.json', 'w') as fw:
    json.dump(model_predictions, fw, default=str)  

# cf coverage ratio analizi icin

In [None]:
import numpy as np
import recommendations

# --- YARDIMCI FONKSİYON: Global İstatistik ---
def get_global_coverage_stats(train_semester, test_semester, is_item_based=True):
    covered_count = 0
    total_count = 0
    covered_sq_err = []
    fallback_sq_err = []
    
    # 1. Ortalamaları Hazırla
    average_gpa = {}
    for student in train_semester:
        average_gpa[student] = get_avg_gpa(train_semester, student)
        
    # 2. Benzerlik Matrisi (Tüm veri üzerinde)
    # Eğer Item-Based ise
    item_sims = {}
    if is_item_based:
        # recommendations dosyanızdaki calculateSimilarItems fonksiyonu
        # parametre alıyorsa n=10 verin, almıyorsa boş bırakın.
        try:
            item_sims = recommendations.calculateSimilarItems(train_semester, n=10)
        except TypeError:
            item_sims = recommendations.calculateSimilarItems(train_semester)

    # 3. Test Aşaması
    for student in train_semester: # Sadece geçmişi olan öğrenciler
        if student not in test_semester:
            continue
            
        # Önerileri Al
        recommended_courses = {}
        if is_item_based:
            recs = recommendations.getRecommendedItems(train_semester, item_sims, student)
        else:
            # User-Based Pearson (Varsayılan)
            recs = recommendations.getRecommendations(train_semester, student) # sim parametresi gerekebilir
            
        # Sözlüğe çevir: {Ders: Tahmin}
        for rec_grade, rec_course in recs:
            recommended_courses[rec_course] = rec_grade
            
        # Outlier kontrolü için istatistik
        mean, std_dev = get_grade_stats(train_semester, student)

        # Hedef dersleri kontrol et
        for course_title in test_semester[student]:
            actual_grade = test_semester[student][course_title]
            total_count += 1
            
            pred_grade = 0
            is_covered = False
            
            # --- KRİTİK KONTROL ---
            if course_title in recommended_courses:
                pred_grade = recommended_courses[course_title]
                is_covered = True
            else:
                pred_grade = average_gpa.get(student, 0)
                is_covered = False
                
            # Outlier Filtresi (Coverage hesabını bozmaması için buraya da ekliyoruz)
            if std_dev > 0:
                if pred_grade < mean - (2 * std_dev) or pred_grade > mean + (2 * std_dev):
                    continue # Tahmin geçersiz sayılır, pas geç
            
            # Hata Kaydı
            err_sq = (pred_grade - actual_grade) ** 2
            if is_covered:
                covered_count += 1
                covered_sq_err.append(err_sq)
            else:
                fallback_sq_err.append(err_sq)

    return covered_sq_err, fallback_sq_err

# --- ANA ÇALIŞTIRMA (NO CLUSTERING) ---
def analyze_anhui_global_coverage():
    print("Anhui Dataset - Global CF Coverage Analysis (No Clustering)...")
    
    all_covered = []
    all_fallback = []
    
    sorted_semesters = sorted(set(df.iloc[:, 4]))
    
    for sem_idx in range(1, len(sorted_semesters)):
        curr_sem_name = sorted_semesters[sem_idx]
        # print(f"Processing {curr_sem_name}...")
        
        # Train verisini kümülatif hazırla
        train_semester = {}
        for k in range(sem_idx):
             sem_data = get_semester_data(sorted_semesters[k])
             for stu in sem_data:
                 if stu in train_semester:
                     train_semester[stu].update(sem_data[stu])
                 else:
                     train_semester[stu] = sem_data[stu].copy()
        
        test_semester = get_semester_data(curr_sem_name)
        
        # Kümeleme yapmadan direkt hesapla
        cov_sq, fb_sq = get_global_coverage_stats(train_semester, test_semester, is_item_based=True)
        
        all_covered.extend(cov_sq)
        all_fallback.extend(fb_sq)
        
    # --- RAPOR ---
    total_valid = len(all_covered) + len(all_fallback)
    if total_valid == 0:
        print("HATA: Hiç geçerli veri bulunamadı.")
        return

    coverage_ratio = (len(all_covered) / total_valid) * 100
    rmse_cf = np.sqrt(np.mean(all_covered)) if all_covered else 0
    rmse_fb = np.sqrt(np.mean(all_fallback)) if all_fallback else 0
    
    print("\n" + "="*40)
    print(f"Total Instances:     {total_valid}")
    print(f"Covered by CF:       {len(all_covered)}")
    print(f"COVERAGE RATIO:      {coverage_ratio:.2f}% ")
    print("-" * 40)
    print(f"RMSE (Pure CF):      {rmse_cf:.4f}")
    print(f"RMSE (Fallback):     {rmse_fb:.4f}")
    print("="*40)

# Çalıştır
analyze_anhui_global_coverage()

Anhui Dataset - Global CF Coverage Analysis (No Clustering)...

Total Instances:     16044
Covered by CF:       0
COVERAGE RATIO:      0.00%  <-- BU DEĞERİ KULLAN
----------------------------------------
RMSE (Pure CF):      0.0000
RMSE (Fallback):     0.9879


In [19]:
def diagnose_data_overlap():
    sorted_semesters = sorted(set(df.iloc[:, 4]))
    print(f"Tanı Başlıyor... Toplam Dönem Sayısı: {len(sorted_semesters)}")
    
    for sem_idx in range(1, len(sorted_semesters)):
        curr_sem = sorted_semesters[sem_idx]
        prev_sems = sorted_semesters[:sem_idx]
        
        # 1. Eğitimdeki Dersleri Topla
        train_courses = set()
        for ps in prev_sems:
            # df.iloc[:, 4] == semester, df.iloc[:, 1] == course_title
            courses = df[df.iloc[:, 4] == ps].iloc[:, 1].unique()
            train_courses.update(courses)
            
        # 2. Testteki Dersleri Topla
        test_courses = set(df[df.iloc[:, 4] == curr_sem].iloc[:, 1].unique())
        
        # 3. Kesişime Bak
        intersection = train_courses.intersection(test_courses)
        
        print(f"\n--- Transition to Sem {curr_sem} ---")
        print(f"Train Unique Courses: {len(train_courses)}")
        print(f"Test Unique Courses:  {len(test_courses)}")
        print(f"OVERLAP (Intersection): {len(intersection)}")
        
        if len(intersection) == 0:
            print("⚠️ KRİTİK: Hiç ortak ders yok! CF çalışamaz.")

diagnose_data_overlap()

Tanı Başlıyor... Toplam Dönem Sayısı: 7

--- Transition to Sem 2 ---
Train Unique Courses: 8
Test Unique Courses:  9
OVERLAP (Intersection): 0
⚠️ KRİTİK: Hiç ortak ders yok! CF çalışamaz.

--- Transition to Sem 3 ---
Train Unique Courses: 17
Test Unique Courses:  8
OVERLAP (Intersection): 0
⚠️ KRİTİK: Hiç ortak ders yok! CF çalışamaz.

--- Transition to Sem 4 ---
Train Unique Courses: 25
Test Unique Courses:  7
OVERLAP (Intersection): 0
⚠️ KRİTİK: Hiç ortak ders yok! CF çalışamaz.

--- Transition to Sem 5 ---
Train Unique Courses: 32
Test Unique Courses:  6
OVERLAP (Intersection): 0
⚠️ KRİTİK: Hiç ortak ders yok! CF çalışamaz.

--- Transition to Sem 6 ---
Train Unique Courses: 38
Test Unique Courses:  8
OVERLAP (Intersection): 0
⚠️ KRİTİK: Hiç ortak ders yok! CF çalışamaz.

--- Transition to Sem 7 ---
Train Unique Courses: 46
Test Unique Courses:  4
OVERLAP (Intersection): 0
⚠️ KRİTİK: Hiç ortak ders yok! CF çalışamaz.


In [17]:
def check_course_overlap():
    sorted_semesters = sorted(set(df.iloc[:, 4])) # Dönemleri sırala
    print(f"Toplam Dönem Sayısı: {len(sorted_semesters)}")
    
    # Her bir test dönemi için kontrol et
    for i in range(1, len(sorted_semesters)):
        current_sem = sorted_semesters[i] # Test Dönemi (Örn: Dönem 2)
        
        # Geçmişteki TÜM dönemleri topla (Train)
        past_semesters = sorted_semesters[:i] 
        
        # 1. Geçmişteki (Train) Derslerin Listesi
        # df.iloc[:, 4] -> Dönem No, df.iloc[:, 1] -> Ders Adı
        train_courses = set(df[df.iloc[:, 4].isin(past_semesters)].iloc[:, 1].unique())
        
        # 2. Şu anki (Test) Derslerin Listesi
        test_courses = set(df[df.iloc[:, 4] == current_sem].iloc[:, 1].unique())
        
        # 3. Kesişim (Overlap)
        common_courses = train_courses.intersection(test_courses)
        
        print(f"\n--- Test Dönemi: {current_sem} ---")
        print(f"Eğitim Setindeki Ders Sayısı: {len(train_courses)}")
        print(f"Test Setindeki Ders Sayısı:   {len(test_courses)}")
        print(f"ORTAK DERS SAYISI (Overlap):  {len(common_courses)}")
        
        if len(common_courses) == 0:
            print("❌ SONUÇ: Ortak ders yok -> CF Tahmin Üretemez -> Coverage %0 olur.")
        else:
            print(f"✅ SONUÇ: {len(common_courses)} adet ortak ders var -> Tahmin yapılabilir.")

check_course_overlap()

Toplam Dönem Sayısı: 7

--- Test Dönemi: 2 ---
Eğitim Setindeki Ders Sayısı: 8
Test Setindeki Ders Sayısı:   9
ORTAK DERS SAYISI (Overlap):  0
❌ SONUÇ: Ortak ders yok -> CF Tahmin Üretemez -> Coverage %0 olur.

--- Test Dönemi: 3 ---
Eğitim Setindeki Ders Sayısı: 17
Test Setindeki Ders Sayısı:   8
ORTAK DERS SAYISI (Overlap):  0
❌ SONUÇ: Ortak ders yok -> CF Tahmin Üretemez -> Coverage %0 olur.

--- Test Dönemi: 4 ---
Eğitim Setindeki Ders Sayısı: 25
Test Setindeki Ders Sayısı:   7
ORTAK DERS SAYISI (Overlap):  0
❌ SONUÇ: Ortak ders yok -> CF Tahmin Üretemez -> Coverage %0 olur.

--- Test Dönemi: 5 ---
Eğitim Setindeki Ders Sayısı: 32
Test Setindeki Ders Sayısı:   6
ORTAK DERS SAYISI (Overlap):  0
❌ SONUÇ: Ortak ders yok -> CF Tahmin Üretemez -> Coverage %0 olur.

--- Test Dönemi: 6 ---
Eğitim Setindeki Ders Sayısı: 38
Test Setindeki Ders Sayısı:   8
ORTAK DERS SAYISI (Overlap):  0
❌ SONUÇ: Ortak ders yok -> CF Tahmin Üretemez -> Coverage %0 olur.

--- Test Dönemi: 7 ---
Eğitim Setinde

In [18]:
def debug_coverage():
    sorted_semesters = sorted(set(df.iloc[:, 4]))
    
    # Basit durum: İlk 2 dönem
    train_semester = get_semester_data(sorted_semesters[0])
    test_semester = get_semester_data(sorted_semesters[1])
    
    print(f"Train Öğrencileri: {len(train_semester)}")
    print(f"Test Öğrencileri: {len(test_semester)}")
    
    # Kesişim
    common_students = set(train_semester.keys()).intersection(set(test_semester.keys()))
    print(f"Ortak Öğrenciler: {len(common_students)}")
    
    if len(common_students) > 0:
        sample_student = list(common_students)[0]
        print(f"\nÖrnek Öğrenci {sample_student}:")
        print(f"  Train Dersleri: {list(train_semester[sample_student].keys())}")
        print(f"  Test Dersleri: {list(test_semester[sample_student].keys())}")
        print(f"  Ortak Dersler: {set(train_semester[sample_student].keys()).intersection(set(test_semester[sample_student].keys()))}")

debug_coverage()

Train Öğrencileri: 382
Test Öğrencileri: 382
Ortak Öğrenciler: 382

Örnek Öğrenci 0:
  Train Dersleri: ['Ideological and Moral Cultivation and Legal Foundation', 'Success: Career Planning', 'Introduction to Computer Science', 'Advanced Mathematics A(1)', 'College English A(1)', 'University Chinese', 'Mental Health Education for College Students', 'Ideological and Political Theory Course Practice1']
  Test Dersleri: ['C/C++ Language Programming', 'C/C++ Language Programming Course Design', 'Digital Logic Circuits', 'Introduction to the Basic Principles of Marxism', 'Basic Education for Entrepreneurship', 'Higher Mathematics A(2)', 'Cognitive Internship (IT Basic Training)', 'Ideological and Political Theory Course Practice2', 'College English A(2)']
  Ortak Dersler: set()
