In [None]:
# Call run_model function from model.py file
import pandas as pd
import sys
import os   
sys.path.append('/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data') 
from model import run_model
from sklearn.model_selection import train_test_split

In [None]:
# Load the penguins dataset into a DataFrame and show its column names.
model = pd.read_csv(
    '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv'
)
model.columns

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import make_classification 
from sklearn.ensemble import RandomForestClassifier   
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder


    
def run_model(datapath, target_col, val_size=0.2, test_size=0.2, random_state=42):
    """Train and evaluate three classifiers on a CSV dataset.

    The CSV is loaded, '.' placeholders are treated as missing values, and
    every row with a missing feature or missing target is dropped *before*
    the train/test split so that ``test_size`` is honoured.  Numeric features
    are standardized and categorical features are ordinal-encoded, both
    fitted on the training split only.  Logistic regression, a random forest
    and a histogram gradient boosting classifier are each scored with 5-fold
    cross-validation on the training split, refitted on the whole training
    split, and evaluated on the held-out test split (classification report
    and ROC AUC).  All results are printed.

    Args:
        datapath: Path of the CSV file to load.
        target_col: Name of the column holding the class label.
        val_size: Currently unused; validation is done with 5-fold
            cross-validation.  Kept so existing callers keep working.
        test_size: Fraction (or absolute number) of rows held out for
            testing, forwarded to ``train_test_split``.
        random_state: Seed used for the split and for every model.

    Returns:
        None.
    """
    # Local import: OrdinalEncoder is not part of the file-level imports.
    from sklearn.preprocessing import OrdinalEncoder

    # Load the dataset; '.' is used as a missing-value placeholder.
    data = pd.read_csv(datapath).replace('.', np.nan)

    # Drop rows with a missing feature or a missing target before splitting,
    # so the requested test fraction applies to the rows actually used.
    data = data.dropna()

    # Separate features and target variable
    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Encode a non-numeric target.  Fitting on the full label column avoids a
    # failure when a class only shows up in the test split.
    target_names = None
    if not pd.api.types.is_numeric_dtype(y):
        target_encoder = LabelEncoder()
        y = pd.Series(target_encoder.fit_transform(y), index=y.index, name=target_col)
        target_names = [str(label) for label in target_encoder.classes_]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Work on copies so the assignments below never write into a view of X.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Standardize the numeric features (statistics come from the training split)
    numeric_cols = X.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        scaler = StandardScaler()
        X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
        X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

    # Ordinal-encode the categorical features.  Categories that only occur in
    # the test split are mapped to -1 instead of raising.
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        feature_encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value', unknown_value=-1
        )
        X_train[categorical_cols] = feature_encoder.fit_transform(X_train[categorical_cols])
        X_test[categorical_cols] = feature_encoder.transform(X_test[categorical_cols])

    # Logistic regression: k-fold cross-validation, then fit on the whole training set
    model = LogisticRegression(random_state=random_state)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    model.fit(X_train, y_train)
    print("For Logistic Regression:")
    _report_test_metrics(model, X_test, y_test, target_names)

    # Random forest: k-fold cross-validation, then fit on the whole training set
    rf_model = RandomForestClassifier(random_state=random_state)
    cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
    print("Random Forest Classifier Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    rf_model.fit(X_train, y_train)
    _report_test_metrics(rf_model, X_test, y_test, target_names)

    # Histogram gradient boosting: k-fold cross-validation, then fit on the whole training set
    hgb_model = HistGradientBoostingClassifier(random_state=random_state)
    cv_scores = cross_val_score(hgb_model, X_train, y_train, cv=5)
    print("HistGradientBoostingClassifier Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    hgb_model.fit(X_train, y_train)
    _report_test_metrics(hgb_model, X_test, y_test, target_names)


def _report_test_metrics(fitted_model, X_test, y_test, target_names=None):
    """Print the classification report and ROC AUC of a fitted model on the test split.

    Args:
        fitted_model: Fitted classifier exposing ``predict``, ``predict_proba``
            and ``classes_``.
        X_test: Preprocessed test features.
        y_test: True test labels (integer codes when the target was encoded).
        target_names: Class names ordered by integer code, or None when the
            target was not encoded.
    """
    from sklearn.metrics import roc_auc_score

    # Predictions must come from the model being reported on.
    y_pred = fitted_model.predict(X_test)
    if target_names is not None:
        # Passing labels explicitly keeps the report valid even when a class
        # is absent from the test split.
        report = classification_report(
            y_test,
            y_pred,
            labels=list(range(len(target_names))),
            target_names=target_names,
        )
    else:
        report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)

    # ROC AUC is computed from class probabilities, not from hard predictions.
    y_proba = fitted_model.predict_proba(X_test)
    if y_proba.shape[1] == 2:
        # Binary case: score with the probability of the second class.
        roc_auc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        roc_auc = roc_auc_score(
            y_test, y_proba, multi_class='ovr', labels=fitted_model.classes_
        )
    print("ROC AUC:", roc_auc)
    print("--------------------------------")
     

In [None]:
# Reload the penguins data, then run run_model on the same CSV with 'sex'
# as the target column (run_model does the k-fold cross-validation).
model = pd.read_csv(
    '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv'
)
# Bare expression; in a notebook only a cell's last expression is echoed.
model.columns
run_model(
    '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv',
    target_col='sex',
)



In [None]:
# Second run of the same pipeline: reload the penguins data and call
# run_model with 'sex' as the target column (k-fold cross-validation
# happens inside run_model).
model = pd.read_csv(
    '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv'
)
# Bare expression; in a notebook only a cell's last expression is echoed.
model.columns
run_model(
    '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv',
    target_col='sex',
)



In [None]:
from split_and_downsample import process_minority_proportion
from synthetic_script import generate_synthetic_data
import pandas as pd
import numpy as np

# Source CSV and the directory handed to process_minority_proportion
# (presumably where its output files are written; confirm in that module).
input_file = '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv'
output_dir = '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/temp_synth_data'

# Treat '.' as a missing value and drop the rows whose target ('sex') is missing.
data = (
    pd.read_csv(input_file)
    .replace('.', np.nan)
    .dropna(subset=['sex'])
)

# Write the cleaned rows to a temporary CSV that the downsampling step reads.
temp_input_file = '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/temp_cleaned_penguins.csv'
data.to_csv(temp_input_file, index=False)

# Split and downsample the cleaned dataset on the 'sex' column; 0.3 is
# presumably the requested minority proportion (see split_and_downsample).
process_minority_proportion(
    temp_input_file,
    'sex',
    0.3,
    output_dir,
    categorical_cols=None,
    mode='independent',
)

In [None]:
# Check for near duplicates with cleanlab on the original and synthetic data
import cleanlab
from cleanlab import Datalab
import pandas as pd
import numpy as np


def _build_feature_matrix(frame, label_name):
    """Return a standardized numeric feature array for all columns except the label.

    Non-numeric columns are one-hot encoded, every column is standardized,
    and missing values are replaced by 0 (the column mean after
    standardization) so the row count stays aligned with ``frame``.
    """
    features = pd.get_dummies(frame.drop(columns=[label_name])).astype(float)
    # Constant columns have zero spread; divide by 1 instead to avoid inf/NaN.
    spread = features.std(ddof=0).replace(0, 1.0)
    standardized = (features - features.mean()) / spread
    return standardized.fillna(0.0).to_numpy()


def _find_near_duplicates(frame, label_name):
    """Run cleanlab's near-duplicate check on ``frame``.

    Returns a tuple ``(count, percentage, indices)`` where ``indices`` is the
    index of the flagged rows in cleanlab's issue table (presumably the row
    positions within ``frame`` - confirm against the cleanlab version in use).
    """
    frame = frame.reset_index(drop=True)
    lab = Datalab(data=frame, label_name=label_name)
    # The near-duplicate check needs numeric features (or a knn graph), and
    # cleanlab names the issue type "near_duplicate" (singular).
    lab.find_issues(
        features=_build_feature_matrix(frame, label_name),
        issue_types={"near_duplicate": {}},
    )
    issues = lab.get_issues("near_duplicate")
    # get_issues returns a table; the boolean flag lives in this column.
    flagged = issues[issues["is_near_duplicate_issue"]]
    count = int(len(flagged))
    percentage = (count / len(frame)) * 100 if len(frame) else 0.0
    return count, percentage, flagged.index.tolist()


print("=== NEAR DUPLICATE DETECTION ANALYSIS ===")

# 1. Load and analyze original dataset
print("\n1. ORIGINAL DATASET ANALYSIS")
print("Loading original dataset...")
original_data = pd.read_csv('/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv')
# '.' marks a missing value; rows without a target label are dropped.
original_data = original_data.replace('.', np.nan)
original_data = original_data.dropna(subset=['sex'])

print(f"Original dataset shape: {original_data.shape}")
print(f"Columns: {original_data.columns.tolist()}")

num_duplicates_orig, duplicate_percentage_orig, duplicate_indices_orig = (
    _find_near_duplicates(original_data, 'sex')
)

print(f"Near duplicates found in original data: {num_duplicates_orig}")
if num_duplicates_orig > 0:
    print(f"Percentage of duplicates in original: {duplicate_percentage_orig:.2f}%")
    print(f"Duplicate indices: {duplicate_indices_orig}")

print("\n" + "="*60)

# 2. Load and analyze synthetic dataset
print("\n2. SYNTHETIC DATASET ANALYSIS")
synthetic_file = '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/temp_synth_data/minority_30pct.csv'

# Reset explicitly so a DataFrame left over from an earlier notebook run is
# never mistaken for freshly loaded data.
synthetic_data = None
try:
    synthetic_data = pd.read_csv(synthetic_file)
except FileNotFoundError:
    print(f"Synthetic dataset not found at: {synthetic_file}")
    print("Make sure you've run the synthetic data generation first!")
    num_duplicates_synth = 0
    duplicate_percentage_synth = 0
else:
    print(f"Synthetic dataset shape: {synthetic_data.shape}")

    # The percentage is always computed, so the summary below can use it
    # even when no duplicates are found.
    num_duplicates_synth, duplicate_percentage_synth, duplicate_indices_synth = (
        _find_near_duplicates(synthetic_data, 'sex')
    )

    print(f"Near duplicates found in synthetic data: {num_duplicates_synth}")
    if num_duplicates_synth > 0:
        print(f"Percentage of duplicates in synthetic: {duplicate_percentage_synth:.2f}%")
        print(f"Duplicate indices: {duplicate_indices_synth}")

print("\n" + "="*60)

# 3. Comparison and Summary
print("\n3. COMPARISON SUMMARY")
print(f"Original dataset duplicates: {num_duplicates_orig} ({duplicate_percentage_orig:.2f}%)")
if synthetic_data is not None:
    print(f"Synthetic dataset duplicates: {num_duplicates_synth} ({duplicate_percentage_synth:.2f}%)")

    # Quality comparison
    print("\n4. DATA QUALITY INSIGHTS")
    if num_duplicates_orig == 0 and num_duplicates_synth == 0:
        print("✅ Both datasets have no near duplicates - good quality!")
    elif num_duplicates_orig > 0 and num_duplicates_synth == 0:
        print("✅ Synthetic data improved quality by eliminating duplicates")
    elif num_duplicates_orig == 0 and num_duplicates_synth > 0:
        print("⚠️ Synthetic data generation introduced duplicates")
    else:
        print("ℹ️ Both datasets contain some near duplicates")
        if duplicate_percentage_synth < duplicate_percentage_orig:
            print("✅ Synthetic data has fewer duplicates than original")
        else:
            print("⚠️ Synthetic data has more duplicates than original")

print("\n" + "="*60)
print("Analysis complete!")



