In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# --- Load -------------------------------------------------------------------
# NOTE(review): 'adult_invome.csv' looks like a typo of 'adult_income.csv' —
# confirm against the actual file on disk before renaming.
file_path = 'adult_invome.csv'
df = pd.read_csv(file_path)

# --- Clean ------------------------------------------------------------------
# '?' is treated as a missing-value marker: convert it to NA so dropna() can
# remove the affected rows. Reassign instead of mutating in place.
df = df.replace('?', pd.NA)

# Drop rows with any missing value, then drop exact duplicate rows.
df_cleaned = df.dropna().drop_duplicates()

# --- Encode and scale -------------------------------------------------------
# One-hot encode the categorical columns; drop_first removes one level per
# category.
df_encoded = pd.get_dummies(df_cleaned, drop_first=True)

# Rescale the numeric columns to [0, 1].
# NOTE(review): the scaler is fit on the full dataset before the
# train/validation split done later in the notebook, so validation rows
# influence the scaling range — consider fitting on the training split only.
numeric_cols = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
scaler = MinMaxScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

# Check for 'income_>50K' column existence
if 'income_>50K' not in df_encoded.columns:
    raise ValueError("Target column 'income_>50K' is missing after encoding!")

# --- Exploratory plots ------------------------------------------------------
# Pairwise correlations between all encoded features.
plt.figure(figsize=(12, 8))
correlation_matrix = df_encoded.corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Box plot of the un-scaled age column to show its spread and outliers.
plt.figure(figsize=(8, 5))
sns.boxplot(x='age', data=df_cleaned)
plt.title('Age Distribution and Outliers')
plt.show()

# --- Persist ----------------------------------------------------------------
# Write the cleaned rows (NA-free, de-duplicated) rather than the raw frame,
# so that 'cleaned.csv' actually contains the cleaned data.
df_cleaned.to_csv('cleaned.csv', index=False)


In [3]:
# Draw a 10% random subsample of the encoded rows (fixed seed).
target_col = 'income_>50K'
df_sampled = df_encoded.sample(random_state=42, frac=0.1)

# Separate the dummy-encoded target column from the feature columns.
y = df_sampled[target_col]
X = df_sampled.drop(target_col, axis=1)

# Hold out 20% of the sampled rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [4]:
# Candidate classifiers, keyed by the display name used in the results table.
# The tree-based models are seeded so that repeated runs of the notebook give
# the same metrics; without random_state their results vary from run to run.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=10, random_state=42),
}

In [5]:
def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    The model is fitted in place. ROC-AUC is computed from the positive-class
    probability when the model exposes ``predict_proba``; otherwise from
    ``decision_function`` scores when available.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_val, y_val
        Validation features and labels used for every metric.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1-Score'`` and
        ``'ROC-AUC'``. ``'ROC-AUC'`` is ``None`` when the model provides
        neither ``predict_proba`` nor ``decision_function``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Pick a continuous score for ROC-AUC: column 1 of predict_proba is the
    # second class (the positive one for a binary target); decision_function
    # values are an accepted substitute for models without probabilities.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_val)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_val)
    else:
        y_score = None

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_score) if y_score is not None else None

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc
    }

In [None]:
# Fit and score every candidate model, keyed by its display name.
performance_results = {
    name: evaluate_model(estimator, X_train, y_train, X_val, y_val)
    for name, estimator in models.items()
}

# One row per model: transpose the metrics-by-model frame, then turn the
# model-name index into a regular column and label all columns.
performance_df = pd.DataFrame(performance_results).T.reset_index()
performance_df.columns = [
    'Model',
    'Accuracy',
    'Precision',
    'Recall',
    'F1-Score',
    'ROC-AUC',
]

# Display the performance measures
print(performance_df)

# Grouped bar chart: one group per model, one bar per metric.
ax = performance_df.set_index('Model').plot.bar(figsize=(12, 6))
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Scores')
plt.xticks(rotation=45)
ax.legend(loc='lower right')
plt.show()