## Required Libraries and Dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from collections import Counter


## Does education impact income?

In [None]:
# Cross-tabulate 'education' against 'income' (frequency count per combination)
education_income_table = pd.crosstab(census_combined_df['education'], census_combined_df['income'])
# Leave the table as the cell's last expression so Jupyter renders it with its
# rich DataFrame repr instead of the plain-text output of print()
education_income_table

In [None]:
# NOTE(review): this cell performs the same steps as the "SMOTE" section at the
# end of the notebook, but on census_combined_df; X_train/X_test/y_train/y_test
# are reassigned by a later cell in this section -- confirm this cell is still needed.

# Pull out the 'income' target and keep every other column as a feature
y = census_combined_df['income']
X = census_combined_df.drop(columns=['income'])

# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# of 'income' the same in the training and testing splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Oversample with SMOTE using the training split only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the class counts before and after resampling
print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_resampled))

In [None]:
# Visualize the proportions of each income bracket within every education level.
# Divide each row of the count table by its row total so every bar sums to 1.
education_income_prop = education_income_table.div(education_income_table.sum(axis=1), axis=0)

# Plot a stacked bar chart on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
education_income_prop.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Income Distribution by Education')
ax.set_ylabel('Proportion')
ax.set_xlabel('Education')
# NOTE(review): tick labels are assigned by position, which assumes the table has
# exactly four rows sorted in this order -- confirm against education_income_table.
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(
    ['Early Education', 'Secondary Education', 'Higher Education', 'Advanced Education'],
    rotation=90,
)

# Build the legend from the bar artists themselves so the legend colors always
# match the bars (hard-coded 'blue'/'orange' patches do not match the default
# plot colors). Labels are assigned by column position, as before.
legend_handles = ax.get_legend_handles_labels()[0]
ax.legend(legend_handles, ['<=50K', '>50K'], title='Income')
plt.show()

In [None]:
# Feature is 'education' and target is 'income'
X = census_combined_df[['education']]
y = census_combined_df['income']
# Pass the DataFrame itself to display(); display(print(...)) would print the
# frame as plain text and then display the None that print() returns
display(X.head())

# One-hot encode the education feature
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split the data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

# Pair each coefficient with its corresponding one-hot education category name;
# coef_[0] is the single coefficient row when the target is binary
education_feature_names = encoder.get_feature_names_out(['education'])
coef_df = pd.DataFrame({
    'Education Feature': education_feature_names,
    'Coefficient': model.coef_[0]
})

# Last expression of the cell: rendered with the rich DataFrame repr
coef_df

## Does Marital-Status impact Income?

In [None]:
# Cross-tabulate 'marital-status' against 'income' (frequency count per combination)
maritalStatus_income_table = pd.crosstab(census_combined_df['marital-status'], census_combined_df['income'])
# Leave the table as the cell's last expression so Jupyter renders it with its
# rich DataFrame repr instead of the plain-text output of print()
maritalStatus_income_table

In [None]:
# Visualize the proportions of each income bracket within every marital-status group.
# Divide each row of the count table by its row total so every bar sums to 1.
maritalStatus_income_prop = maritalStatus_income_table.div(maritalStatus_income_table.sum(axis=1), axis=0)

# Plot a stacked bar chart on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
maritalStatus_income_prop.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Income Distribution by Marital-Status')
ax.set_ylabel('Proportion')
ax.set_xlabel('Marital-Status')
# NOTE(review): tick labels are assigned by position, which assumes the table has
# exactly two rows sorted as not-married, married -- confirm against
# maritalStatus_income_table.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not-Married', 'Married'], rotation=90)

# Build the legend from the bar artists themselves so the legend colors always
# match the bars (hard-coded 'blue'/'orange' patches do not match the default
# plot colors). Labels are assigned by column position, as before.
legend_handles = ax.get_legend_handles_labels()[0]
ax.legend(legend_handles, ['<=50K', '>50K'], title='Income')
plt.show()

In [None]:
# Feature is 'marital-status' and target is 'income'
X = census_combined_df[['marital-status']]
y = census_combined_df['income']
# Pass the DataFrame itself to display(); display(print(...)) would print the
# frame as plain text and then display the None that print() returns
display(X.head())

# One-hot encode the marital-status feature
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split the data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

# Pair each coefficient with its corresponding one-hot marital-status category name;
# coef_[0] is the single coefficient row when the target is binary
maritalStatus_feature_names = encoder.get_feature_names_out(['marital-status'])
coef_df = pd.DataFrame({
    'Marital-Status Feature': maritalStatus_feature_names,
    'Coefficient': model.coef_[0]
})

# Last expression of the cell: rendered with the rich DataFrame repr
coef_df

## SMOTE

In [None]:
# Pull out the 'income' target and keep every other column as a feature
y = census_combined_simplified_df['income']
X = census_combined_simplified_df.drop(columns=['income'])

# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# of 'income' the same in the training and testing splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Oversample with SMOTE using the training split only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the class counts before and after resampling
print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_resampled))