In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('Income Analysis.csv')

df['Education_Level'] = df['Education_Level'].replace({
    "Master's": 'Master',
    'High School': 'HighSchool',
    "Bachelor's": 'Bachelor',
    'Doctorate': 'Doctorate'
})
df['Type_of_Housing'] = df['Type_of_Housing'].replace({
    'Apartment': 'Apartment',
    'Single-family home': 'SingleFamilyHome',
    'Townhouse': 'Townhouse'
})
df['Employment_Status'] = df['Employment_Status'].replace({
    'Full-time': 'FullTime',
    'Self-employed': 'SelfEmployed',
    'Part-time': 'PartTime'
})
df['Primary_Mode_of_Transportation'] = df['Primary_Mode_of_Transportation'].replace({
    'Public transit': 'PublicTransit'
})


In [3]:
X = df.drop('Income', axis=1)  
y = df['Income']  

In [4]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=f_regression, k=8))
])


In [5]:
X_preprocessed = pipeline.fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)


In [6]:
def tune_model(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_


In [7]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [8]:
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Tune Random Forest
rf_model = RandomForestRegressor(random_state=42)
best_rf = tune_model(rf_model, rf_param_grid, X_train, y_train)

# Tune Gradient Boosting
gb_model = GradientBoostingRegressor(random_state=42)
best_gb = tune_model(gb_model, gb_param_grid, X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for RandomForestRegressor: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters for GradientBoostingRegressor: {'learning_rate': 0.01, 'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}


In [9]:
tuned_models = {
    'Best Random Forest': best_rf,
    'Best Gradient Boosting': best_gb
}

for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name}:')
    print(f'  Mean Squared Error: {mse}')
    print(f'  R^2 Score: {r2}')
    print('-' * 40)

Best Random Forest:
  Mean Squared Error: 3401293910801.29
  R^2 Score: -0.004052823473921663
----------------------------------------
Best Gradient Boosting:
  Mean Squared Error: 3334628900270.621
  R^2 Score: 0.015626508511377923
----------------------------------------


In [10]:
if r2_score(y_test, best_rf.predict(X_test)) > r2_score(y_test, best_gb.predict(X_test)):
    best_model = best_rf
    best_model_name = 'Best Random Forest'
else:
    best_model = best_gb
    best_model_name = 'Best Gradient Boosting'

df['predicted_income'] = best_model.predict(pipeline.transform(X))

In [11]:
print(f'The best model is: {best_model_name}')
print(df.head())

The best model is: Best Gradient Boosting
   Age Education_Level  Occupation  Number_of_Dependents Location  \
0   56          Master  Technology                     5    Urban   
1   69      HighSchool     Finance                     0    Urban   
2   46        Bachelor  Technology                     1    Urban   
3   32      HighSchool      Others                     2    Urban   
4   60        Bachelor     Finance                     3    Urban   

   Work_Experience Marital_Status Employment_Status  Household_Size  \
0               21        Married          FullTime               7   
1                4         Single          FullTime               7   
2                1         Single          FullTime               7   
3               32        Married          FullTime               1   
4               15        Married      SelfEmployed               4   

  Homeownership_Status   Type_of_Housing  Gender  \
0                  Own         Apartment    Male   
1           