In [None]:
#Import Required Libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from patsy import build_design_matrices, dmatrix
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


In [None]:
# Read the Wage dataset from a path relative to the working directory,
# then preview the first five rows as the cell's displayed output.
wage = pd.read_csv(filepath_or_buffer='Wage.csv')
wage.head(n=5)

In [None]:
# Structure of the frame: column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would emit a stray "None" line.
wage.info()

# Summary statistics for every column (numeric and categorical). Left as the
# cell's last expression so the notebook renders it as a rich table.
wage.describe(include='all')

## Exploring Categorical Predictors


In [None]:
# Side-by-side boxplots of wage against each categorical predictor.

# (column on the x-axis, panel title) for each subplot, left to right
panel_specs = [
    ('maritl', 'Wage vs Marital Status'),
    ('jobclass', 'Wage vs Job Class'),
]

fig, axes = plt.subplots(nrows=1, ncols=len(panel_specs), figsize=(14, 6))

for ax, (column, panel_title) in zip(axes, panel_specs):
    sns.boxplot(data=wage, x=column, y='wage', ax=ax)
    ax.set_title(panel_title)
    # Tilt the category labels so long names do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()


## Non-Linear Modeling: Age and Wage


In [None]:
# Cubic B-Spline for Age
#
# NOTE: patsy's `bs(...)` builds a B-spline basis (cubic here, degree=3), not a
# natural spline; the natural cubic spline basis in patsy is `cr(...)`. The
# labels below describe the model that is actually fit.

# Create a cubic B-spline basis for age with 4 degrees of freedom.
# The returned DataFrame carries a `.design_info` attribute that remembers the
# knots and boundaries chosen from the observed ages.
age_spline = dmatrix(
    "bs(age, df=4, degree=3, include_intercept=False)",
    {"age": wage['age']},
    return_type='dataframe',
)

# Fit a linear model on the spline basis
spline_model = sm.OLS(wage['wage'], age_spline).fit()

# Evenly spaced ages spanning the observed range, used for the fitted curve
age_grid = np.linspace(wage['age'].min(), wage['age'].max(), 100)

# Build the prediction basis from the *training* design info. Calling
# dmatrix("bs(... df=4 ...)") again on the grid would re-derive the knots from
# the grid's own quantiles, so the fitted coefficients would be applied to a
# different basis than the one they were estimated on.
age_spline_pred = build_design_matrices(
    [age_spline.design_info],
    {"age": age_grid},
    return_type='dataframe',
)[0]
wage_pred = spline_model.predict(age_spline_pred)

# Observed data with the fitted curve drawn over it
plt.figure(figsize=(8, 6))
plt.scatter(wage['age'], wage['wage'], facecolor='None', edgecolor='k', alpha=0.3)
plt.plot(age_grid, wage_pred, color='red', linewidth=2)
plt.xlabel('Age')
plt.ylabel('Wage')
plt.title('Cubic B-Spline Fit: Wage vs Age')
plt.show()


## Interaction Effects: Age and Job Class


In [None]:
# Spline by Job Class
#
# Fits a separate cubic B-spline (4 df) of wage on age within each job class
# and overlays the fitted curves on the full scatter of observations.

SPLINE_FORMULA = "bs(age, df=4, degree=3, include_intercept=False)"

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(wage['age'], wage['wage'], facecolor='None', edgecolor='k', alpha=0.2)

# sort=False keeps the job classes in order of first appearance
for job, job_df in wage.groupby('jobclass', sort=False):
    # Basis built from this group's ages; its design_info remembers the knots
    job_basis = dmatrix(SPLINE_FORMULA, {"age": job_df['age']}, return_type='dataframe')
    job_model = sm.OLS(job_df['wage'], job_basis).fit()

    # Predict only over the ages observed in this group: the memorised spline
    # boundaries come from the group's data, and bs() does not extrapolate
    # beyond them.
    job_grid = np.linspace(job_df['age'].min(), job_df['age'].max(), 100)

    # Reuse the training design info so prediction uses the same knots as the
    # fit; a fresh dmatrix() call on the grid would recompute them from the
    # grid's quantiles and mis-align the coefficients.
    job_basis_pred = build_design_matrices(
        [job_basis.design_info],
        {"age": job_grid},
        return_type='dataframe',
    )[0]
    job_pred = job_model.predict(job_basis_pred)

    ax.plot(job_grid, job_pred, label=f'Job Class: {job}')

ax.set_xlabel('Age')
ax.set_ylabel('Wage')
ax.set_title('Spline Fits: Wage vs Age by Job Class')
ax.legend()
plt.show()

## Summary of Findings

- **Marital Status:** There are visible differences in wage distributions among marital status groups.
- **Job Class:** Job class also shows clear separation in wage, with 'Information' jobs tending to have higher wages.
- **Age Relationship:** The relationship between age and wage is non-linear, with wages increasing and then plateauing or decreasing at higher ages.
- **Interaction:** The wage vs. age pattern differs across job classes, suggesting an interaction effect between age and job class.

**Non-linear modeling techniques such as splines can reveal flexible, realistic patterns in the data that would be missed by linear models.**
