# 📘 Linear Regression in a Ghanaian Context
Predicting Monthly Income based on Age, Education, Region, and Job Sector

In [None]:
# Cell 1: Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

### 🧪 Generate Synthetic Ghanaian Data

In [None]:
# Seed NumPy's global generator so every run produces the same dataset.
np.random.seed(42)
n = 500

# Income contribution of each category; the keys double as the set of
# categories that are sampled below.
education_map = {
    'SHS': 500,
    'Diploma': 1000,
    'Degree': 2000,
    'Masters': 3000,
}
job_bonus = {
    'Agriculture': 200,
    'Tech': 1500,
    'Education': 800,
    'Healthcare': 1000,
    'Trade': 700,
}
region_bonus = {
    'Greater Accra': 400,
    'Ashanti': 300,
    'Western': 250,
    'Northern': 100,
}


def _amounts_for(table, keys):
    """Return an array holding ``table[key]`` for every key in *keys*."""
    return np.array([table[key] for key in keys])


# Draw the raw features. The order of these calls determines the random
# stream, so it must stay: ages, education, regions, job sectors (then noise).
ages = np.random.randint(low=20, high=60, size=n)
education_levels = np.random.choice(list(education_map), size=n)
regions = np.random.choice(list(region_bonus), size=n)
job_sectors = np.random.choice(list(job_bonus), size=n)

# Translate each sampled category into its income contribution.
base_salary = _amounts_for(education_map, education_levels)
job_adjustment = _amounts_for(job_bonus, job_sectors)
region_adjustment = _amounts_for(region_bonus, regions)
noise = np.random.normal(loc=0, scale=200, size=n)

# Income = category contributions + 10 per year of age + Gaussian noise.
deterministic_income = base_salary + job_adjustment + region_adjustment + (ages * 10)
monthly_income = deterministic_income + noise

column_names = ['Age', 'Education', 'Region', 'Job Sector', 'Monthly Income']
column_values = [ages, education_levels, regions, job_sectors, monthly_income]
data = pd.DataFrame(dict(zip(column_names, column_values)))

data.head()

### 🔍 Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
print(data.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
# Pairwise relationships between the numeric columns, coloured by education.
sns.pairplot(data, hue='Education')

### 📊 Data Visualization

In [None]:
# Both plots share the same target column and canvas size.
income_col = 'Monthly Income'
fig_size = (10, 5)

# Distribution of income within each education level.
plt.figure(figsize=fig_size)
sns.boxplot(data=data, x='Education', y=income_col)
plt.title('Income by Education Level')

# Distribution of income within each job sector.
plt.figure(figsize=fig_size)
sns.boxplot(data=data, x='Job Sector', y=income_col)
plt.title('Income by Job Sector')

### 🧹 Preprocessing & Feature Engineering

In [None]:
# Separate the regression target from the feature columns.
y = data['Monthly Income']
X = data.drop(columns='Monthly Income')

# Feature groups, each handled by its own transformer.
num_cols = ['Age']
cat_cols = ['Education', 'Region', 'Job Sector']

# Numeric features are standardised; categorical features are one-hot encoded
# with the first level of each dropped, so each feature contributes one
# dummy column fewer than it has categories.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols),
    ]
)

# Chain preprocessing and the regressor so both are fitted and applied
# together through a single estimator.
model = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

### 🧠 Model Training and Evaluation

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the pipeline on the training rows, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute each metric once; RMSE is derived from the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

### 💾 Save the Model

In [None]:
# Persist the whole pipeline (preprocessing + regressor) as a single artifact
# so it can be reloaded and used on raw feature rows.
model_path = 'ghana_income_model.pkl'
joblib.dump(model, model_path)
print("Model saved as 'ghana_income_model.pkl'")

### 🤖 Load Model and Make a Prediction

In [None]:
# One hypothetical person, described with the same columns used for training.
sample_record = {
    'Age': 30,
    'Education': 'Degree',
    'Region': 'Greater Accra',
    'Job Sector': 'Tech',
}
sample = pd.DataFrame([sample_record])

# Reload the saved pipeline from disk and score the sample with it.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
loaded_model = joblib.load('ghana_income_model.pkl')
prediction = loaded_model.predict(sample)
predicted_income = prediction[0]

print(f"Predicted Monthly Income: GHS {predicted_income:.2f}")