In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

# ---------------------------------------------
# 🧪 Step 1: Create sample data
# ---------------------------------------------
# Five hand-written records, one tuple per person, in the column order
# given by `columns` below.
df = pd.DataFrame(
    [
        ('Male', 'Nairobi', 25, 50000),
        ('Female', 'Kisumu', 30, 60000),
        ('Female', 'Mombasa', 28, 58000),
        ('Male', 'Nairobi', 45, 80000),
        ('Female', 'Kisumu', 35, 75000),
    ],
    columns=['Gender', 'City', 'Age', 'Salary'],
)

# ---------------------------------------------
# 🔠 Step 2: Label Encoding for Gender
# ---------------------------------------------
# Learn the distinct Gender labels first, then map every row to its integer
# code. Codes follow the sorted label order, so Female=0 and Male=1.
le = LabelEncoder()
le.fit(df['Gender'])
df['Gender_encoded'] = le.transform(df['Gender'])

# ---------------------------------------------
# 🟦 Step 3: One-Hot Encoding for City
# ---------------------------------------------
# Replace City with indicator columns. drop_first=True omits the first
# category (Kisumu), leaving City_Mombasa and City_Nairobi; a row with
# both indicators off is therefore a Kisumu row.
df = pd.get_dummies(
    data=df,
    columns=['City'],
    drop_first=True,
)

# ---------------------------------------------
# 🎯 Step 4: Define features and target
# ---------------------------------------------
# Target: the Salary column.
y = df['Salary']
# Features: every remaining column except the raw Gender text (its encoded
# counterpart is kept) and the target itself.
X = df.drop(columns=['Gender', 'Salary'])

# ---------------------------------------------
# 🧪 Step 5: Train-Test Split
# ---------------------------------------------
# Hold out 20% of the rows for evaluation; with only 5 rows that is a single
# test row, so the metrics below are computed on one observation.
# random_state is fixed to make the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------
# ⚙️ Step 6: Apply Linear Regression + Ridge + Lasso
# ---------------------------------------------
# Fit the three estimators on the same training split. `fit` returns the
# estimator itself, so construction and fitting are chained.
# NOTE(review): the features are not standardized before the penalized fits;
# Ridge/Lasso penalties depend on feature scale — confirm this is intended.

# Linear Regression (no regularization)
lr = LinearRegression().fit(X_train, y_train)

# Ridge Regression (L2 regularization)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Lasso Regression (L1 regularization)
lasso = Lasso(alpha=1.0).fit(X_train, y_train)

# Predictions on the held-out split, one array per model.
y_pred_lr = lr.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)

# ---------------------------------------------
# 📊 Step 7: Evaluation
# ---------------------------------------------
# Report the mean squared error of each model on the held-out split
# (lower is better).
print("\n--- Model Performance ---")
print(
    "Linear Regression MSE:",
    mean_squared_error(y_true=y_test, y_pred=y_pred_lr),
)
print(
    "Ridge Regression MSE :",
    mean_squared_error(y_true=y_test, y_pred=y_pred_ridge),
)
print(
    "Lasso Regression MSE :",
    mean_squared_error(y_true=y_test, y_pred=y_pred_lasso),
)



--- Model Performance ---
Linear Regression MSE: 56250000.0
Ridge Regression MSE : 12747747.924576089
Lasso Regression MSE : 56118588.90130388
