# Machine Learning (Supervised) - Linear Regression

In [21]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, Lars, LassoLars,
                                  OrthogonalMatchingPursuit, BayesianRidge, ARDRegression,
                                  HuberRegressor, RANSACRegressor, TheilSenRegressor)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Importing the dataset
# The CSV location defaults to the original machine-specific path; set the
# BILLIONAIRES_CSV environment variable to run the notebook from another machine.
import os

DATA_PATH = os.environ.get(
    "BILLIONAIRES_CSV",
    "C:\\Users\\david\\OneDrive\\Documents\\GitHub\\Projects\\StatisticalAnalysis\\Billionaires Statistics Dataset.csv",
)
data_frame = pd.read_csv(DATA_PATH)

# Filtering the dataset
# Keep only the target (finalWorth) and the four candidate predictors
data_frame = data_frame[['finalWorth', 'category', 'age', 'country', 'industries']]

# Preview the filtered frame; a notebook cell only displays its last expression,
# so the preview has to come after the filtering step.
data_frame.head()


Preprocessing Data for Machine Learning: Options for Handling Missing Values
1) Remove rows with missing values.
2) Impute missing values with the mean or another summary statistic, then analyze how much the imputation affects the results.
3) Predictive imputation: use a model to predict the missing values.
4) Use a model that natively supports missing values.
   1) Linear regression is not such a model: it requires data with no missing values, so one of options 1-3 has to be applied first.

In [None]:
# Data Preprocessing
# One-hot encode the categorical predictors; every other column is carried over unchanged
categorical_columns = ['category', 'country', 'industries']
data_frame = pd.get_dummies(data_frame, columns=categorical_columns)

# Feature Scaling
# Standardize the two numeric columns together.
# NOTE(review): the scaler is fitted on every row, before the train/test split that
# happens in a later cell, and it also rescales the target (finalWorth), so the
# reported errors are in standardized units -- confirm this is intended.
numeric_columns = ['age', 'finalWorth']
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data_frame[numeric_columns])
data_frame[numeric_columns] = standardized_values

data_frame.head()


In [18]:
# Feature Selection and Train-Test Split

# Selecting the target variable and the features
X = data_frame.drop(columns='finalWorth')  # Features / independent variables
y = data_frame['finalWorth']  # Target / dependent variable

# Split the data into training and testing sets.
# The split is done BEFORE imputation so that no statistic is computed from test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use mean imputation: learn the column means from the training rows only,
# then fill the gaps of both splits with those training means.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Keep X as a fully imputed array (as before) for any later cell that reads it;
# it is filled with the same training-set means.
X = imputer.transform(X)

# Model Training and Evaluation
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, score it on the test split and print a report.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place, so the caller's object is trained after the call.
    X_train, X_test : feature matrices of the training and test splits.
    y_train, y_test : target values matching ``X_train`` and ``X_test``.

    Returns
    -------
    dict
        ``{'model': class name, 'mse': mean squared error, 'r2': R-squared}``,
        both metrics computed on the test split. The same values are printed, so
        callers that ignore the return value see the same report as before.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # The class name is the label; a pipeline therefore reports as its wrapper class
    model_name = model.__class__.__name__
    print(f'Model: {model_name}')
    print(f'Mean Squared Error: {mse}')
    print(f'R-squared: {r2}')
    print('\n')

    # Returning the metrics lets callers collect and rank models instead of
    # reading the scores back from the printed output.
    return {'model': model_name, 'mse': mse, 'r2': r2}



In [None]:
# List of models
# Every estimator keeps its default settings, except that the two estimators that rely
# on random subsampling (RANSAC and Theil-Sen) get a fixed seed -- the same value used
# for the train/test split -- so that re-running the notebook reports the same scores.
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    Lars(),
    LassoLars(),
    OrthogonalMatchingPursuit(),
    BayesianRidge(),
    ARDRegression(),
    HuberRegressor(),
    RANSACRegressor(random_state=42),
    TheilSenRegressor(random_state=42),
    # Degree-2 polynomial expansion of the features followed by ordinary least squares
    make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
]

# Train and evaluate all models
for model in models:
    train_and_evaluate(model, X_train, X_test, y_train, y_test)

In [None]:
# Visualizing the results for a specific model (e.g., OLS)
# Refit an ordinary least-squares model and predict the held-out test rows
model = LinearRegression()
y_pred = model.fit(X_train, y_train).predict(X_test)

# One point per test row: x is the actual value, y is the model's prediction
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel='Actual Values', ylabel='Predicted Values', title='Actual vs Predicted')
plt.show()