In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Step 1: Read the turnover data and one-hot encode the three yes/no style columns
# (drop_first=True keeps one indicator per column to avoid redundant dummies).
df = pd.read_csv('/workspaces/Phuong5/1669242turnover.csv')
df_encoded = pd.get_dummies(
    df,
    columns=['Disciplined', 'Social_drinker', 'Social_smoker'],
    drop_first=True,
)

# Target and predictors, both cast to float so the estimator receives purely numeric input.
# BMI and Weight are left out because of multicollinearity.
y = df_encoded['Months_active'].astype(float)
X = df_encoded.drop(columns=['Months_active', 'BMI', 'Weight']).astype(float)

# Step 2: Plain ordinary-least-squares model
model = LinearRegression()

# Step 3: 5-fold cross-validation scored with mean absolute error.
# scikit-learn reports the *negative* MAE (scorers are maximised), so flip the sign
# to get the usual positive error per fold.
mae_scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')

# Average prediction error across the five folds
average_mae = mae_scores.mean()

# Report the per-fold and the averaged error
print("Mean Absolute Error (MAE) for each fold:", mae_scores)
print("Average Prediction Error (MAE):", average_mae)

Mean Absolute Error (MAE) for each fold: [2.39528241 2.13763369 2.27740749 2.234107   2.32644358]
Average Prediction Error (MAE): 2.2741748340964287


In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Prepare the data
# Read the raw turnover dataset
df = pd.read_csv('/workspaces/Phuong5/1669242turnover.csv')

# Dependent variable, cast to float
y = df['Months_active'].astype(float)

# Independent variables: remove the target plus BMI and Weight (multicollinearity),
# dummy-encode whatever categorical columns remain, and cast everything to float.
X = pd.get_dummies(
    df.drop(columns=['Months_active', 'BMI', 'Weight']),
    drop_first=True,
).astype(float)

# Predict with all features
# Number of cross-validation folds
cv = 5

# Cross-validated linear regression; the scorer returns negative MAE values
lm = LinearRegression()
lmscores = cross_val_score(lm, X, y, scoring='neg_mean_absolute_error', cv=cv)

# Overall mean absolute error: take magnitudes of the negative scores, then average
lmMAE = np.abs(lmscores).mean()

# Print the result
print(f"The average prediction error: {lmMAE:.2f} months")

The average prediction error: 2.27 months


In [5]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: Load dataset and prepare the data
df = pd.read_csv('/workspaces/Phuong5/1669242turnover.csv')

# Select the target (dependent variable)
y = df['Months_active'].astype(float)

# Select features (independent variables) and drop unnecessary columns
X = df.drop(columns=['Months_active', 'BMI', 'Weight'])  # Drop BMI and Weight due to multicollinearity

# Turn categorical variables into dummy variables and ensure all data is numeric
X = pd.get_dummies(X, drop_first=True).astype(float)

# Define the number of folds once so the grid search and the final evaluation
# are guaranteed to use the same cross-validation setting.
cv = 5

# Step 2: Normalize the data
# The scaler is the first step of a pipeline instead of being fitted on the whole
# dataset up front. Inside cross-validation the pipeline is re-fitted per fold, so
# the mean/std used for scaling come from the training rows only and the held-out
# rows never influence the preprocessing (no data leakage into the MAE estimate).
ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])

# Step 3: Find the best alpha using grid search with 5-fold cross-validation
# Pipeline parameters are addressed as '<step name>__<parameter>'.
param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Set up grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid,
                           scoring='neg_mean_absolute_error', cv=cv)

# Fit the grid search on the raw (unscaled) features; scaling happens inside the pipeline
grid_search.fit(X, y)

# Get the best alpha value
best_alpha = grid_search.best_params_['ridge__alpha']
print(f"The best alpha value found is: {best_alpha}")

# Step 4: Run ridge regression with the best alpha and calculate MAE (following the example)
# Same scale-then-ridge pipeline, now with the selected alpha fixed.
# NOTE: alpha was chosen on these same folds, so this MAE is a slightly optimistic
# estimate; a nested cross-validation would be needed for an unbiased one.
RidgeModel = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=best_alpha)),
])

# Run the cross-fold validation again (same as earlier)
scoresridge = cross_val_score(RidgeModel, X, y, scoring='neg_mean_absolute_error', cv=cv)

# Calculate the overall mean absolute error (scores are negative MAE values)
ridgeMAE = np.mean(np.absolute(scoresridge))

# Print the result
print(f"The average prediction error with ridge is: {ridgeMAE:.2f} months")

The best alpha value found is: 100
The average prediction error with ridge is: 2.24 months


In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import tensorflow as tf

# Step 1: Read the turnover dataset and build the modelling inputs
df = pd.read_csv('/workspaces/Phuong5/1669242turnover.csv')

# Dependent variable, cast to float
y = df['Months_active'].astype(float)

# Independent variables: everything except the target, BMI and Weight
# (the latter two are dropped due to multicollinearity); categorical columns
# are dummy-encoded and the whole frame is cast to float.
X = pd.get_dummies(
    df.drop(columns=['Months_active', 'BMI', 'Weight']),
    drop_first=True,
).astype(float)

# Step 2: Standardise the feature matrix (StandardScaler: zero mean, unit variance per column)
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Step 3: Define a function to create the neural network model
def create_model(input_shape, hidden_units=512, n_hidden_layers=5):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    input_shape : int
        Number of input features (the network takes vectors of this length).
    hidden_units : int, default 512
        Number of nodes in every hidden layer.
    n_hidden_layers : int, default 5
        Number of ReLU hidden layers stacked between input and output.

    Returns
    -------
    A compiled Keras ``Sequential`` model with a single linear output node,
    trained and evaluated with mean absolute error.

    With the default arguments the architecture is five hidden layers of 512
    nodes each.
    """
    # Input layer, followed by identical ReLU hidden layers, followed by one output node
    layers = [Input(shape=(input_shape,))]
    layers.extend(Dense(hidden_units, activation='relu') for _ in range(n_hidden_layers))
    layers.append(Dense(1))

    nnmodel = Sequential(layers)

    # Compile the model
    nnmodel.compile(loss='mean_absolute_error',  # Loss calculated with MAE
                    optimizer='adam',  # Adam optimizer
                    metrics=['mae'])  # Use MAE to evaluate the model

    return nnmodel

# Step 4: Perform 5-fold cross-validation manually
# Define the cross-validation (5 folds, as specified in Assignment 4)
# NOTE(review): the linear and ridge cells use cross_val_score(..., cv=5), which
# splits without shuffling, whereas these folds are shuffled. The MAEs are
# therefore measured on different splits and are not strictly comparable.
# TODO confirm whether the data contains repeated/near-duplicate rows; if it does,
# shuffled folds let the network see close copies of test rows during training.
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

# Initialize a list to store the MAE for each fold
mae_scores = []

# Convert X and y to numpy arrays so the fold indices can be used positionally.
# The raw (unscaled) features are used here: scaling is done inside each fold.
X_values = np.asarray(X, dtype=float)
y = np.array(y)

# Perform cross-validation
for train_index, test_index in kfold.split(X_values):
    # Split the data into training and testing sets
    X_train, X_test = X_values[train_index], X_values[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training rows only and apply it to both parts, so
    # the held-out rows never contribute to the scaling statistics (no leakage).
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X_train)
    X_test = fold_scaler.transform(X_test)

    # Drop Keras state left over from the previous fold's model, then seed the
    # Python, NumPy and TensorFlow generators so weight initialisation and batch
    # shuffling are repeatable from run to run.
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(42)

    # Create a new model for this fold
    model = create_model(input_shape=X_train.shape[1])

    # Train the model
    model.fit(X_train, y_train,
              epochs=100,  # Number of epochs
              batch_size=16,  # Batch size
              verbose=0)  # Suppress training output

    # Evaluate the model on the test set
    # predict returns shape (n_samples, 1); flatten to match y_test before subtracting
    y_pred = model.predict(X_test, verbose=0)
    mae = np.mean(np.abs(y_test - y_pred.flatten()))  # Calculate MAE

    # Store the MAE for this fold
    mae_scores.append(mae)

# Step 5: Calculate the average prediction error
nn_mae = np.mean(mae_scores)

# Print the result
print(f"The average prediction error with the neural network is: {nn_mae:.2f} months")

The average prediction error with the neural network is: 0.29 months
