# This notebook builds a predictive machine learning model to find out which of the variables "x1" through "x81" best predict Financial Distress.

# This is a streamlined process for creating a predictive model using machine learning:
1. Data Preparation
Import Libraries: Load necessary libraries.
Load the Dataset: Read the CSV file into a DataFrame.
Explore the Dataset: Display rows, summary statistics, and structure.
Prepare Features and Target Variables: Separate features and target variable.
Create a Subset of Data: Extract selected key features.
2. Data Preprocessing
Additional Imports: Import functions for modeling and preprocessing.
Split Data: Divide data into training and testing sets.
Scale Features: Standardize features for improved performance.
Generate Polynomial Features: Create squared and interaction features.
Create Lag Features: Generate lagged versions of key features.
Scale Polynomial Features: Standardize polynomial features.
Combine Final Features: Merge and drop missing values.
Scale Final Features: Standardize the final feature set.
3. Model Training
Train the Model: Fit the RandomForestRegressor using training data.
4. Feature Importance Analysis
Determine Feature Importances: Extract and sort feature importance.
Visualize Feature Importances: Create bar plots of top features.
Enhanced Visualization: Use color palettes for clarity.
5. Correlation and Insights
Correlation Analysis: Compute correlation matrix for features.
Visualize Top Factors: Plot top correlated features.
Enhanced Visualization: Improve readability with distinct colors.
6. Final Preparation
Prepare Final Target Variable: Extract target variable for evaluation.

In [None]:
# 1 
# Load essential libraries for data manipulation, numerical operations, and visualization.
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# 2
# Load the dataset from its CSV file into the DataFrame `df`, then echo it
# so the raw contents can be inspected.
df = pd.read_csv(filepath_or_buffer="Financial Distress Edited.csv")

print(df)

In [None]:
# 3
# Display the first few rows, the summary statistics, and the dataset's
# structure and data types.
# head() and describe() only return their result; as bare expressions that are
# not the last line of the cell they would be discarded, so they are printed
# explicitly. info() writes its report to stdout itself.
print(df.head())
print(df.describe())
df.info()

In [None]:
# 4
# Import specific functions from sklearn for building models, splitting data, and preprocessing.
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 

In [None]:
# 5
# Build the pairwise correlation matrix of all columns, then pull out the
# "Financial Distress" column and order it from the strongest positive
# correlation downwards.
correlation_matrix = df.corr()
target_correlation = (
    correlation_matrix.loc[:, "Financial Distress"]
    .sort_values(ascending=False)
)

In [None]:
# 6
# Bar plot of entries 1..10 of the sorted correlations (entry 0 is skipped:
# it is the first value of the descending sort), to support feature selection.
plt.figure(figsize=(10, 8))
sns.barplot(
    y=target_correlation.index[1:11],
    x=target_correlation[1:11],
)
plt.gca().set_title("Top 10 Factors Linked to Financial Distress")
plt.show()

In [None]:
# 7
# Same plot as the previous cell, except that each bar gets its own colour
# for improved readability and aesthetics.
plt.figure(figsize=(10, 8))
# Ten distinct colours taken from the "husl" palette, one per bar.
colors = sns.color_palette(palette="husl", n_colors=10)
sns.barplot(
    y=target_correlation.index[1:11],
    x=target_correlation[1:11],
    palette=colors,
)
plt.gca().set_title("Top 10 Factors Linked to Financial Distress")
plt.show()

In [None]:
# 8
# Split the table into the target (the "Financial Distress" column) and the
# features (every column except the target, "Company" and "Time").
y = df.loc[:, "Financial Distress"]
X = df.drop(["Financial Distress", "Company", "Time"], axis=1)

In [None]:
# 9
# Hold out 20% of the rows as a test set so the model can be evaluated on
# data it has not seen; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# 10
# Standardize the features (zero mean, unit standard deviation).
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 11
# Create a 100-tree random forest regressor with a fixed seed and fit it on
# the scaled training data.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# 12
# Read the per-feature importances from the fitted forest, rank the feature
# positions from most to least important, and keep the names of the top ten.
feature_importances = rf_model.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]
top_features = X.columns[sorted_indices[:10]]

In [None]:
# 13
# Bar plot of the ten largest feature importances reported by the model.
plt.figure(figsize=(10, 8))
sns.barplot(
    y=top_features,
    x=feature_importances[sorted_indices[:10]],
)
plt.gca().set_title("RandomForest's Top 10 Feature Importances")
plt.show()

# Show the model's top features next to the top correlations for comparison.
top_features, target_correlation[1:11]

In [None]:
# 14
# Feature importance plot again, this time with one distinct colour per bar.

plt.figure(figsize=(10, 8))
# Ten distinct colours taken from the "husl" palette, one per bar.
colors = sns.color_palette(palette="husl", n_colors=10)
sns.barplot(
    y=top_features,
    x=feature_importances[sorted_indices[:10]],
    palette=colors,
)
plt.gca().set_title("RandomForest's Top 10 Feature Importances")
plt.show()

# Show the model's top features next to the top correlations for comparison.
top_features, target_correlation[1:11]

In [None]:
# 15
# Prepare to create polynomial and interaction features to capture more complex relationships.
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# 16
# Names of the columns that feed the polynomial and lag feature engineering.
key_features = [
    "x81",
    "x48",
    "x25",
    "x10",
    "x9",
]

In [None]:
# 17
# Extract the selected key features from the original dataset for further processing.
# An explicit copy is taken because new columns are added to `selected_data`
# later on; assigning into a bare `df[...]` selection makes pandas emit a
# SettingWithCopyWarning.
selected_data = df[key_features].copy()

In [None]:
# 18
# Expand the key features to degree 2 (the original columns, their squares and
# their pairwise products); no constant bias column is added.
poly = PolynomialFeatures(include_bias=False, degree=2)
poly.fit(selected_data)
poly_features = poly.transform(selected_data)

In [None]:
# 19
# Ask the fitted transformer for the names of its output columns, using the
# key feature names as the input names.
poly_feature_names = poly.get_feature_names_out(input_features=key_features)

In [None]:
# 20
# Convert the polynomial features into a DataFrame for further analysis.
# The row labels of `selected_data` are carried over so that the frame stays
# aligned with it when the two are later concatenated column-wise (concat
# matches rows by index label, not by position).
poly_df = pd.DataFrame(
    poly_features,
    columns=poly_feature_names,
    index=selected_data.index,
)

In [None]:
# 21
# Generate lagged versions (1 and 2 rows back) of each key feature to help the
# model capture time-based patterns.
# The column names interpolate the feature name with braces; with parentheses
# the f-string is a constant and every feature would overwrite the same two
# columns, leaving only the lags of the last feature.
# NOTE(review): shift() runs over the whole frame, so if the rows hold several
# companies stacked one after another, the first rows of each company receive
# lag values from the previous company - a per-"Company" shift is probably
# what is wanted; confirm against the row ordering of the dataset.
lag_columns = {}
for feature in key_features:
    for lag in (1, 2):
        lag_columns[f"{feature}_lag{lag}"] = selected_data[feature].shift(lag)

# assign() returns a new frame, which avoids writing into a possible slice of df.
selected_data = selected_data.assign(**lag_columns)


In [None]:
# 22
# Standardize the polynomial features to maintain consistency with the original feature set.
# A dedicated scaler is used: `scaler` has already been fitted on X_train, and
# calling fit_transform on it here would overwrite those training statistics.
# The row labels of `poly_df` are kept so the scaled frame stays aligned with it.
poly_scaler = StandardScaler()
poly_df_scaled = pd.DataFrame(
    poly_scaler.fit_transform(poly_df),
    columns=poly_feature_names,
    index=poly_df.index,
)

In [None]:
# 23
# Merge the scaled polynomial features with the lag features, dropping any rows
# that contain missing values (the lag columns are NaN where no earlier row exists).
# The degree-2 expansion without bias already contains the key features
# themselves, so only the additional columns of `selected_data` are appended;
# appending the key features again would produce duplicate column labels
# (one scaled and one unscaled copy of each).
lag_features = selected_data.drop(columns=key_features)
final_features = pd.concat([poly_df_scaled, lag_features], axis=1).dropna()

In [None]:
# 24
# Further standardize the combined final features for use in modeling.
# - A dedicated scaler is used so that `scaler`, which was fitted on X_train,
#   keeps its training statistics.
# - The row labels of `final_features` are preserved. Rows were removed by
#   dropna(), so a fresh 0..n-1 index would no longer match a target selected
#   via `final_features.index`.
final_scaler = StandardScaler()
final_features_scaled = pd.DataFrame(
    final_scaler.fit_transform(final_features),
    columns=final_features.columns,
    index=final_features.index,
)

In [None]:
# 26
# Select the target values for exactly the rows that are present in
# `final_features` (its index holds the labels that survived dropna()).
final_target = df.loc[final_features.index, "Financial Distress"]

# Preview the first rows of the final feature table and of the target.
final_features_scaled.head(), final_target.head()