# Importing Libraries

In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Loading the dataset

In [3]:
df = pd.read_csv("final_data.csv")
df

Unnamed: 0,net profit / total assets,total liabilities / total assets,working capital / total assets,current assets / short-term liabilities,[(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] 365,retained earnings / total assets,EBIT / total assets,book value of equity / total liabilities,sales / total assets,equity / total assets,...,(sales - cost of products sold) / sales,(current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation),total costs /total sales,long-term liabilities / equity,sales / inventory,sales / receivables,(short-term liabilities *365) / sales,sales / short-term liabilities,sales / fixed assets,Y
0,0.200550,0.37951,0.396410,2.04720,32.3510,0.363150,0.249760,1.33050,1.13890,0.504940,...,0.121960,0.397180,0.878040,0.001924,8.416000,5.13720,82.658,4.4158,7.42770,0.0
1,0.209120,0.49988,0.472250,1.94470,14.7860,0.000000,0.258340,0.99601,1.69960,0.497880,...,0.121300,0.420020,0.853000,0.000000,4.148600,3.27320,107.350,3.4000,20.63220,0.0
2,0.248660,0.69592,0.267130,1.55480,-1.1523,0.000000,0.309060,0.43695,1.30900,0.304080,...,0.241140,0.797572,0.765990,0.687575,4.990900,3.95100,134.270,2.7185,5.20780,0.0
3,0.081483,0.30734,0.458790,2.49280,51.9520,0.149880,0.092704,1.86610,1.05710,0.573530,...,0.054015,0.142070,0.945980,0.000000,4.574600,3.61470,86.435,4.2228,5.54970,0.0
4,0.187320,0.61323,0.229600,1.40630,-7.3128,0.187320,0.187320,0.63070,1.15590,0.386770,...,0.134850,0.484310,0.865150,0.124440,6.398500,4.31580,127.210,2.8692,7.89800,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42999,0.012898,0.70621,0.038857,1.17220,-18.9070,0.000000,0.013981,0.41600,1.67680,0.293790,...,0.020169,0.043904,1.012200,0.511631,13.472000,12.43200,49.117,7.4313,2.27990,1.0
43000,-0.165582,0.96702,-0.518476,0.16576,-67.3650,-0.160942,-0.190591,-0.40334,0.93979,-0.282769,...,-0.064073,0.585129,1.064100,-0.018084,33.517525,16.94315,81.220,4.4940,5.13050,1.0
43001,-0.165582,1.25530,-0.275990,0.74554,-120.4400,-0.160942,-0.154930,-0.26018,1.17490,-0.282769,...,0.148880,0.548240,0.851120,-0.306979,9.852600,3.48920,207.870,1.7559,9.95270,1.0
43002,-0.108860,0.74394,0.015449,1.08780,-17.0030,-0.108860,-0.109180,0.12531,0.84516,0.093224,...,-0.159538,-0.325296,1.160854,0.511631,13.886000,6.07690,83.122,4.3911,0.95575,1.0


# Train Test Split

In [4]:
X = df.drop(columns=['Y'])
y = df['Y']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardization

In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Functions for each feature selection method

### Method 1: PCA

In [6]:
def apply_pca(X_train, X_test, num_components):
    pca = PCA(n_components=num_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

### Method 2: Mutual Information Analysis

In [7]:
def apply_mutual_information(X_train, X_test, num_features):
    mi_scores = mutual_info_regression(X_train, y_train)
    selected_features_indices = mi_scores.argsort()[::-1][:num_features]
    X_train_mi = X_train[:, selected_features_indices]
    X_test_mi = X_test[:, selected_features_indices]
    return X_train_mi, X_test_mi

### Method 3: Pearson Correlation Coefficient

In [8]:
def apply_pearson_correlation(X_train, X_test, correlation_threshold):
    correlation_matrix = pd.DataFrame(X_train).corr()
    highly_correlated_pairs = {}
    for col in correlation_matrix.columns:
        correlated_cols = correlation_matrix.index[correlation_matrix[col] > correlation_threshold].tolist()
        correlated_cols.remove(col)
        highly_correlated_pairs[col] = correlated_cols
    columns_to_retain = set(range(X_train.shape[1]))
    for _, correlated_cols in highly_correlated_pairs.items():
        columns_to_retain.difference_update(correlated_cols)
    selected_features = list(columns_to_retain)
    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]
    return X_train_selected, X_test_selected

# Defining model

In [9]:
model = LinearRegression()

In [10]:
# Methods and parameters
methods = {
    'PCA': (apply_pca, {'num_components': 35}),
    'Mutual Information': (apply_mutual_information, {'num_features': 35}),
    'Pearson Correlation': (apply_pearson_correlation, {'correlation_threshold': 0.6}),
}

In [11]:
# Applying methods
for method, (apply_method, params) in methods.items():
    X_train_method, X_test_method = apply_method(X_train_scaled, X_test_scaled, **params)
    model.fit(X_train_method, y_train)
    y_pred = model.predict(X_test_method)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{method} Mean Squared Error (MSE): {mse}")

PCA Mean Squared Error (MSE): 0.045053345424157086
Mutual Information Mean Squared Error (MSE): 0.045120416072170155
Pearson Correlation Mean Squared Error (MSE): 0.04642464650033816
