LSTM


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# Load the raw price history; only the 'Date' and 'Close' columns are used.
data = pd.read_csv('AMZN.csv')

data
# Take an explicit copy: the column assignments below would otherwise write
# into a slice of the original frame and raise SettingWithCopyWarning.
data = data[['Date', 'Close']].copy()
data
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device


# Integer-encode every non-numeric column (values are cast to str first).
# LabelEncoder numbers the distinct strings in sorted order, so the 'Date'
# codes follow time only if the date strings sort chronologically
# (e.g. ISO 'YYYY-MM-DD') - NOTE(review): confirm the CSV's date format.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column].astype(str))

plt.plot(data['Date'], data['Close'])
from copy import deepcopy as dc

def prepare_dataframe_for_lstm(df, n_steps):
    """Build a table of the closing price plus its previous ``n_steps`` values.

    Args:
        df: frame with a 'Date' and a 'Close' column; it is not modified.
        n_steps: number of lagged columns to add.

    Returns:
        A new frame indexed by 'Date' with columns 'Close', 'Close(t-1)',
        ..., 'Close(t-n_steps)'. Rows containing any NaN (which includes the
        first ``n_steps`` rows, whose history is incomplete) are dropped.
    """
    frame = dc(df).set_index('Date')
    close = frame['Close']

    for lag in range(1, n_steps + 1):
        frame[f'Close(t-{lag})'] = close.shift(lag)

    return frame.dropna()

lookback = 7
shifted_df = prepare_dataframe_for_lstm(data, lookback)
shifted_df
shifted_df_as_np = shifted_df.to_numpy()

shifted_df_as_np
shifted_df_as_np.shape
from sklearn.preprocessing import MinMaxScaler

# Chronological split point: the first 95% of the rows are training data.
# It is computed before scaling so the scaler can be fitted on those rows only.
split_index = int(len(shifted_df_as_np) * 0.95)

split_index
# Fit the scaler on the training rows only; fitting on every row would leak
# the test period's min/max into the training features. Test values may
# therefore fall slightly outside [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(shifted_df_as_np[:split_index])
shifted_df_as_np = scaler.transform(shifted_df_as_np)

shifted_df_as_np
# Column 0 is the target 'Close'; the remaining columns are the lagged closes.
X = shifted_df_as_np[:, 1:]
y = shifted_df_as_np[:, 0]

X.shape, y.shape
# Columns arrive as t-1 ... t-lookback; flip so each window runs oldest -> newest.
X = dc(np.flip(X, axis=1))
X
X_train = X[:split_index]
X_test = X[split_index:]

y_train = y[:split_index]
y_test = y[split_index:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape
# The LSTM is fed (samples, lookback, 1); targets become column vectors.
X_train = X_train.reshape((-1, lookback, 1))
X_test = X_test.reshape((-1, lookback, 1))

y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

X_train.shape, X_test.shape, y_train.shape, y_test.shape
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).float()
X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).float()

X_train.shape, X_test.shape, y_train.shape, y_test.shape
from torch.utils.data import Dataset

class TimeSeriesDataset(Dataset):
    """Pairs each input window with its target so a DataLoader can batch them."""

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied or converted.
        self.X, self.y = X, y

    def __len__(self):
        # The number of samples is defined by the inputs.
        return len(self.X)

    def __getitem__(self, i):
        window = self.X[i]
        target = self.y[i]
        return window, target

# Wrap the tensors so a DataLoader can index (window, target) pairs.
train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)
train_dataset
from torch.utils.data import DataLoader

batch_size = 16

# Training batches are shuffled; the evaluation loader keeps the row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Sanity check: print the shapes of the first training batch only.
for x_batch, y_batch in train_loader:
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
    print(x_batch.shape, y_batch.shape)
    break
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        num_stacked_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers

        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers,
                            batch_first=True)

        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per input sequence, shape (batch, 1)."""
        batch_size = x.size(0)
        # Allocate the zero initial states on the input's own device and dtype
        # rather than on a module-level `device` global, so the forward pass
        # keeps working wherever the model and its input actually live.
        state_shape = (self.num_stacked_layers, batch_size, self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (h0, c0))
        # Only the output of the final time step feeds the regression head.
        out = self.fc(out[:, -1, :])
        return out

# One feature per time step, 4 hidden units, a single LSTM layer.
model = LSTM(input_size=1, hidden_size=4, num_stacked_layers=1).to(device)
model
def train_one_epoch():
    """Run one optimisation pass over ``train_loader``.

    Works on the module-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``, and reads the module-level loop variable ``epoch`` for the
    header line. Prints the mean loss of every 100 consecutive batches.
    """
    model.train()
    print(f'Epoch: {epoch + 1}')
    window_loss = 0.0

    for batch_number, (inputs, targets) in enumerate(train_loader, start=1):
        inputs = inputs.to(device)
        targets = targets.to(device)

        loss = loss_function(model(inputs), targets)
        window_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report, then reset, after each full window of 100 batches.
        if batch_number % 100 == 0:
            print('Batch {0}, Loss: {1:.3f}'.format(batch_number,
                                                    window_loss / 100))
            window_loss = 0.0
    print()
def validate_one_epoch():
    """Print the mean per-batch loss of ``model`` over ``test_loader``.

    Puts the module-level ``model`` in evaluation mode and disables gradient
    tracking while the batches are scored.
    """
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            total_loss += loss_function(model(inputs), targets).item()

    mean_loss = total_loss / len(test_loader)

    print('Val Loss: {0:.3f}'.format(mean_loss))
    print('***************************************************')
    print()
# Optimisation settings.
num_epochs = 10
learning_rate = 0.001
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# `epoch` must stay a module-level name: train_one_epoch() reads it.
for epoch in range(num_epochs):
    train_one_epoch()
    validate_one_epoch()
def _unscale_close(scaled_close):
    """Map scaled 'Close' values back to the original price scale.

    ``scaler`` was fitted on ``lookback + 1`` columns with 'Close' in column 0,
    so the values are placed in column 0 of a zero-filled matrix, inverse
    transformed, and column 0 is read back.
    """
    scaled_close = np.asarray(scaled_close, dtype=float).ravel()
    padded = np.zeros((scaled_close.shape[0], lookback + 1))
    padded[:, 0] = scaled_close
    return scaler.inverse_transform(padded)[:, 0].copy()


# Inference only: evaluation mode, and no autograd graph is needed.
model.eval()
with torch.no_grad():
    predicted = model(X_train.to(device)).to('cpu').numpy()

# Fit on the training split, still in scaled units.
plt.plot(y_train, label='Actual Close')
plt.plot(predicted, label='Predicted Close')
plt.xlabel('Day')
plt.ylabel('Close')
plt.legend()
plt.show()

train_predictions = _unscale_close(predicted)
train_predictions
new_y_train = _unscale_close(y_train.numpy())
new_y_train
# Same comparison in price units.
plt.plot(new_y_train, label='Actual Close')
plt.plot(train_predictions, label='Predicted Close')
plt.xlabel('Day')
plt.ylabel('Close')
plt.legend()
plt.show()
with torch.no_grad():
    test_predictions = model(X_test.to(device)).cpu().numpy().flatten()

test_predictions = _unscale_close(test_predictions)
test_predictions
new_y_test = _unscale_close(y_test.numpy())
new_y_test
# Held-out split in price units.
plt.plot(new_y_test, label='Actual Close')
plt.plot(test_predictions, label='Predicted Close')
plt.xlabel('Day')
plt.ylabel('Close')
plt.legend()
plt.show()
#@title pytorch-r^2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from torch.utils.data import Dataset, DataLoader
from copy import deepcopy as dc

# Load the data
data = pd.read_csv('E:/AMZN.csv', encoding='latin1')

# Preprocess the data
# Take an explicit copy: the column assignments below would otherwise write
# into a slice of the original frame and raise SettingWithCopyWarning.
data = data[['Date', 'Close']].copy()

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Integer-encode every non-numeric column (values are cast to str first).
# LabelEncoder numbers the distinct strings in sorted order, so the 'Date'
# codes follow time only if the date strings sort chronologically
# - NOTE(review): confirm the CSV's date format.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Plotting the data
plt.plot(data['Date'], data['Close'])
plt.xlabel('Date')
plt.ylabel('Close')
plt.show()

# Prepare data for LSTM
def prepare_dataframe_for_lstm(df, n_steps):
    """Return a 'Date'-indexed copy of ``df`` with ``n_steps`` lagged closes.

    Adds columns 'Close(t-1)' ... 'Close(t-n_steps)', each holding 'Close'
    shifted down by that many rows, then drops every row containing a NaN.
    The input frame is left untouched.
    """
    frame = dc(df).set_index('Date')
    lagged = {
        f'Close(t-{step})': frame['Close'].shift(step)
        for step in range(1, n_steps + 1)
    }
    return frame.assign(**lagged).dropna()

lookback = 7
shifted_df = prepare_dataframe_for_lstm(data, lookback)

# Chronological split point: the first 95% of the rows are training data.
shifted_df_as_np = shifted_df.to_numpy()
split_index = int(len(shifted_df_as_np) * 0.95)

# Normalize the data. The scaler is fitted on the training rows only;
# fitting on every row would leak the test period's min/max into training.
# Test values may therefore fall slightly outside [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(shifted_df_as_np[:split_index])
shifted_df_as_np = scaler.transform(shifted_df_as_np)

# Column 0 is the target 'Close'; the remaining columns are the lagged closes.
X = shifted_df_as_np[:, 1:]
y = shifted_df_as_np[:, 0]

# Columns arrive as t-1 ... t-lookback; flip so each window runs oldest -> newest.
X = dc(np.flip(X, axis=1))

# Train-test split
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

# Reshaping data for LSTM: (samples, lookback, 1) inputs, column-vector targets
X_train = X_train.reshape((-1, lookback, 1))
X_test = X_test.reshape((-1, lookback, 1))
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

# Convert to torch tensors
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).float()
X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).float()

# Create custom Dataset
class TimeSeriesDataset(Dataset):
    """Index-aligned (inputs, targets) pair exposed through the Dataset API."""

    def __init__(self, X, y):
        # References are kept as given; no copy is made.
        self.X, self.y = X, y

    def __len__(self):
        # Length follows the inputs.
        return len(self.X)

    def __getitem__(self, i):
        features, label = self.X[i], self.y[i]
        return features, label

train_dataset, test_dataset = (
    TimeSeriesDataset(X_train, y_train),
    TimeSeriesDataset(X_test, y_test),
)

# DataLoader for batching: training batches are shuffled, the evaluation
# loader keeps the row order.
batch_size = 16
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# LSTM model
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        num_stacked_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per input sequence, shape (batch, 1)."""
        batch_size = x.size(0)
        # Allocate the zero initial states on the input's own device and dtype
        # rather than on a module-level `device` global, so the forward pass
        # keeps working wherever the model and its input actually live.
        state_shape = (self.num_stacked_layers, batch_size, self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        out, _ = self.lstm(x, (h0, c0))
        # Only the output of the final time step feeds the regression head.
        out = self.fc(out[:, -1, :])
        return out

# Initialize model
# One feature per time step, 4 hidden units, a single LSTM layer.
model = LSTM(input_size=1, hidden_size=4, num_stacked_layers=1).to(device)

# Accuracy calculation metrics
def calculate_accuracy(y_true, y_pred):
    """Return ``(MAE, RMSE, R^2)`` of ``y_pred`` measured against ``y_true``.

    RMSE is the square root of scikit-learn's mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )

# Training and validation
num_epochs = 10
learning_rate = 0.001
# Mean-squared error on the scaled targets, optimised with Adam.
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

def train_one_epoch():
    """Run one optimisation pass over ``train_loader``.

    Works on the module-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``, and reads the module-level loop variable ``epoch`` for the
    header line. Prints the mean loss of every 100 consecutive batches.
    """
    model.train()
    print(f'Epoch: {epoch + 1}')
    window_loss = 0.0

    for batch_number, (inputs, targets) in enumerate(train_loader, start=1):
        inputs = inputs.to(device)
        targets = targets.to(device)
        loss = loss_function(model(inputs), targets)
        window_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report, then reset, after each full window of 100 batches.
        if batch_number % 100 == 0:
            print('Batch {0}, Loss: {1:.3f}'.format(batch_number, window_loss / 100))
            window_loss = 0.0

def validate_one_epoch():
    """Print the mean per-batch loss of ``model`` over ``test_loader``.

    Puts the module-level ``model`` in evaluation mode and disables gradient
    tracking while the batches are scored.
    """
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            total_loss += loss_function(model(inputs), targets).item()

    mean_loss = total_loss / len(test_loader)
    print('Val Loss: {0:.3f}'.format(mean_loss))
    print('***************************************************')
    print()

# Training loop
# `epoch` is read as a global by train_one_epoch() for its header line, so the
# loop variable's name must not change. Each pass trains once and then prints
# the validation loss.
for epoch in range(num_epochs):
    train_one_epoch()
    validate_one_epoch()

# Prediction on training data
def _unscale_close(scaled_close):
    """Map scaled 'Close' values back to the original price scale.

    ``scaler`` was fitted on ``lookback + 1`` columns with 'Close' in column 0,
    so the values are placed in column 0 of a fresh zero-filled matrix,
    inverse transformed, and column 0 is read back. A fresh matrix is used on
    every call so no previously transformed values are fed through again.
    """
    scaled_close = np.asarray(scaled_close, dtype=float).ravel()
    padded = np.zeros((scaled_close.shape[0], lookback + 1))
    padded[:, 0] = scaled_close
    return scaler.inverse_transform(padded)[:, 0].copy()


# Inference only: evaluation mode, and no autograd graph is needed.
model.eval()
with torch.no_grad():
    predicted = model(X_train.to(device)).to('cpu').numpy()
    test_predictions = model(X_test.to(device)).cpu().numpy().flatten()

train_predictions = predicted.flatten()

# Rescale training predictions and targets to price units
train_predictions_rescaled = _unscale_close(train_predictions)
new_y_train_rescaled = _unscale_close(y_train.numpy())

# Calculate accuracy for training
mae_train, rmse_train, r2_train = calculate_accuracy(new_y_train_rescaled, train_predictions_rescaled)
print(f'Training Accuracy - MAE: {mae_train:.3f}, RMSE: {rmse_train:.3f}, R²: {r2_train:.3f}')

# Rescale test predictions and targets to price units
test_predictions_rescaled = _unscale_close(test_predictions)
new_y_test_rescaled = _unscale_close(y_test.numpy())

# Calculate accuracy for testing
mae_test, rmse_test, r2_test = calculate_accuracy(new_y_test_rescaled, test_predictions_rescaled)
print(f'Test Accuracy - MAE: {mae_test:.3f}, RMSE: {rmse_test:.3f}, R²: {r2_test:.3f}')

# Plot the results
plt.plot(new_y_train_rescaled, label='Actual Close (Train)')
plt.plot(train_predictions_rescaled, label='Predicted Close (Train)')
plt.xlabel('Day')
plt.ylabel('Close')
plt.legend()
plt.show()

plt.plot(new_y_test_rescaled, label='Actual Close (Test)')
plt.plot(test_predictions_rescaled, label='Predicted Close (Test)')
plt.xlabel('Day')
plt.ylabel('Close')
plt.legend()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from torch.utils.data import Dataset, DataLoader
from copy import deepcopy as dc

# Load the data
data = pd.read_csv('E:/AMZN.csv', encoding='latin1')

# Preprocess the data
# Take an explicit copy: the column assignments below would otherwise write
# into a slice of the original frame and raise SettingWithCopyWarning.
data = data[['Date', 'Close']].copy()

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Integer-encode every non-numeric column (values are cast to str first).
# LabelEncoder numbers the distinct strings in sorted order, so the 'Date'
# codes follow time only if the date strings sort chronologically
# - NOTE(review): confirm the CSV's date format.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Plotting the data
plt.plot(data['Date'], data['Close'])
plt.xlabel('Date')
plt.ylabel('Close')
plt.show()

# Prepare data for LSTM
def prepare_dataframe_for_lstm(df, n_steps):
    """Turn a Date/Close frame into rows of (close, previous n_steps closes).

    The input is deep-copied, indexed by 'Date', extended with columns
    'Close(t-1)' ... 'Close(t-n_steps)' (the 'Close' column shifted down by
    that many rows), and stripped of any row that contains a NaN.
    """
    table = dc(df).set_index('Date')
    target = table['Close']
    lag = 1
    while lag <= n_steps:
        table[f'Close(t-{lag})'] = target.shift(lag)
        lag += 1
    return table.dropna()

lookback = 7
shifted_df = prepare_dataframe_for_lstm(data, lookback)

# Chronological split point: the first 95% of the rows are training data.
shifted_df_as_np = shifted_df.to_numpy()
split_index = int(len(shifted_df_as_np) * 0.95)

# Normalize the data. The scaler is fitted on the training rows only;
# fitting on every row would leak the test period's min/max into training.
# Test values may therefore fall slightly outside [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(shifted_df_as_np[:split_index])
shifted_df_as_np = scaler.transform(shifted_df_as_np)

# Column 0 is the target 'Close'; the remaining columns are the lagged closes.
X = shifted_df_as_np[:, 1:]
y = shifted_df_as_np[:, 0]

# Columns arrive as t-1 ... t-lookback; flip so each window runs oldest -> newest.
X = dc(np.flip(X, axis=1))

# Train-test split
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

# Reshaping data for LSTM: (samples, lookback, 1) inputs, column-vector targets
X_train = X_train.reshape((-1, lookback, 1))
X_test = X_test.reshape((-1, lookback, 1))
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

# Convert to torch tensors
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).float()
X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).float()

# Create custom Dataset
class TimeSeriesDataset(Dataset):
    """Dataset view over parallel containers of input windows and targets."""

    def __init__(self, X, y):
        # Keep references only; the containers are neither copied nor checked.
        self.X, self.y = X, y

    def __len__(self):
        # One sample per entry of X.
        return len(self.X)

    def __getitem__(self, i):
        sample = (self.X[i], self.y[i])
        return sample

train_dataset, test_dataset = (
    TimeSeriesDataset(X_train, y_train),
    TimeSeriesDataset(X_test, y_test),
)

# DataLoader for batching: training batches are shuffled, the evaluation
# loader keeps the row order.
batch_size = 16
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# LSTM model
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        num_stacked_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per input sequence, shape (batch, 1)."""
        batch_size = x.size(0)
        # Allocate the zero initial states on the input's own device and dtype
        # rather than on a module-level `device` global, so the forward pass
        # keeps working wherever the model and its input actually live.
        state_shape = (self.num_stacked_layers, batch_size, self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        out, _ = self.lstm(x, (h0, c0))
        # Only the output of the final time step feeds the regression head.
        out = self.fc(out[:, -1, :])
        return out

# Initialize model
# One feature per time step, 4 hidden units, a single LSTM layer.
model = LSTM(input_size=1, hidden_size=4, num_stacked_layers=1).to(device)

# Optimisation settings: MSE on the scaled targets, optimised with Adam.
num_epochs = 10
learning_rate = 0.001
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

def train_one_epoch():
    """Run one optimisation pass over ``train_loader``.

    Works on the module-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``.

    Returns:
        The mean training loss per batch for this pass (0.0 if the loader
        yields no batches). The loss used to be accumulated and then thrown
        away; returning it lets callers log or compare it. Existing callers
        that ignore the return value are unaffected.
    """
    model.train(True)
    running_loss = 0.0
    num_batches = 0

    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        output = model(x_batch)
        loss = loss_function(output, y_batch)
        running_loss += loss.item()
        num_batches += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return running_loss / num_batches if num_batches else 0.0

def validate_one_epoch():
    """Print the mean per-batch loss of ``model`` over ``test_loader``.

    Puts the module-level ``model`` in evaluation mode and disables gradient
    tracking while the batches are scored.
    """
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            total_loss += loss_function(model(inputs), targets).item()

    avg_loss_across_batches = total_loss / len(test_loader)
    print(f'Val Loss: {avg_loss_across_batches:.3f}')

# Training loop
# One training pass followed by one validation pass per epoch;
# validate_one_epoch() prints the validation loss each time.
for epoch in range(num_epochs):
    train_one_epoch()
    validate_one_epoch()

# Prediction on test data
with torch.no_grad():
    test_predictions = model(X_test.to(device)).detach().cpu().numpy().flatten()

# Rescale the test predictions. The scaler was fitted on lookback + 1 columns
# with 'Close' in column 0, so the values go into column 0 of a zero matrix.
dummies = np.zeros((X_test.shape[0], lookback+1))
dummies[:, 0] = test_predictions
test_predictions_rescaled = dc(scaler.inverse_transform(dummies)[:, 0])

# Rescale actual test values, using a fresh zero matrix so that no values
# that were already inverse-transformed are fed through the scaler again.
dummies = np.zeros((X_test.shape[0], lookback+1))
dummies[:, 0] = y_test.flatten()
new_y_test_rescaled = dc(scaler.inverse_transform(dummies)[:, 0])

# Convert predictions and actual values into binary classes (Up/Down).
# Both are measured against the SAME reference, the previous day's actual
# close: 1 (Up) if the value is above it, else 0 (Down). Comparing the
# prediction with the same-day actual would only give the sign of the
# prediction error, not a predicted direction. The first test point has no
# previous close inside the test window, so both arrays start at the second.
previous_close = new_y_test_rescaled[:-1]
y_true_class = (new_y_test_rescaled[1:] > previous_close).astype(int)
y_pred_class = (test_predictions_rescaled[1:] > previous_close).astype(int)

# Calculate confusion matrix; labels are pinned so it is always 2x2 even if
# one class never occurs.
cm = confusion_matrix(y_true_class, y_pred_class, labels=[0, 1])

# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

# Calculate other metrics
mae_test = mean_absolute_error(new_y_test_rescaled, test_predictions_rescaled)
rmse_test = np.sqrt(mean_squared_error(new_y_test_rescaled, test_predictions_rescaled))
r2_test = r2_score(new_y_test_rescaled, test_predictions_rescaled)
print(f'Test Accuracy - MAE: {mae_test:.3f}, RMSE: {rmse_test:.3f}, R²: {r2_test:.3f}')


SYMBOLIC-REGRESSION-2(MORE ACCURATE)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.impute import SimpleImputer

# Small epsilon to prevent log(0)
epsilon = 1e-10

# Load dataset
data_path = r"E:/URP/thesis/train file-kuznet curve.csv"
df = pd.read_csv(data_path, encoding='latin1')

# Rows without a target value cannot be used; drop them and renumber the index.
target_column = "gdp"
df = df.dropna(subset=[target_column])
df = df.reset_index(drop=True)

# Integer-encode every text column (values are cast to str first, so missing
# entries become their own category).
label_encoder = LabelEncoder()
for text_column in df.select_dtypes(include=['object']):
    as_text = df[text_column].astype(str)
    df[text_column] = label_encoder.fit_transform(as_text)

# Every column other than the target is treated as a feature.
feature_columns = [name for name in df.columns if name != target_column]

# Fill missing feature values with the column mean.
imputer = SimpleImputer(strategy='mean')
features = df[feature_columns]
X = imputer.fit_transform(features)
y = df[target_column].values

# Conditional logarithmic transformation for features
def signed_log(x, eps=1e-10):
    """Element-wise log transform that accepts zero and negative inputs.

    x > 0 gives ln(x + eps), x < 0 gives -ln(|x| + eps), x == 0 gives 0.

    Args:
        x: scalar or array-like of numbers.
        eps: small positive offset added inside the logarithm. It is a
            parameter with a fixed default rather than the module-level
            ``epsilon``, so rebinding that global elsewhere cannot silently
            change this transform.

    Returns:
        A float ndarray with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    result = np.zeros_like(x)
    pos_mask = x > 0
    neg_mask = x < 0
    result[pos_mask] = np.log(x[pos_mask] + eps)
    # For negative entries -x equals |x|.
    result[neg_mask] = -np.log(-x[neg_mask] + eps)
    return result

# Move features and target into signed-log space.
ln_X = signed_log(X)
ln_y = signed_log(y).reshape(-1, 1)

# Create polynomial features
degree = 2  # Adjust degree as needed
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(ln_X)

# Fit linear regression on transformed data
model = LinearRegression()
model.fit(X_poly, ln_y.ravel())

# Extract coefficients and intercept
coefficients = model.coef_
intercept = model.intercept_

# Predict ln(y) on the same rows the model was fitted on
ln_y_pred = model.predict(X_poly)

# Calculate R-squared (in-sample, since prediction uses the training rows)
r2 = r2_score(ln_y, ln_y_pred)

# Generate dynamic equation. The left-hand side is labelled with
# target_column instead of a hard-coded name, so the printed equation stays
# correct if the target is changed.
feature_names = poly.get_feature_names_out([f"sln({col})" for col in feature_columns])
equation = f"sln({target_column}) = {intercept:.5f}" + "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)

# Print results
print("Equation in transformed space (signed-log with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.5f}")


In [None]:
import pandas as pd
import numpy as np

# Load test data
# NOTE(review): the variable is called test_df but the path points at a
# *train* CSV - confirm this is the intended file.
test_df = pd.read_csv('/kaggle/input/russian-car-train-s1/russian_car_train.csv')

# Load encoded mappings (original value -> integer code)
letters_table = pd.read_csv('/kaggle/input/encoded-letters-for-russian-car/encoded_letters.csv')
encoded_letters_dict = letters_table.set_index('original')['encoded'].to_dict()
region_table = pd.read_csv('/kaggle/input/unique-region-for-russian-cars/unique_region_name_encodings.csv')
encoded_home_dict = region_table.set_index('region_name')['region_name_encoded'].to_dict()

# Map categorical columns to encoded values; values missing from a mapping get 0
test_df['letters'] = test_df['letters'].apply(lambda raw: encoded_letters_dict.get(raw, 0))
test_df['region_name'] = test_df['region_name'].apply(lambda raw: encoded_home_dict.get(raw, 0))

# Convert boolean/int flags to int
bool_cols = ['is_repeating_letters', 'is_repeating_digits', 'is_palindrome_digits',
             'contains_lucky_digit_7', 'contains_lucky_digit_8', 'is_government_plate',
             'has_road_advantage', 'is_weekend']
test_df = test_df.astype({flag: int for flag in bool_cols})

# Replace zeros with epsilon for log safety, then make every column float
epsilon = 1e-9
test_df = test_df.replace(0, epsilon)
test_df = test_df.astype(float)


def safe_ln(x, epsilon=1e-10):
    """Return ln(|x| + epsilon), which stays finite for zero and negative x."""
    magnitude = np.abs(x)
    return np.log(magnitude + epsilon)

def calculate_plate_price(df):
  # Compute log-transformed features
  ln_digits = safe_ln(df['digits'])
  ln_region = safe_ln(df['region'])
  ln_letters = safe_ln(df['letters'])
  ln_region_name = safe_ln(df['region_name'])
  ln_is_government_plate = safe_ln(df['is_government_plate'])
  ln_has_road_advantage = safe_ln(df['has_road_advantage'])
  ln_gov_significance_score = safe_ln(df['gov_significance_score'])
  ln_is_repeating_letters = safe_ln(df['is_repeating_letters'])
  ln_is_repeating_digits = safe_ln(df['is_repeating_digits'])
  ln_is_palindrome_digits = safe_ln(df['is_palindrome_digits'])
  ln_contains_lucky_digit_7 = safe_ln(df['contains_lucky_digit_7'])
  ln_contains_lucky_digit_8 = safe_ln(df['contains_lucky_digit_8'])
  ln_year = safe_ln(df['year'])
  ln_month = safe_ln(df['month'])
  ln_day = safe_ln(df['day'])
  ln_weekday = safe_ln(df['weekday'])
  ln_is_weekend = safe_ln(df['is_weekend'])
  ln_hour = safe_ln(df['hour'])
  ln_minute = safe_ln(df['minute'])
  ln_second = safe_ln(df['second'])
  ln_plate_listing_count = safe_ln(df['plate_listing_count'])
    
  # Apply symbolic equation to compute predictions
  price = (9910345.44054 +
             445.22990 * ln_digits +
             1223.03444 * ln_region +
             8.39760 * ln_letters +
             -44.25892 * ln_region_name +
             21959.09750 * ln_is_government_plate +
             22112.47253 * ln_has_road_advantage +
             -2054.66321 * ln_gov_significance_score +
             7526.61748 * ln_is_repeating_letters +
             1493.58991 * ln_is_repeating_digits +
             82.20587 * ln_is_palindrome_digits +
             1595.16359 * ln_contains_lucky_digit_7 +
             1570.42135 * ln_contains_lucky_digit_8 +
             -2605741.06766 * ln_year +
             -390.45876 * ln_month +
             -86.99772 * ln_day +
             -8.20003 * ln_weekday +
             1648.11501 * ln_is_weekend +
             -38.43041 * ln_hour +
             -19.95811 * ln_minute +
             4.49435 * ln_second +
             -145.28560 * ln_plate_listing_count +
             0.07441 * ln_digits**2 +
             -0.01274 * ln_digits * ln_region +
             -0.00379 * ln_digits * ln_letters +
             -0.00053 * ln_digits * ln_region_name +
             0.05214 * ln_digits * ln_is_government_plate +
             -0.00534 * ln_digits * ln_has_road_advantage +
             -0.04696 * ln_digits * ln_gov_significance_score +
             -0.00900 * ln_digits * ln_is_repeating_letters +
             0.00301 * ln_digits * ln_is_repeating_digits +
             -0.00455 * ln_digits * ln_is_palindrome_digits +
             -0.00591 * ln_digits * ln_contains_lucky_digit_7 +
             -0.00280 * ln_digits * ln_contains_lucky_digit_8 +
             -58.62074 * ln_digits * ln_year +
             -0.00101 * ln_digits * ln_month +
             -0.00500 * ln_digits * ln_day +
             0.00004 * ln_digits * ln_weekday +
             0.00014 * ln_digits * ln_is_weekend +
             -0.00063 * ln_digits * ln_hour +
             0.00074 * ln_digits * ln_minute +
             0.00086 * ln_digits * ln_second +
             0.03658 * ln_digits * ln_plate_listing_count +
             -0.02610 * ln_region**2 +
             0.00203 * ln_region * ln_letters +
             -0.00453 * ln_region * ln_region_name +
             0.24540 * ln_region * ln_is_government_plate +
             0.03635 * ln_region * ln_has_road_advantage +
             -0.24897 * ln_region * ln_gov_significance_score +
             0.00853 * ln_region * ln_is_repeating_letters +
             0.00420 * ln_region * ln_is_repeating_digits +
             0.00383 * ln_region * ln_is_palindrome_digits +
             0.00218 * ln_region * ln_contains_lucky_digit_7 +
             -0.00060 * ln_region * ln_contains_lucky_digit_8 +
             -160.49811 * ln_region * ln_year +
             0.00178 * ln_region * ln_month +
             0.00317 * ln_region * ln_day +
             0.00016 * ln_region * ln_weekday +
             -0.00045 * ln_region * ln_is_weekend +
             0.00242 * ln_region * ln_hour +
             -0.00010 * ln_region * ln_minute +
             -0.00540 * ln_region * ln_second +
             -0.01395 * ln_region * ln_plate_listing_count +
             0.00162 * ln_letters**2 +
             0.00021 * ln_letters * ln_region_name +
             0.07941 * ln_letters * ln_is_government_plate +
             -0.03561 * ln_letters * ln_has_road_advantage +
             -0.03972 * ln_letters * ln_gov_significance_score +
             0.00407 * ln_letters * ln_is_repeating_letters +
             0.00160 * ln_letters * ln_is_repeating_digits +
             0.00019 * ln_letters * ln_is_palindrome_digits +
             -0.00001 * ln_letters * ln_contains_lucky_digit_7 +
             -0.00002 * ln_letters * ln_contains_lucky_digit_8 +
             -1.08391 * ln_letters * ln_year +
             0.00049 * ln_letters * ln_month +
             -0.00051 * ln_letters * ln_day +
             0.00011 * ln_letters * ln_weekday +
             -0.00023 * ln_letters * ln_is_weekend +
             -0.00026 * ln_letters * ln_hour +
             -0.00000 * ln_letters * ln_minute +
             0.00031 * ln_letters * ln_second +
             -0.00788 * ln_letters * ln_plate_listing_count +
             -0.00233 * ln_region_name**2 +
             0.16600 * ln_region_name * ln_is_government_plate +
             -0.03346 * ln_region_name * ln_has_road_advantage +
             -0.14930 * ln_region_name * ln_gov_significance_score +
             0.00044 * ln_region_name * ln_is_repeating_letters +
             -0.00062 * ln_region_name * ln_is_repeating_digits +
             -0.00001 * ln_region_name * ln_is_palindrome_digits +
             -0.00006 * ln_region_name * ln_contains_lucky_digit_7 +
             -0.00009 * ln_region_name * ln_contains_lucky_digit_8 +
             5.75453 * ln_region_name * ln_year +
             0.00164 * ln_region_name * ln_month +
             -0.00173 * ln_region_name * ln_day +
             0.00001 * ln_region_name * ln_weekday +
             -0.00002 * ln_region_name * ln_is_weekend +
             -0.01463 * ln_region_name * ln_hour +
             -0.00700 * ln_region_name * ln_minute +
             0.01981 * ln_region_name * ln_second +
             -0.00040 * ln_region_name * ln_plate_listing_count +
             -623.79252 * ln_is_government_plate**2 +
             93.11939 * ln_is_government_plate * ln_has_road_advantage +
             1392.28005 * ln_is_government_plate * ln_gov_significance_score +
             0.00097 * ln_is_government_plate * ln_is_repeating_letters +
             -0.03138 * ln_is_government_plate * ln_is_repeating_digits +
             0.00738 * ln_is_government_plate * ln_is_palindrome_digits +
             0.00400 * ln_is_government_plate * ln_contains_lucky_digit_7 +
             0.01188 * ln_is_government_plate * ln_contains_lucky_digit_8 +
             -272.01339 * ln_is_government_plate * ln_year +
             0.08776 * ln_is_government_plate * ln_month +
             -0.06306 * ln_is_government_plate * ln_day +
             0.00184 * ln_is_government_plate * ln_weekday +
             0.00368 * ln_is_government_plate * ln_is_weekend +
             -0.01281 * ln_is_government_plate * ln_hour +
             0.00070 * ln_is_government_plate * ln_minute +
             -0.00325 * ln_is_government_plate * ln_second +
             -0.26698 * ln_is_government_plate * ln_plate_listing_count +
             958.23524 * ln_has_road_advantage**2 +
             -0.10460 * ln_has_road_advantage * ln_gov_significance_score +
             -0.00099 * ln_has_road_advantage * ln_is_repeating_letters +
             -0.00117 * ln_has_road_advantage * ln_is_repeating_digits +
             0.00071 * ln_has_road_advantage * ln_is_palindrome_digits +
             -0.00023 * ln_has_road_advantage * ln_contains_lucky_digit_7 +
             0.00138 * ln_has_road_advantage * ln_contains_lucky_digit_8 +
             -6.29474 * ln_has_road_advantage * ln_year +
             -0.00390 * ln_has_road_advantage * ln_month +
             -0.00253 * ln_has_road_advantage * ln_day +
             -0.00098 * ln_has_road_advantage * ln_weekday +
             0.00103 * ln_has_road_advantage * ln_is_weekend +
             -0.00961 * ln_has_road_advantage * ln_hour +
             0.01802 * ln_has_road_advantage * ln_minute +
             -0.00964 * ln_has_road_advantage * ln_second +
             -0.05788 * ln_has_road_advantage * ln_plate_listing_count +
             2.14539 * ln_gov_significance_score**2 +
             -0.00177 * ln_gov_significance_score * ln_is_repeating_letters +
             0.02980 * ln_gov_significance_score * ln_is_repeating_digits +
             -0.00703 * ln_gov_significance_score * ln_is_palindrome_digits +
             -0.00385 * ln_gov_significance_score * ln_contains_lucky_digit_7 +
             -0.01227 * ln_gov_significance_score * ln_contains_lucky_digit_8 +
             269.57368 * ln_gov_significance_score * ln_year +
             -0.08170 * ln_gov_significance_score * ln_month +
             0.06395 * ln_gov_significance_score * ln_day +
             -0.00101 * ln_gov_significance_score * ln_weekday +
             -0.00425 * ln_gov_significance_score * ln_is_weekend +
             0.02166 * ln_gov_significance_score * ln_hour +
             -0.01772 * ln_gov_significance_score * ln_minute +
             0.01140 * ln_gov_significance_score * ln_second +
             0.29529 * ln_gov_significance_score * ln_plate_listing_count +
             331.83945 * ln_is_repeating_letters**2 +
             0.00119 * ln_is_repeating_letters * ln_is_repeating_digits +
             0.00145 * ln_is_repeating_letters * ln_is_palindrome_digits +
             0.00005 * ln_is_repeating_letters * ln_contains_lucky_digit_7 +
             -0.00002 * ln_is_repeating_letters * ln_contains_lucky_digit_8 +
             15.01581 * ln_is_repeating_letters * ln_year +
             0.00019 * ln_is_repeating_letters * ln_month +
             0.00037 * ln_is_repeating_letters * ln_day +
             -0.00012 * ln_is_repeating_letters * ln_weekday +
             -0.00010 * ln_is_repeating_letters * ln_is_weekend +
             -0.00013 * ln_is_repeating_letters * ln_hour +
             -0.00006 * ln_is_repeating_letters * ln_minute +
             -0.00005 * ln_is_repeating_letters * ln_second +
             -0.00959 * ln_is_repeating_letters * ln_plate_listing_count +
             70.24220 * ln_is_repeating_digits**2 +
             -60301.93418 * ln_is_repeating_digits * ln_is_palindrome_digits +
             0.00112 * ln_is_repeating_digits * ln_contains_lucky_digit_7 +
             0.00030 * ln_is_repeating_digits * ln_contains_lucky_digit_8 +
             16.26346 * ln_is_repeating_digits * ln_year +
             0.00205 * ln_is_repeating_digits * ln_month +
             -0.00033 * ln_is_repeating_digits * ln_day +
             -0.00004 * ln_is_repeating_digits * ln_weekday +
             0.00001 * ln_is_repeating_digits * ln_is_weekend +
             0.00027 * ln_is_repeating_digits * ln_hour +
             -0.00009 * ln_is_repeating_digits * ln_minute +
             -0.00004 * ln_is_repeating_digits * ln_second +
             -0.01162 * ln_is_repeating_digits * ln_plate_listing_count +
             60307.11675 * ln_is_palindrome_digits**2 +
             0.00045 * ln_is_palindrome_digits * ln_contains_lucky_digit_7 +
             0.00032 * ln_is_palindrome_digits * ln_contains_lucky_digit_8 +
             4.88521 * ln_is_palindrome_digits * ln_year +
             -0.00046 * ln_is_palindrome_digits * ln_month +
             -0.00033 * ln_is_palindrome_digits * ln_day +
             0.00007 * ln_is_palindrome_digits * ln_weekday +
             -0.00005 * ln_is_palindrome_digits * ln_is_weekend +
             -0.00009 * ln_is_palindrome_digits * ln_hour +
             -0.00013 * ln_is_palindrome_digits * ln_minute +
             -0.00015 * ln_is_palindrome_digits * ln_second +
             0.00269 * ln_is_palindrome_digits * ln_plate_listing_count +
             69.40251 * ln_contains_lucky_digit_7**2 +
             -0.00007 * ln_contains_lucky_digit_7 * ln_contains_lucky_digit_8 +
             0.38824 * ln_contains_lucky_digit_7 * ln_year +
             0.00007 * ln_contains_lucky_digit_7 * ln_month +
             -0.00081 * ln_contains_lucky_digit_7 * ln_day +
             0.00007 * ln_contains_lucky_digit_7 * ln_weekday +
             0.00001 * ln_contains_lucky_digit_7 * ln_is_weekend +
             -0.00004 * ln_contains_lucky_digit_7 * ln_hour +
             0.00002 * ln_contains_lucky_digit_7 * ln_minute +
             0.00024 * ln_contains_lucky_digit_7 * ln_second +
             -0.00396 * ln_contains_lucky_digit_7 * ln_plate_listing_count +
             67.56364 * ln_contains_lucky_digit_8**2 +
             -1.92583 * ln_contains_lucky_digit_8 * ln_year +
             -0.00036 * ln_contains_lucky_digit_8 * ln_month +
             0.00027 * ln_contains_lucky_digit_8 * ln_day +
             -0.00008 * ln_contains_lucky_digit_8 * ln_weekday +
             0.00002 * ln_contains_lucky_digit_8 * ln_is_weekend +
             -0.00051 * ln_contains_lucky_digit_8 * ln_hour +
             0.00051 * ln_contains_lucky_digit_8 * ln_minute +
             0.00006 * ln_contains_lucky_digit_8 * ln_second +
             -0.00328 * ln_contains_lucky_digit_8 * ln_plate_listing_count +
             171283.00356 * ln_year**2 +
             51.34567 * ln_year * ln_month +
             11.43708 * ln_year * ln_day +
             1.04852 * ln_year * ln_weekday +
             -1.54772 * ln_year * ln_is_weekend +
             5.05070 * ln_year * ln_hour +
             2.62844 * ln_year * ln_minute +
             -0.59840 * ln_year * ln_second +
             18.92864 * ln_year * ln_plate_listing_count +
             -0.05463 * ln_month**2 +
             -0.04369 * ln_month * ln_day +
             0.00043 * ln_month * ln_weekday +
             -0.00144 * ln_month * ln_is_weekend +
             0.00306 * ln_month * ln_hour +
             -0.00013 * ln_month * ln_minute +
             -0.00208 * ln_month * ln_second +
             -0.00143 * ln_month * ln_plate_listing_count +
             0.00044 * ln_day**2 +
             0.00117 * ln_day * ln_weekday +
             -0.00010 * ln_day * ln_is_weekend +
             -0.00070 * ln_day * ln_hour +
             0.00119 * ln_day * ln_minute +
             -0.00005 * ln_day * ln_second +
             -0.00430 * ln_day * ln_plate_listing_count +
             -0.00107 * ln_weekday**2 +
             -0.00782 * ln_weekday * ln_is_weekend +
             -0.00037 * ln_weekday * ln_hour +
             -0.00013 * ln_weekday * ln_minute +
             0.00036 * ln_weekday * ln_second +
             0.00124 * ln_weekday * ln_plate_listing_count +
             71.06380 * ln_is_weekend**2 +
             0.00013 * ln_is_weekend * ln_hour +
             0.00004 * ln_is_weekend * ln_minute +
             -0.00019 * ln_is_weekend * ln_second +
             0.00046 * ln_is_weekend * ln_plate_listing_count +
             -0.00020 * ln_hour**2 +
             0.00105 * ln_hour * ln_minute +
             -0.00101 * ln_hour * ln_second +
             -0.00597 * ln_hour * ln_plate_listing_count +
             -0.00002 * ln_minute**2 +
             0.00049 * ln_minute * ln_second +
             -0.00165 * ln_minute * ln_plate_listing_count +
             0.00068 * ln_second**2 +
             -0.00192 * ln_second * ln_plate_listing_count +
             0.02578 * ln_plate_listing_count**2)
  return np.abs(price)
  
# Score every test row with the pricing formula defined above.
test_df['price'] = calculate_plate_price(test_df)

# Prepare submission DataFrame.
# ids are consecutive integers starting at the hard-coded offset 51636.
submission_df = pd.DataFrame()
submission_df['id'] = range(51636, 51636 + len(test_df))
# Copy prices positionally. Assigning the Series itself would align on index
# labels and silently produce NaN whenever test_df does not carry the default
# 0..n-1 index (e.g. after filtering or setting an id index).
submission_df['price'] = test_df['price'].to_numpy()

# NOTE(review): the ".csv1" extension looks like a typo for "submission.csv";
# left unchanged because other tooling may read this exact name - confirm.
submission_df.to_csv("submission.csv1", index=False)
print(submission_df.head())

inference

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Toy data set: three integer predictors and a strictly negative response.
# NOTE: x1 and x2 hold identical values, so their log-features are collinear.
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 2, 3, 4])
x3 = np.array([1, 3, 2, 4])
y = np.array([-9, -8836, -1079, -67776])

# Use magnitudes so the logarithm below is defined (y is negative here).
x1_positive, x2_positive, x3_positive = (np.abs(v) for v in (x1, x2, x3))
y_positive = np.abs(y)

# Natural-log transform of every magnitude.
ln_x1, ln_x2, ln_x3 = (np.log(v) for v in (x1_positive, x2_positive, x3_positive))
ln_y = np.log(y_positive)

# vstack yields one feature per row; transpose to (samples, features).
X_log = np.vstack((ln_x1, ln_x2, ln_x3)).T

# Expand to every monomial up to `degree`, without a constant column.
degree = 2  # raise for a higher-order expansion
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X_log)

# Least-squares fit of ln|y| on the expanded features.
# NOTE: degree 2 over 3 features gives 9 columns for only 4 samples, so the
# system is underdetermined and the in-sample R-squared is not informative.
model = LinearRegression().fit(X_poly, ln_y)
intercept = model.intercept_
coefficients = model.coef_

# In-sample predictions and goodness of fit, both in log space.
ln_y_pred = model.predict(X_poly)
r2 = r2_score(ln_y, ln_y_pred)

# Render the fitted model as "ln([y]) = b0 + c1*term1 + ...".
feature_names = poly.get_feature_names_out(["ln(|x1|)", "ln(|x2|)", "ln(|x3|)"])
equation = f"ln([y]) = {intercept:.2f}" + "".join(
    f" + {coef:.2f}*{name}" for coef, name in zip(coefficients, feature_names)
)

print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.2f}")
#COLUMN-STACK:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Same toy data as the vstack variant; only the matrix assembly differs.
# NOTE: x1 and x2 are identical, so two of the log-features are collinear.
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 2, 3, 4])
x3 = np.array([1, 3, 2, 4])
y = np.array([-9, -8836, -1079, -67776])

# Magnitudes first, because log is undefined for the negative y values.
y_positive = np.abs(y)
x1_positive = np.abs(x1)
x2_positive = np.abs(x2)
x3_positive = np.abs(x3)

# Natural logs of target and predictors.
ln_y = np.log(y_positive)
ln_x1, ln_x2, ln_x3 = map(np.log, (x1_positive, x2_positive, x3_positive))

# column_stack places each 1-D array in its own column: (samples, features).
X_log = np.column_stack((ln_x1, ln_x2, ln_x3))

# Polynomial expansion of the log-features (no constant column).
degree = 2  # raise for a higher-order expansion
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X_log)

# Linear regression in the expanded space.
# NOTE: 9 polynomial columns versus 4 samples - the fit is underdetermined,
# so the R-squared printed below is in-sample only.
model = LinearRegression().fit(X_poly, ln_y)
coefficients, intercept = model.coef_, model.intercept_

# Predicted ln(y) on the training rows and the corresponding R-squared.
ln_y_pred = model.predict(X_poly)
r2 = r2_score(ln_y, ln_y_pred)

# Build the human-readable equation, one " + coef*term" chunk per feature.
feature_names = poly.get_feature_names_out(["ln(|x1|)", "ln(|x2|)", "ln(|x3|)"])
terms = [f" + {coef:.2f}*{name}" for coef, name in zip(coefficients, feature_names)]
equation = f"ln([y]) = {intercept:.2f}" + "".join(terms)

print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.2f}")
#2D Array
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Toy data (x1 and x2 are identical; y is strictly negative).
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 2, 3, 4])
x3 = np.array([1, 3, 2, 4])
y = np.array([-9, -8836, -1079, -67776])

# Magnitudes plus a tiny offset so log never sees an exact zero.
x1_positive = np.abs(x1) + 1e-10
x2_positive = np.abs(x2) + 1e-10
x3_positive = np.abs(x3) + 1e-10
y_positive = np.abs(y) + 1e-10

# Log-transform and keep everything as explicit column vectors of shape (n, 1).
ln_x1 = np.log(x1_positive)[:, np.newaxis]
ln_x2 = np.log(x2_positive)[:, np.newaxis]
ln_x3 = np.log(x3_positive)[:, np.newaxis]
ln_y = np.log(y_positive)[:, np.newaxis]

# Place the three column vectors side by side: (samples, features).
X_log = np.concatenate((ln_x1, ln_x2, ln_x3), axis=1)

# Polynomial expansion of the log-features (no constant column).
degree = 2  # raise for a higher-order expansion
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X_log)

# Regress on a flat 1-D target so the coefficients come back as a 1-D vector.
# NOTE: 9 polynomial columns versus 4 samples - underdetermined fit.
model = LinearRegression().fit(X_poly, ln_y[:, 0])
intercept = model.intercept_
coefficients = model.coef_

# In-sample prediction and R-squared in log space.
ln_y_pred = model.predict(X_poly)
r2 = r2_score(ln_y, ln_y_pred)

# Assemble the printable equation.
feature_names = poly.get_feature_names_out(["ln(|x1|)", "ln(|x2|)", "ln(|x3|)"])
equation = "".join(
    [f"ln([y]) = {intercept:.2f}"]
    + [f" + {coef:.2f}*{name}" for coef, name in zip(coefficients, feature_names)]
)

# Report
print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.2f}")
#1 feature
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Three observations of a single predictor.
x1 = np.array([30, 45, 60])
y = np.array([0.5, 0.71, 0.867])

# Magnitudes plus a tiny offset to keep the logarithm finite.
x1_positive = np.abs(x1) + 1e-10
y_positive = np.abs(y) + 1e-10

# Log-log space.
ln_x1 = np.log(x1_positive)
ln_y = np.log(y_positive)

# PolynomialFeatures needs a 2-D input: one column, one row per sample.
X_log = ln_x1[:, np.newaxis]

# Expand ln(x1) into its powers up to `degree` (no constant column).
degree = 2  # raise for a higher-order expansion
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X_log)

# Least-squares fit.
# NOTE: 2 powers plus an intercept on 3 samples leaves no residual degrees of
# freedom, so the fit interpolates the data.
model = LinearRegression().fit(X_poly, ln_y)
coefficients, intercept = model.coef_, model.intercept_

# In-sample prediction and R-squared in log space.
ln_y_pred = model.predict(X_poly)
r2 = r2_score(ln_y, ln_y_pred)

# Printable equation: intercept followed by one chunk per polynomial term.
feature_names = poly.get_feature_names_out(["ln(|x1|)"])
equation = f"ln([y]) = {intercept:.2f}" + "".join(
    f" + {coef:.2f}*{name}" for coef, name in zip(coefficients, feature_names)
)

print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.2f}")
#Linear-Transformation
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

# Three sample points to fit with a straight line.
# NOTE(review): these look like the natural logs of the x1/y values from the
# single-feature cell (30, 45, 60 / 0.5, 0.71, 0.867) - confirm.
x = np.array([3.4, 3.81, 4.1])
y = np.array([-0.69, -0.34, -0.143])

# Ordinary least-squares line through the points.
result = linregress(x, y)
slope, intercept = result.slope, result.intercept
r_value, p_value, std_err = result.rvalue, result.pvalue, result.stderr
y_fit = intercept + slope * x

# Draw the raw points (solid blue) and the fitted line (dashed red).
plt.figure(figsize=(8, 6))
plt.plot(x, y, color='blue', marker='o', linestyle='-', label='Original Data')
fit_label = f'Linear Fit: y={slope:.2f}x+{intercept:.2f}'
plt.plot(x, y_fit, color='red', linestyle='--', label=fit_label)

# Decorations: grid, legend, axis labels and title.
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=12)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Plot of Given x and y Values with Linear Fit', fontsize=14)

plt.show()

# Text summary of the fit.
print(f"Linear Equation: y = {slope:.2f}x + {intercept:.2f}")
print(f"R-squared: {r_value**2:.2f}")
#Multi-feature fraction symbolic Regression
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Toy data (x1 and x2 are identical; y is strictly negative).
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 2, 3, 4])
x3 = np.array([1, 3, 2, 4])
y = np.array([-9, -8836, -1079, -67776])

# Magnitudes plus a tiny offset so the logarithm stays finite.
x1_positive = np.abs(x1) + 1e-10
x2_positive = np.abs(x2) + 1e-10
x3_positive = np.abs(x3) + 1e-10
y_positive = np.abs(y) + 1e-10

# Log-transform; every array becomes a column vector of shape (n, 1).
ln_x1 = np.log(x1_positive)[:, np.newaxis]
ln_x2 = np.log(x2_positive)[:, np.newaxis]
ln_x3 = np.log(x3_positive)[:, np.newaxis]
ln_y = np.log(y_positive)[:, np.newaxis]

# Collect the per-power columns and their labels, then stack once at the end.
# No cross terms are generated: each log-feature is only raised to a power.
feature_blocks = []
feature_names = []

# Integer powers 1..3 of each log-feature.
for degree in [1, 2, 3]:
    feature_blocks += [ln_x1**degree, ln_x2**degree, ln_x3**degree]
    feature_names += [f"ln(|x{idx}|)^{degree}" for idx in (1, 2, 3)]

# One extra, non-integer power.
# NOTE: a fractional power of a negative log (|x| < 1) would be NaN; every
# |x| here is >= 1, so the logs are non-negative.
fractional_degree = 3.5
feature_blocks += [
    ln_x1**fractional_degree,
    ln_x2**fractional_degree,
    ln_x3**fractional_degree,
]
feature_names += [f"ln(|x{idx}|)^{fractional_degree}" for idx in (1, 2, 3)]

X_custom = np.hstack(feature_blocks)

# Least-squares fit on the hand-built features (flat 1-D target).
# NOTE: 12 feature columns versus 4 samples - the fit is underdetermined.
model = LinearRegression().fit(X_custom, ln_y[:, 0])
intercept = model.intercept_
coefficients = model.coef_

# In-sample prediction and R-squared in log space.
ln_y_pred = model.predict(X_custom)
r2 = r2_score(ln_y, ln_y_pred)

# Printable equation.
equation = f"ln([y]) = {intercept:.2f}" + "".join(
    f" + {coef:.2f}*{name}" for coef, name in zip(coefficients, feature_names)
)

# Report
print("Equation in transformed space (with fractional polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.2f}")
#single feature fraction symbolic regression
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Five observations of a single predictor.
x1 = np.array([1, 5, 7,11,20])
y = np.array([3, 37, 135,2059,1048596])

# Magnitudes plus a tiny offset so the logarithm stays finite.
x1_positive = np.abs(x1) + 1e-10
y_positive = np.abs(y) + 1e-10

# Log-log space.
ln_x1 = np.log(x1_positive)
ln_y = np.log(y_positive)

# Powers of ln(x1) used as regressors: the integers 1..4 plus one
# non-integer exponent (currently 4.3).
fractional_degree = 4.3
exponents = (1, 2, 3, 4, fractional_degree)

# One column per exponent, in the order listed above.
X_custom = np.column_stack([ln_x1**power for power in exponents])

# Labels matching the columns of X_custom.
feature_names = [f"ln(|x1|)^{power}" for power in exponents]

# Least-squares fit on the hand-built features.
# NOTE: 5 columns plus an intercept on 5 samples - the fit is underdetermined.
model = LinearRegression().fit(X_custom, ln_y)
coefficients, intercept = model.coef_, model.intercept_

# In-sample prediction and R-squared in log space.
ln_y_pred = model.predict(X_custom)
r2 = r2_score(ln_y, ln_y_pred)

# Printable equation.
equation = f"ln([y]) = {intercept:.2f}" + "".join(
    f" + {coef:.2f}*{name}" for coef, name in zip(coefficients, feature_names)
)

# Report
print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.2f}")
#Number of symbolic regression
import numpy as np
from scipy.optimize import fsolve

# Residual function handed to the root finder.
def equation(N, x1, y):
    """Return ln(y) minus the model value for every (x1, y) pair.

    The model is
    ``8.32541478 - 23.19830324*L + 23.86326923*L**2 - 9.14175787*L**N[0]``
    with ``L = ln(|x1|)``.

    Parameters:
        N: indexable whose first element is the unknown exponent.
        x1: iterable of predictor values.
        y: iterable of target values, paired with ``x1`` by position
           (extra items in the longer iterable are ignored).

    Returns:
        1-D NumPy array with one residual per pair.
    """
    def _residual(xi, yi):
        # L = ln|x|; the model is a fixed quadratic in L plus the L**N term.
        ln_abs_x = np.log(np.abs(xi))
        model_value = (
            8.32541478
            + -23.19830324 * ln_abs_x ** 1
            + 23.86326923 * ln_abs_x ** 2
            + -9.14175787 * ln_abs_x ** N[0]
        )
        return np.log(yi) - model_value

    return np.array([_residual(xi, yi) for xi, yi in zip(x1, y)])

# Example usage: a single (x1, y) observation.
# fsolve needs as many equations as unknowns, so with one unknown exponent N
# exactly one pair can be supplied.
x1 = [69]
y = [325188814]

# Solve equation(N, x1, y) == 0 for N, starting the search at N = 1.
N_initial_guess = [1.0]
# full_output=True exposes the status flag: ier == 1 means a solution was
# found, otherwise mesg explains why the solver stopped.
N_solution, _info, ier, mesg = fsolve(
    equation, x0=N_initial_guess, args=(x1, y), full_output=True
)

# Output the solution for N, flagging a non-converged result.
if ier != 1:
    print(f"Warning: fsolve did not converge: {mesg}")
print(f"The value of N that satisfies the given (x1, y) pair is approximately: {N_solution[0]}")



Symbolic-Regression


In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import os

# Location of the input CSV.
file_path = os.path.join("E:/LHC-CERN/Z_boson.csv")

# Read the whole table into memory.
df = pd.read_csv(file_path)

# Row selectors for the two values of the 'class' column used below.
is_zee = df['class'] == 'Zee'
is_zmumu = df['class'] == 'Zmumu'

# Quick look at the table: contents, column types, shape, per-class rows.
print(df)
print(df.dtypes)
print(df.shape)
print(df[is_zee])
print(df[is_zmumu])

# pt1 values split by class.
pt1_class_zmumu = df.loc[is_zmumu, 'pt1']
pt1_class_zee = df.loc[is_zee, 'pt1']

# Side-by-side histogram of pt1 for the two classes (20 shared bins).
plt.figure(figsize=(8,6))
plt.hist(
    [pt1_class_zmumu, pt1_class_zee],
    bins=20,
    rwidth=0.95,
    color=['green','red'],
    label=['Class=Zmumu', 'Class=Zee'],
)

# Axis labels, title and legend.
plt.xlabel("pt1")
plt.ylabel("Number Of Class")
plt.title("Class Prediction Visualization")
plt.legend()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Location of the input CSV.
file_path = os.path.join("E:/LHC-CERN/Z_boson.csv")

# Read the whole table into memory.
df = pd.read_csv(file_path)

# pt1 values split by the 'class' column.
is_zmumu = df['class'] == 'Zmumu'
is_zee = df['class'] == 'Zee'
pt1_class_zmumu = df.loc[is_zmumu, 'pt1']
pt1_class_zee = df.loc[is_zee, 'pt1']

# Overlaid histograms. The first call chooses 20 bin edges; the second call
# reuses those edges so the two count arrays are directly comparable.
plt.figure(figsize=(8, 6))
shared_style = dict(rwidth=0.95, alpha=0.7, density=False)
n_zmumu, bins, _ = plt.hist(
    pt1_class_zmumu, bins=20, color='green', label='Class=Zmumu', **shared_style
)
n_zee, _, _ = plt.hist(
    pt1_class_zee, bins=bins, color='red', label='Class=Zee', **shared_style
)

# Axis labels, title and legend.
plt.xlabel("pt1")
plt.ylabel("Number of Class")
plt.title("Class Prediction Visualization")
plt.legend()
plt.show()

# Midpoint of every bin (average of its left and right edge).
bin_centers = (bins[1:] + bins[:-1]) / 2

# One row per bin: its center and the count of each class.
data = pd.DataFrame({
    'pt1_bin_center': bin_centers,
    'Zmumu_counts': n_zmumu,
    'Zee_counts': n_zee,
})

# Keep only bins in which both classes have at least one entry.
both_populated = (data['Zmumu_counts'] > 0) & (data['Zee_counts'] > 0)
filtered_data = data[both_populated]

# Print the surviving bins as flat lists.
print("\nFiltered pt1_bin_center:", list(filtered_data['pt1_bin_center']))
print("Filtered Zmumu_counts:   ", list(filtered_data['Zmumu_counts']))
print("Filtered Zee_counts:     ", list(filtered_data['Zee_counts']))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Small additive constant so bins with a zero count do not produce log(0).
epsilon = 1e-2  

# Bin centers of the single predictor (20 bins).
# NOTE(review): the names say "tenure"/"churn", but these values look like the
# pt1 bin centers and per-class counts produced by the histogram cell above -
# confirm the intended labels.
tenure_bins = np.array([10.1041075, 23.3849225, 36.6657375, 49.946552499999996, 63.2273675, 76.5081825, 89.7889975, 
                     103.06981249999998, 116.3506275, 129.6314425, 142.91225749999995, 156.19307249999997, 169.4738875, 
                     182.75470249999995, 196.03551749999997, 209.3163325, 222.59714749999995, 235.87796249999997, 
                     249.15877749999999, 262.4395925])

# Per-bin counts of the two outcomes (several bins are empty for one or both).
churn_yes_counts = np.array([592, 1870, 4286, 2632, 387, 112, 69, 26, 8, 6, 4, 2, 2, 1, 2, 0, 0, 0, 0, 1])
churn_no_counts = np.array([0, 1877, 5033, 2455, 421, 133, 45, 20, 8, 1, 3, 1, 0, 2, 0, 0, 0, 0, 1, 0])

# Smoothed log counts; epsilon keeps zero counts finite.
ln_churn_yes = np.log(churn_yes_counts + epsilon)
ln_churn_no = np.log(churn_no_counts + epsilon)

# Log-odds of "yes" versus "no" per bin (0 for bins where both counts are 0).
log_odds = ln_churn_yes - ln_churn_no

# The predictor is used on a log scale as well.
ln_tenure = np.log(tenure_bins + epsilon)

# sklearn expects a 2-D (samples, features) matrix.
X_data = ln_tenure.reshape(-1, 1)

# Polynomial expansion of ln(tenure).
# NOTE(review): degree 301 yields 301 columns for only 20 bins, so the fit is
# massively underdetermined and numerically ill-conditioned; the R-squared
# below is in-sample only. Consider a small degree (e.g. 2-4).
degree = 301  # Adjust the degree (too high can overfit)
poly = PolynomialFeatures(degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Least-squares fit of the log-odds on the polynomial features.
model = LinearRegression()
model.fit(X_poly, log_odds)

# Fitted polynomial coefficients and intercept.
params = model.coef_
intercept = model.intercept_

# Dense grid over the observed ln(tenure) range for drawing the fitted curve.
tenure_fit = np.linspace(ln_tenure.min(), ln_tenure.max(), 100).reshape(-1, 1)

# Evaluate the model on the grid.
X_fit_poly = poly.transform(tenure_fit)
y_fit_log_odds = model.predict(X_fit_poly)

# Map log-odds to probabilities with the logistic function.
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Empirical probability is only defined where a bin holds at least one
# observation; dividing in empty bins would be 0/0 (NaN plus a RuntimeWarning).
total_counts = churn_yes_counts + churn_no_counts
has_data = total_counts > 0
actual_prob = churn_yes_counts[has_data] / total_counts[has_data]

# Left panel: log-odds, data versus fit.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(ln_tenure, log_odds, color='blue', label="Log-Odds (Actual)")
plt.plot(tenure_fit, y_fit_log_odds, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn")
plt.legend()

# Right panel: probability, data (populated bins only) versus fit.
plt.subplot(1, 2, 2)
plt.scatter(ln_tenure[has_data], actual_prob, color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, y_fit_prob, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability")
plt.legend()

plt.show()

# Print the fitted model as a logistic equation; sklearn separates factors of a
# product term with a space, which is turned into an explicit " * ".
equation_terms = " + ".join([
    f"{coef:.6f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, poly.get_feature_names_out(['ln_tenure']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.6f} + {equation_terms})))")

# In-sample R-squared of the log-odds fit.
log_odds_pred = model.predict(X_poly)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Small additive constant to prevent log(0).
epsilon = 1e-10

# Paired bin centers: the i-th tenure value goes with the i-th charge value.
tenure_bins = np.array([5, 15, 25, 35, 45, 55, 65])  # 7 tenure bins
monthly_charges_bins = np.array([20, 40, 60, 80, 100, 110, 120])  # 7 monthly charge bins

# Per-bin counts of the two outcomes (same length as the bin arrays).
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])  # 7 values
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])  # 7 values

# Log counts (epsilon keeps a zero count finite).
ln_churn_yes = np.log(np.abs(churn_yes_counts) + epsilon)
ln_churn_no = np.log(np.abs(churn_no_counts) + epsilon)

# Log-odds of "yes" versus "no" per bin.
log_odds = ln_churn_yes - ln_churn_no

# Both predictors are used on a log scale.
ln_tenure = np.log(np.abs(tenure_bins) + epsilon)
ln_charges = np.log(np.abs(monthly_charges_bins) + epsilon)

# (samples, features) matrix: column 0 = ln tenure, column 1 = ln charges.
X_data = np.vstack((ln_tenure, ln_charges)).T

# Degree-2 polynomial expansion (5 columns, no constant column).
degree = 2  # Set polynomial degree
poly = PolynomialFeatures(degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Least-squares fit of the log-odds on the polynomial features.
model = LinearRegression()
model.fit(X_poly, log_odds)

# Fitted polynomial coefficients and intercept.
params = model.coef_
intercept = model.intercept_

# Full 100 x 100 grid over both predictors and the model evaluated on it.
tenure_fit = np.linspace(ln_tenure.min(), ln_tenure.max(), 100)
charges_fit = np.linspace(ln_charges.min(), ln_charges.max(), 100)
X_fit = np.array(np.meshgrid(tenure_fit, charges_fit)).T.reshape(-1, 2)

X_fit_poly = poly.transform(X_fit)
y_fit_log_odds = model.predict(X_fit_poly)

# Map grid log-odds to probabilities with the logistic function.
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Curve for the 1-D plots. The first 100 rows of the flattened grid hold
# tenure fixed at its minimum while charges vary, so they must not be drawn
# against tenure_fit. Instead the model is evaluated along the path traced by
# the data: for each tenure on the curve, charges are interpolated from the
# observed (ln_tenure, ln_charges) pairs. np.interp needs increasing
# x-coordinates, which holds because tenure_bins is sorted ascending.
charges_along_tenure = np.interp(tenure_fit, ln_tenure, ln_charges)
X_curve = np.column_stack((tenure_fit, charges_along_tenure))
curve_log_odds = model.predict(poly.transform(X_curve))
curve_prob = 1 / (1 + np.exp(-curve_log_odds))

# Left panel: log-odds, data versus fit.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(ln_tenure, log_odds, color='blue', label="Log-Odds (Actual)")
plt.plot(tenure_fit, curve_log_odds, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn (Log-Transformed Features & Churn)")
plt.legend()

# Right panel: probability, data versus fit.
plt.subplot(1, 2, 2)
plt.scatter(ln_tenure, churn_yes_counts / (churn_yes_counts + churn_no_counts), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, curve_prob, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability (Log-Transformed Features & Churn)")
plt.legend()

plt.show()

# Print the fitted model as a logistic equation; sklearn separates factors of a
# product term with a space, which is turned into an explicit " * ".
equation_terms = " + ".join([
    f"{coef:.5f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, poly.get_feature_names_out(['ln_[tenure]', 'ln_[charges]']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.5f} + {equation_terms})))")

# In-sample R-squared of the log-odds fit.
log_odds_pred = model.predict(X_poly)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")


Main Logistic

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Offset added before every logarithm so an exact zero stays finite.
epsilon = 1e-10

# Read the input table (latin1-encoded CSV).
df = pd.read_csv("E:/AIRCRAFT/Aircraft Engines.csv", encoding='latin1')

# Pass 1: make every object-typed column hold plain strings.
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].astype(str)

# Pass 2: replace each object-typed column by integer codes. One encoder
# instance is re-fitted per column, so afterwards it only remembers the
# classes of the last column it processed.
label_encoder = LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    df[column] = label_encoder.fit_transform(df[column])

# The target is named explicitly; every other column is a feature.
target_column = "power"
feature_columns = [name for name in df.columns if name != target_column]

# Plain NumPy views of features and target.
X = df.loc[:, feature_columns].to_numpy()
y = df[target_column].to_numpy()

# Magnitudes plus epsilon, applied element-wise to target and all features.
X_positive = np.abs(X) + epsilon
y_positive = np.abs(y) + epsilon

# Log-transform; the target is kept as a column vector of shape (n, 1).
ln_X = np.log(X_positive)
ln_y = np.log(y_positive).reshape(-1, 1)

# Polynomial expansion of the log-features (no constant column).
# NOTE: the number of generated columns grows combinatorially with both the
# degree and the number of feature columns.
degree = 7  # raise or lower to change the order of the expansion
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(ln_X)

# Least-squares fit on a flat 1-D target.
model = LinearRegression().fit(X_poly, ln_y.ravel())
intercept = model.intercept_
coefficients = model.coef_

# In-sample prediction and R-squared in log space.
ln_y_pred = model.predict(X_poly)
r2 = r2_score(ln_y, ln_y_pred)

# Printable equation with one term per generated polynomial feature.
feature_names = poly.get_feature_names_out([f"ln(|{col}|)" for col in feature_columns])
equation = f"ln(y) = {intercept:.5f}" + "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)

# Report
print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.5f}")


CSV SYMBOLIC

In [None]:
#@title polynomial to logistic 
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

# Cubic polynomial used as the curve_fit model.
def poly_func(x, a, b, c, d):
    """Evaluate ``a*x**3 + b*x**2 + c*x + d``.

    Parameters:
        x: scalar or NumPy array of evaluation points.
        a, b, c, d: coefficients, from the cubic term down to the constant.

    Returns:
        The polynomial value(s), with the same shape as ``x``.
    """
    cubic_term = a * x**3
    quadratic_term = b * x**2
    linear_term = c * x
    return cubic_term + quadratic_term + linear_term + d

# Logistic (sigmoid) of the cubic polynomial a*x^3 + b*x^2 + c*x + d.
def logistic_func(x, a, b, c, d):
    """Return ``1 / (1 + exp(-(a*x**3 + b*x**2 + c*x + d)))``.

    The sigmoid is evaluated in an overflow-free form: ``exp`` is only ever
    called on a non-positive argument, so strongly negative polynomial values
    give 0.0 without the "overflow encountered in exp" RuntimeWarning that the
    direct formula triggers.

    Parameters:
        x: scalar or NumPy array of evaluation points.
        a, b, c, d: cubic coefficients, highest power first.

    Returns:
        Probability value(s) in [0, 1]; a NumPy scalar for scalar ``x``,
        otherwise an array shaped like ``x``.
    """
    z = np.asarray(a*x**3 + b*x**2 + c*x + d, dtype=float)
    decay = np.exp(-np.abs(z))  # always in [0, 1], cannot overflow
    # z >= 0: 1/(1+e^-z);  z < 0: e^z/(1+e^z) - the same function, rewritten.
    prob = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return prob[()]  # unwrap 0-d arrays so scalar inputs return a scalar

# Tenure bin midpoints with the churned / retained customer counts observed
# in each bin; the three arrays are index-aligned (7 entries each).
bin_centers = np.array([5, 15, 25, 35, 45, 55, 65])
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])

# Empirical log-odds of churn per bin: ln(yes / no).
log_odds = np.log(churn_yes_counts / churn_no_counts)

# Least-squares cubic through the log-odds; the covariance matrix that
# curve_fit also returns is not needed here.
params, _ = curve_fit(poly_func, bin_centers, log_odds)

# Evaluate the fit on a dense tenure grid, once as log-odds and once mapped
# through the logistic function to a probability.
x_fit = np.linspace(0, 70, 100)
y_fit_log_odds = poly_func(x_fit, *params)
y_fit_prob = logistic_func(x_fit, *params)

# Two panels side by side: log-odds on the left, probability on the right.
fig, (ax_log_odds, ax_prob) = plt.subplots(1, 2, figsize=(12, 5))

ax_log_odds.scatter(bin_centers, log_odds, color='blue', label="Log-Odds (Actual)")
ax_log_odds.plot(x_fit, y_fit_log_odds, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
ax_log_odds.set_xlabel("Tenure")
ax_log_odds.set_ylabel("Log-Odds of Churn")
ax_log_odds.set_title("Polynomial Fit to Log-Odds of Churn")
ax_log_odds.legend()

# Observed churn fraction per bin: yes / (yes + no).
observed_prob = churn_yes_counts / (churn_yes_counts + churn_no_counts)
ax_prob.scatter(bin_centers, observed_prob, color='green', label="Churn Probability (Actual)")
ax_prob.plot(x_fit, y_fit_prob, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
ax_prob.set_xlabel("Tenure")
ax_prob.set_ylabel("Probability of Churn=Yes")
ax_prob.set_title("Logistic Curve Fit for Churn Probability")
ax_prob.legend()

plt.show()

# Report the fitted model as a closed-form logistic equation.
a, b, c, d = params
print("Final Logistic Equation:")
print(f"P(Churn=Yes) = 1 / (1 + e^(-({a:.5f}x³ + {b:.5f}x² + {c:.5f}x + {d:.5f})))")

# Goodness of fit is measured in log-odds space, at the bin centers only.
log_odds_pred = poly_func(bin_centers, *params)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")


polynomial to logistic

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

def poly_func(X, a, b, c, d, e, f):
    """Evaluate a quadratic surface in tenure and monthly charges.

    ``X`` is a pair ``(tenure, charges)`` of scalars or equally shaped arrays.
    The model is
    ``a*tenure**2 + b*tenure + c*charges**2 + d*charges + e*tenure*charges + f``.
    Every monomial appears exactly once so that each coefficient is
    individually identifiable by the least-squares fit.
    """
    tenure, charges = X
    return (a * tenure**2 + b * tenure + c * charges**2 + d * charges
            + e * tenure * charges + f)


def logistic_func(X, a, b, c, d, e, f):
    """Map the quadratic log-odds surface ``poly_func(X, ...)`` to a probability."""
    return 1 / (1 + np.exp(-(poly_func(X, a, b, c, d, e, f))))


# Sample bin centers for Tenure and Monthly Charges (index-aligned, 7 each).
# NOTE(review): the first five points satisfy charges = 2*tenure + 10, so the
# quadratic terms are strongly correlated on this sample; curve_fit may warn
# that the covariance cannot be estimated and individual coefficients should
# be read with caution.
tenure_bins = np.array([5, 15, 25, 35, 45, 55, 65])  # 7 tenure bins
monthly_charges_bins = np.array([20, 40, 60, 80, 100, 110, 120])  # 7 monthly charge bins

# Churn=Yes and Churn=No counts (must match bin lengths)
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])  # 7 values
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])  # 7 values

# Compute log-odds: log(Churn Yes / Churn No)
log_odds = np.log(churn_yes_counts / churn_no_counts)

# Fit the surface to the log-odds data. curve_fit passes X_data straight to
# poly_func, which unpacks its two rows as (tenure, charges).
X_data = np.vstack((tenure_bins, monthly_charges_bins))
params, _ = curve_fit(poly_func, X_data, log_odds)

# Evaluate the fitted surface on a 100 x 100 grid. With meshgrid's default
# 'xy' indexing, tenure varies along axis 1 and charges along axis 0.
tenure_fit = np.linspace(0, 70, 100)
charges_fit = np.linspace(20, 120, 100)
X_fit = np.meshgrid(tenure_fit, charges_fit)
y_fit_log_odds = poly_func((X_fit[0], X_fit[1]), *params)

# Convert to probability using logistic function
y_fit_prob = logistic_func((X_fit[0], X_fit[1]), *params)

# Plot log-odds fit
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(tenure_bins, log_odds, color='blue', label="Log-Odds (Actual)")
# mean(axis=0) averages over the charges grid, leaving one value per tenure.
plt.plot(tenure_fit, y_fit_log_odds.mean(axis=0), color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("Tenure")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn")
plt.legend()

# Plot logistic probability fit
plt.subplot(1, 2, 2)
plt.scatter(tenure_bins, churn_yes_counts / (churn_yes_counts + churn_no_counts), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, y_fit_prob.mean(axis=0), color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("Tenure")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability")
plt.legend()

plt.show()

# Print the final logistic equation
a, b, c, d, e, f = params
print("Final Logistic Equation:")
print(f"P(Churn=Yes) = 1 / (1 + e^(-({a:.5f} * tenure² + {b:.5f} * tenure + {c:.5f} * charges² + {d:.5f} * charges + {e:.5f} * tenure * charges + {f:.5f})))")

# Compute R² Score for the polynomial fit (log-odds space, at the bin centers)
log_odds_pred = poly_func((tenure_bins, monthly_charges_bins), *params)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")


polynomial to logistic multifeature

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Maximum polynomial degree of the log-odds model (adjustable).
max_poly_degree = 2

# Sample bin centers for Tenure and Monthly Charges (index-aligned, 7 each)
tenure_bins = np.array([5, 15, 25, 35, 45, 55, 65])  # 7 tenure bins
monthly_charges_bins = np.array([20, 40, 60, 80, 100, 110, 120])  # 7 monthly charge bins

# Churn=Yes and Churn=No counts (must match bin lengths)
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])  # 7 values
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])  # 7 values

# Compute log-odds: log(Churn Yes / Churn No)
log_odds = np.log(churn_yes_counts / churn_no_counts)

# Expand (tenure, charges) into polynomial terms without a bias column.
X_data = np.vstack((tenure_bins, monthly_charges_bins)).T  # Shape (7,2)
poly = PolynomialFeatures(degree=max_poly_degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Choose the estimator by comparing coefficient count with data-point count.
num_params = X_poly.shape[1]  # Number of polynomial terms
num_data_points = len(log_odds)

if num_params > num_data_points:
    print(f"⚠️ Too many parameters ({num_params}) for {num_data_points} data points! Using Ridge regression instead.")
    model = Ridge(alpha=0.1)  # Regularization
    model.fit(X_poly, log_odds)
    params = model.coef_
    # Ridge fits an intercept by default; it must be carried into every
    # prediction, otherwise the curve, the R² and the equation are all wrong.
    intercept = model.intercept_
else:
    # Plain least squares on the bias-free features, i.e. a fit through the
    # origin, so the intercept is exactly zero.
    params = np.linalg.lstsq(X_poly, log_odds, rcond=None)[0]
    intercept = 0.0

# Evaluate on a 100 x 100 grid. The transpose/reshape orders the rows
# tenure-major: row j*100 + i holds (tenure_fit[j], charges_fit[i]).
tenure_fit = np.linspace(0, 70, 100)
charges_fit = np.linspace(20, 120, 100)
X_fit = np.array(np.meshgrid(tenure_fit, charges_fit)).T.reshape(-1, 2)
X_fit_poly = poly.transform(X_fit)
y_fit_log_odds = X_fit_poly @ params + intercept

# Convert to probability using logistic function
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Plot log-odds fit
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(tenure_bins, log_odds, color='blue', label="Log-Odds (Actual)")
# reshape(100, -1) gives one row per tenure value; mean(axis=1) averages
# over the charges grid.
plt.plot(tenure_fit, y_fit_log_odds.reshape(100, -1).mean(axis=1), color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("Tenure")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn")
plt.legend()

# Plot logistic probability fit
plt.subplot(1, 2, 2)
plt.scatter(tenure_bins, churn_yes_counts / (churn_yes_counts + churn_no_counts), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, y_fit_prob.reshape(100, -1).mean(axis=1), color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("Tenure")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability")
plt.legend()

plt.show()


# Build the equation text; sklearn separates factors of a product term with a
# space, which is turned into an explicit multiplication sign.
equation_terms = " + ".join([
    f"{coef:.5f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, poly.get_feature_names_out(['tenure', 'charges']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.5f} + {equation_terms})))")

# Compute R² Score (log-odds space, at the bin centers)
log_odds_pred = X_poly @ params + intercept
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Sample bin centers for Tenure and Monthly Charges (index-aligned, 7 each)
tenure_bins = np.array([5, 15, 25, 35, 45, 55, 65])  # 7 tenure bins
monthly_charges_bins = np.array([20, 40, 60, 80, 100, 110, 120])  # 7 monthly charge bins

# Churn=Yes and Churn=No counts (must match bin lengths)
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])  # 7 values
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])  # 7 values

# Apply natural logarithm (ln) to absolute values of churn counts
ln_churn_yes = np.log(np.abs(churn_yes_counts))
ln_churn_no = np.log(np.abs(churn_no_counts))

# Log-odds as a difference of logs: ln(yes) - ln(no) = ln(yes / no)
log_odds = ln_churn_yes - ln_churn_no

# Apply natural logarithm to absolute feature values
ln_tenure = np.log(np.abs(tenure_bins))
ln_charges = np.log(np.abs(monthly_charges_bins))

# Stack transformed features into a (7, 2) design matrix
X_data = np.vstack((ln_tenure, ln_charges)).T

# Polynomial expansion of the log-features (no bias column; the regression
# below fits its own intercept)
degree = 2  # Set polynomial degree
poly = PolynomialFeatures(degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Fit a linear regression model to the polynomial-transformed features
model = LinearRegression()
model.fit(X_poly, log_odds)

# Get polynomial coefficients
params = model.coef_
intercept = model.intercept_

# Evaluate on a grid spanning the observed log-feature ranges. The
# transpose/reshape orders the rows tenure-major: row j*100 + i holds
# (tenure_fit[j], charges_fit[i]).
tenure_fit = np.linspace(ln_tenure.min(), ln_tenure.max(), 100)
charges_fit = np.linspace(ln_charges.min(), ln_charges.max(), 100)
X_fit = np.array(np.meshgrid(tenure_fit, charges_fit)).T.reshape(-1, 2)

# Transform fitted features
X_fit_poly = poly.transform(X_fit)
y_fit_log_odds = model.predict(X_fit_poly)

# Convert to probability using logistic function
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Collapse the grid to one value per tenure by averaging over charges.
# Slicing the first 100 rows instead would hold tenure at its minimum and
# vary charges, which is not a function of the x-axis being plotted.
n_tenure, n_charges = len(tenure_fit), len(charges_fit)
log_odds_curve = y_fit_log_odds.reshape(n_tenure, n_charges).mean(axis=1)
prob_curve = y_fit_prob.reshape(n_tenure, n_charges).mean(axis=1)

# Plot log-odds fit
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(ln_tenure, log_odds, color='blue', label="Log-Odds (Actual)")
plt.plot(tenure_fit, log_odds_curve, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn (Log-Transformed Features & Churn)")
plt.legend()

# Plot logistic probability fit
plt.subplot(1, 2, 2)
plt.scatter(ln_tenure, churn_yes_counts / (churn_yes_counts + churn_no_counts), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, prob_curve, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability (Log-Transformed Features & Churn)")
plt.legend()

plt.show()

# Print the final logistic equation with explicit multiplication signs
equation_terms = " + ".join([
    f"{coef:.5f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, poly.get_feature_names_out(['ln_[tenure]', 'ln_[charges]']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.5f} + {equation_terms})))")

# Compute R² Score for the polynomial fit (log-odds space, at the bin centers)
log_odds_pred = model.predict(X_poly)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")

symbolic logistic multifeature

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Maximum polynomial degree of the log-odds model (adjustable).
max_poly_degree = 2

# Sample bin centers for Tenure
tenure_bins = np.array([5, 15, 25, 35, 45, 55, 65])  # 7 tenure bins

# Churn=Yes and Churn=No counts (index-aligned with tenure_bins)
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])  # 7 values
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])  # 7 values

# Compute log-odds: log(Churn Yes / Churn No), kept as a (7,1) column
log_odds = np.log(churn_yes_counts / churn_no_counts).reshape(-1, 1)

# Expand tenure into polynomial terms without a bias column.
X_data = tenure_bins.reshape(-1, 1)  # Ensure (7,1) shape for sklearn
poly = PolynomialFeatures(degree=max_poly_degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Choose the estimator by comparing coefficient count with data-point count.
num_params = X_poly.shape[1]  # Number of polynomial terms
num_data_points = len(log_odds)

if num_params > num_data_points:
    print(f"⚠️ Too many parameters ({num_params}) for {num_data_points} data points! Using Ridge regression instead.")
    model = Ridge(alpha=0.1)  # Regularization
    model.fit(X_poly, log_odds.ravel())  # Fit Ridge model
    params = model.coef_
    # Ridge fits an intercept by default; it must be carried into every
    # prediction, otherwise the curve, the R² and the equation are all wrong.
    intercept = model.intercept_
else:
    # Plain least squares on the bias-free features, i.e. a fit through the
    # origin, so the intercept is exactly zero.
    params = np.linalg.lstsq(X_poly, log_odds, rcond=None)[0]
    intercept = 0.0

# Evaluate the fit on a dense tenure grid.
tenure_fit = np.linspace(0, 70, 100).reshape(-1, 1)  # Ensure shape (100,1)
X_fit_poly = poly.transform(tenure_fit)
y_fit_log_odds = X_fit_poly @ params + intercept

# Convert to probability using logistic function
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Plot log-odds fit
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(tenure_bins, log_odds, color='blue', label="Log-Odds (Actual)")
plt.plot(tenure_fit, y_fit_log_odds, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("Tenure")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn")
plt.legend()

# Plot logistic probability fit
plt.subplot(1, 2, 2)
plt.scatter(tenure_bins, churn_yes_counts / (churn_yes_counts + churn_no_counts), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, y_fit_prob, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("Tenure")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability")
plt.legend()

plt.show()

# Generate final logistic equation. params is (k,1) from lstsq or (k,) from
# Ridge, so it is flattened before pairing with the feature names.
equation_terms = " + ".join([
    f"{coef:.5f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params.flatten(), poly.get_feature_names_out(['tenure']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.5f} + {equation_terms})))")

# Compute R² Score (log-odds space, at the bin centers)
log_odds_pred = X_poly @ params + intercept
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")


import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Tenure bin midpoints and the churned / retained counts per bin
# (index-aligned, 7 entries each).
tenure_bins = np.array([5, 15, 25, 35, 45, 55, 65])
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])

# Log-odds of churn as a difference of logs: ln|yes| - ln|no|.
ln_churn_yes = np.log(np.abs(churn_yes_counts))
ln_churn_no = np.log(np.abs(churn_no_counts))
log_odds = ln_churn_yes - ln_churn_no

# The single regressor is ln|tenure|, shaped as a column for sklearn.
ln_tenure = np.log(np.abs(tenure_bins))
X_data = ln_tenure.reshape(-1, 1)

# Polynomial expansion (no bias column; the regression fits its own
# intercept) followed by an ordinary least-squares fit.
degree = 1
poly = PolynomialFeatures(degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

model = LinearRegression()
model.fit(X_poly, log_odds)
params = model.coef_
intercept = model.intercept_

# Predict on a dense grid covering the observed ln(tenure) range and map the
# predicted log-odds through the logistic function.
tenure_fit = np.linspace(ln_tenure.min(), ln_tenure.max(), 100).reshape(-1, 1)
X_fit_poly = poly.transform(tenure_fit)
y_fit_log_odds = model.predict(X_fit_poly)
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Two panels side by side: log-odds on the left, probability on the right.
fig, (ax_log_odds, ax_prob) = plt.subplots(1, 2, figsize=(12, 5))

ax_log_odds.scatter(ln_tenure, log_odds, color='blue', label="Log-Odds (Actual)")
ax_log_odds.plot(tenure_fit, y_fit_log_odds, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
ax_log_odds.set_xlabel("ln(Tenure)")
ax_log_odds.set_ylabel("Log-Odds of Churn")
ax_log_odds.set_title("Polynomial Fit to Log-Odds of Churn (Log-Transformed Features & Churn)")
ax_log_odds.legend()

# Observed churn fraction per bin: yes / (yes + no).
observed_prob = churn_yes_counts / (churn_yes_counts + churn_no_counts)
ax_prob.scatter(ln_tenure, observed_prob, color='green', label="Churn Probability (Actual)")
ax_prob.plot(tenure_fit, y_fit_prob, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
ax_prob.set_xlabel("ln(Tenure)")
ax_prob.set_ylabel("Probability of Churn=Yes")
ax_prob.set_title("Logistic Curve Fit for Churn Probability (Log-Transformed Features & Churn)")
ax_prob.legend()

plt.show()

# Equation text: sklearn separates factors of a product term with a space,
# which is turned into an explicit multiplication sign.
term_names = poly.get_feature_names_out(['ln_tenure'])
equation_terms = " + ".join(
    f"{coef:.5f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, term_names)
)
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.5f} + {equation_terms})))")

# Goodness of fit in log-odds space, at the bin centers only.
log_odds_pred = model.predict(X_poly)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")

symbolic logistic single feature

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Small additive constant so that a zero count on one side still has a finite
# logarithm.
epsilon = 1e-2

# Sample bin centers for Tenure
tenure_bins = np.array([10.1041075, 23.3849225, 36.6657375, 49.946552499999996, 63.2273675, 76.5081825, 89.7889975, 
                     103.06981249999998, 116.3506275, 129.6314425, 142.91225749999995, 156.19307249999997, 169.4738875, 
                     182.75470249999995, 196.03551749999997, 209.3163325, 222.59714749999995, 235.87796249999997, 
                     249.15877749999999, 262.4395925])

# Churn=Yes and Churn=No counts (index-aligned with tenure_bins)
churn_yes_counts = np.array([592, 1870, 4286, 2632, 387, 112, 69, 26, 8, 6, 4, 2, 2, 1, 2, 0, 0, 0, 0, 1])
churn_no_counts = np.array([0, 1877, 5033, 2455, 421, 133, 45, 20, 8, 1, 3, 1, 0, 2, 0, 0, 0, 0, 1, 0])

# Drop bins with no observations at all. With 0 yes and 0 no the log-odds
# are undefined; the epsilon smoothing would turn them into a fake
# "log-odds = 0" data point, and the empirical probability would be 0/0.
has_observations = (churn_yes_counts + churn_no_counts) > 0
tenure_bins = tenure_bins[has_observations]
churn_yes_counts = churn_yes_counts[has_observations]
churn_no_counts = churn_no_counts[has_observations]

# Apply natural logarithm (ln) with epsilon to prevent log(0)
ln_churn_yes = np.log(churn_yes_counts + epsilon)
ln_churn_no = np.log(churn_no_counts + epsilon)

# Compute log-odds
log_odds = ln_churn_yes - ln_churn_no

# Apply natural logarithm to tenure bins
ln_tenure = np.log(tenure_bins + epsilon)

# Stack transformed features
X_data = ln_tenure.reshape(-1, 1)  # Ensure it's 2D for sklearn

# Polynomial degree. A degree-d fit has d + 1 free coefficients (including
# the intercept), so anything above n - 1 is underdetermined for n bins and
# the "fit" is numerically meaningless; the requested degree is capped there.
requested_degree = 301  # Adjust the degree (too high can overfit)
max_degree = len(tenure_bins) - 1
degree = min(requested_degree, max_degree)
if degree < requested_degree:
    print(f"Requested degree {requested_degree} exceeds what {len(tenure_bins)} bins can determine; using degree {degree}.")
poly = PolynomialFeatures(degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Fit linear regression model
model = LinearRegression()
model.fit(X_poly, log_odds)

# Get polynomial coefficients
params = model.coef_
intercept = model.intercept_

# Generate fitted curve over the observed ln(tenure) range
tenure_fit = np.linspace(ln_tenure.min(), ln_tenure.max(), 100).reshape(-1, 1)

# Transform fitted features
X_fit_poly = poly.transform(tenure_fit)
y_fit_log_odds = model.predict(X_fit_poly)

# Convert to probability using logistic function
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Plot log-odds fit
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(ln_tenure, log_odds, color='blue', label="Log-Odds (Actual)")
plt.plot(tenure_fit, y_fit_log_odds, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn")
plt.legend()

# Plot logistic probability fit (the denominator is non-zero for every bin
# because empty bins were removed above)
plt.subplot(1, 2, 2)
plt.scatter(ln_tenure, churn_yes_counts / (churn_yes_counts + churn_no_counts), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, y_fit_prob, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability")
plt.legend()

plt.show()

# Print logistic equation
equation_terms = " + ".join([
    f"{coef:.6f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, poly.get_feature_names_out(['ln_tenure']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.6f} + {equation_terms})))")

# Compute R² Score (log-odds space, at the bin centers)
log_odds_pred = model.predict(X_poly)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Small epsilon to prevent log(0)
epsilon = 1e-10

# Sample bin centers for Tenure and Monthly Charges (index-aligned, 7 each)
tenure_bins = np.array([5, 15, 25, 35, 45, 55, 65])  # 7 tenure bins
monthly_charges_bins = np.array([20, 40, 60, 80, 100, 110, 120])  # 7 monthly charge bins

# Churn=Yes and Churn=No counts (must match bin lengths)
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])  # 7 values
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])  # 7 values

# Apply natural logarithm (ln) to absolute values of churn counts
ln_churn_yes = np.log(np.abs(churn_yes_counts) + epsilon)
ln_churn_no = np.log(np.abs(churn_no_counts) + epsilon)

# Log-odds as a difference of logs
log_odds = ln_churn_yes - ln_churn_no

# Apply natural logarithm to absolute feature values
ln_tenure = np.log(np.abs(tenure_bins) + epsilon)
ln_charges = np.log(np.abs(monthly_charges_bins) + epsilon)

# Stack transformed features into a (7, 2) design matrix
X_data = np.vstack((ln_tenure, ln_charges)).T

# Polynomial expansion of the log-features (no bias column; the regression
# below fits its own intercept)
degree = 2  # Set polynomial degree
poly = PolynomialFeatures(degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Fit a linear regression model to the polynomial-transformed features
model = LinearRegression()
model.fit(X_poly, log_odds)

# Get polynomial coefficients
params = model.coef_
intercept = model.intercept_

# Evaluate on a grid spanning the observed log-feature ranges. The
# transpose/reshape orders the rows tenure-major: row j*100 + i holds
# (tenure_fit[j], charges_fit[i]).
tenure_fit = np.linspace(ln_tenure.min(), ln_tenure.max(), 100)
charges_fit = np.linspace(ln_charges.min(), ln_charges.max(), 100)
X_fit = np.array(np.meshgrid(tenure_fit, charges_fit)).T.reshape(-1, 2)

# Transform fitted features
X_fit_poly = poly.transform(X_fit)
y_fit_log_odds = model.predict(X_fit_poly)

# Convert to probability using logistic function
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Collapse the grid to one value per tenure by averaging over charges.
# Slicing the first 100 rows instead would hold tenure at its minimum and
# vary charges, which is not a function of the x-axis being plotted.
n_tenure, n_charges = len(tenure_fit), len(charges_fit)
log_odds_curve = y_fit_log_odds.reshape(n_tenure, n_charges).mean(axis=1)
prob_curve = y_fit_prob.reshape(n_tenure, n_charges).mean(axis=1)

# Plot log-odds fit
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(ln_tenure, log_odds, color='blue', label="Log-Odds (Actual)")
plt.plot(tenure_fit, log_odds_curve, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn (Log-Transformed Features & Churn)")
plt.legend()

# Plot logistic probability fit
plt.subplot(1, 2, 2)
plt.scatter(ln_tenure, churn_yes_counts / (churn_yes_counts + churn_no_counts), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, prob_curve, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability (Log-Transformed Features & Churn)")
plt.legend()

plt.show()

# Print the final logistic equation with explicit multiplication signs
equation_terms = " + ".join([
    f"{coef:.5f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, poly.get_feature_names_out(['ln_[tenure]', 'ln_[charges]']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.5f} + {equation_terms})))")

# Compute R² Score for the polynomial fit (log-odds space, at the bin centers)
log_odds_pred = model.predict(X_poly)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")

symbolic logistic multifeature

In [None]:
#@title symbolic regression linear transformation-epsilon
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Offset added before every logarithm so that an exact zero stays finite.
epsilon = 1e-10

# Toy data set: three regressors and one (negative-valued) target.
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 2, 3, 4])
x3 = np.array([1, 3, 2, 4])
y = np.array([-9, -8836, -1079, -67776])

# Magnitudes only: the sign is discarded so the logarithm is defined.
y_positive = np.abs(y) + epsilon
x1_positive = np.abs(x1) + epsilon
x2_positive = np.abs(x2) + epsilon
x3_positive = np.abs(x3) + epsilon

# Log-transform everything and keep each series as an (n, 1) column.
ln_y = np.log(y_positive).reshape(-1, 1)
ln_x1 = np.log(x1_positive).reshape(-1, 1)
ln_x2 = np.log(x2_positive).reshape(-1, 1)
ln_x3 = np.log(x3_positive).reshape(-1, 1)

# Design matrix with one log-feature per column.
X_log = np.hstack([ln_x1, ln_x2, ln_x3])

# Polynomial expansion of the log-features (no bias column; the regression
# fits its own intercept).
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X_log)

# Ordinary least squares in the transformed space; the target is flattened
# because the estimator is fitted on a 1-D y.
model = LinearRegression()
model.fit(X_poly, ln_y.ravel())
coefficients = model.coef_
intercept = model.intercept_

# In-sample predictions and their R² in log space.
ln_y_pred = model.predict(X_poly)
r2 = r2_score(ln_y, ln_y_pred)

# Assemble the fitted equation as text, one " + coef * term" per feature.
feature_names = poly.get_feature_names_out(["ln(|x1|)", "ln(|x2|)", "ln(|x3|)"])
equation = f"ln(y) = {intercept:.5f}" + "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)

print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.5f}")
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Offset added before every logarithm so that an exact zero stays finite.
epsilon = 1e-10

# Toy data set: one regressor, one target, three samples.
x1 = np.array([30, 45, 60])
y = np.array([0.5, 0.71, 0.867])

# Magnitudes only, shifted by epsilon, then log-transformed.
y_positive = np.abs(y) + epsilon
x1_positive = np.abs(x1) + epsilon
ln_y = np.log(y_positive)
ln_x1 = np.log(x1_positive)

# PolynomialFeatures needs a 2-D input, so the single feature becomes a column.
X_log = ln_x1.reshape(-1, 1)

# Polynomial expansion of the log-feature (no bias column; the regression
# fits its own intercept).
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X_log)

# Ordinary least squares in the transformed space.
model = LinearRegression()
model.fit(X_poly, ln_y)
coefficients = model.coef_
intercept = model.intercept_

# In-sample predictions and their R² in log space.
ln_y_pred = model.predict(X_poly)
r2 = r2_score(ln_y, ln_y_pred)

# Assemble the fitted equation as text, one " + coef * term" per feature.
feature_names = poly.get_feature_names_out(["ln(|x1|)"])
equation = f"ln(y) = {intercept:.5f}" + "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)

print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.5f}")


symbolic reg linear transformation epsilon

In [None]:
#@title SYMBOLIC REGRESSION
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Offset added before every logarithm so that an exact zero stays finite.
epsilon = 1e-10

# Load the data set; the path and encoding are specific to the author's machine.
df = pd.read_csv("E:/AIRCRAFT/Aircraft Engines.csv", encoding='latin1')

# First pass: make every object-typed column uniformly string-valued.
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].astype(str)

# Second pass: replace the strings in each remaining object column with
# integer codes. The same encoder instance is refitted for each column.
label_encoder = LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    df[column] = label_encoder.fit_transform(df[column])

# The target is named explicitly; every other column is used as a feature.
target_column = "power"
feature_columns = [col for col in df.columns if col != target_column]

X = df[feature_columns].values  # Feature matrix
y = df[target_column].values  # Target variable

# Magnitudes only, shifted by epsilon so the logarithm is defined everywhere.
y_positive = np.abs(y) + epsilon
X_positive = np.abs(X) + epsilon

# Log-transform target (as an (n, 1) column) and features.
ln_y = np.log(y_positive).reshape(-1, 1)
ln_X = np.log(X_positive)

# Polynomial expansion of the log-features (no bias column; the regression
# fits its own intercept).
degree = 7
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(ln_X)

# Ordinary least squares in the transformed space, fitted on a 1-D target.
model = LinearRegression()
model.fit(X_poly, ln_y.ravel())
coefficients = model.coef_
intercept = model.intercept_

# In-sample predictions and their R² in log space.
ln_y_pred = model.predict(X_poly)
r2 = r2_score(ln_y, ln_y_pred)

# Assemble the fitted equation as text, naming each term after its source
# column wrapped as ln(|column|).
feature_names = poly.get_feature_names_out([f"ln(|{col}|)" for col in feature_columns])
equation = f"ln(y) = {intercept:.5f}" + "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)

print("Equation in transformed space (log-linearized with polynomial features):")
print(equation)
print(f"R-squared (transformed space): {r2:.5f}")


SYMBOLIC REG-CSV

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import os

# Enable inline plotting for Spyder


# Location of the Z-boson data set on the author's machine.
file_path = os.path.join("E:/LHC-CERN/Z_boson.csv")
df = pd.read_csv(file_path)

# Boolean row masks for the two event classes.
is_zee = df['class'] == 'Zee'
is_zmumu = df['class'] == 'Zmumu'

# Quick look at the data: full frame, dtypes, shape, then each class subset.
for summary in (df, df.dtypes, df.shape, df[is_zee], df[is_zmumu]):
    print(summary)

# pt1 values split by class.
pt1_class_zmumu = df.loc[is_zmumu, 'pt1']
pt1_class_zee = df.loc[is_zee, 'pt1']

# Grouped histogram of pt1 with one bar series per class.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(
    [pt1_class_zmumu, pt1_class_zee],
    bins=20,
    rwidth=0.95,
    color=['green', 'red'],
    label=['Class=Zmumu', 'Class=Zee'],
)
ax.set_xlabel("pt1")
ax.set_ylabel("Number Of Class")
ax.set_title("Class Prediction Visualization")
ax.legend()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Define the file path
file_path = os.path.join("E:/LHC-CERN/Z_boson.csv")

# Load the CSV file
df = pd.read_csv(file_path)

# Extract pt1 values for both classes
pt1_class_zmumu = df.loc[df['class'] == 'Zmumu', 'pt1']
pt1_class_zee = df.loc[df['class'] == 'Zee', 'pt1']

# Build ONE set of 20 bin edges from the pooled pt1 values of both classes.
# Taking the edges from the Zmumu histogram alone (as before) silently drops
# every Zee event whose pt1 lies outside the Zmumu range, which biases the
# per-bin Zee counts extracted below.
bins = np.histogram_bin_edges(
    np.concatenate([pt1_class_zmumu.to_numpy(), pt1_class_zee.to_numpy()]),
    bins=20,
)

# Plot settings
plt.figure(figsize=(8, 6))  # Set figure size
plt.xlabel("pt1")
plt.ylabel("Number of Class")
plt.title("Class Prediction Visualization")

# Overlaid histograms on the shared edges; keep the per-bin counts
n_zmumu, _, _ = plt.hist(pt1_class_zmumu, bins=bins, rwidth=0.95, color='green', alpha=0.7, label='Class=Zmumu', density=False)
n_zee, _, _ = plt.hist(pt1_class_zee, bins=bins, rwidth=0.95, color='red', alpha=0.7, label='Class=Zee', density=False)

# Add legend and show the plot
plt.legend()
plt.show()

# Compute bin centers
bin_centers = (bins[:-1] + bins[1:]) / 2  # Midpoints of bins

# One row per bin: center plus the count of each class
data = pd.DataFrame({
    'pt1_bin_center': bin_centers,
    'Zmumu_counts': n_zmumu,
    'Zee_counts': n_zee
})

# Keep only bins populated by BOTH classes (needed later for log-odds)
filtered_data = data[(data['Zmumu_counts'] > 0) & (data['Zee_counts'] > 0)]

# Print the filtered lists horizontally
print("\nFiltered pt1_bin_center:", list(filtered_data['pt1_bin_center']))
print("Filtered Zmumu_counts:   ", list(filtered_data['Zmumu_counts']))
print("Filtered Zee_counts:     ", list(filtered_data['Zee_counts']))

Logistic curve value extraction

In [None]:
#@title MainLogistic
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import os

# Enable inline plotting for Spyder


# Load the Z-boson event table.
file_path = os.path.join("E:/LHC-CERN/Z_boson.csv")
df = pd.read_csv(file_path)

# Row selectors for the two decay classes.
is_zee = df['class'] == 'Zee'
is_zmumu = df['class'] == 'Zmumu'

# Dump the table, its column dtypes and shape, then the rows of each class.
print(df)
print(df.dtypes)
print(df.shape)
print(df.loc[is_zee])
print(df.loc[is_zmumu])

# pt1 values split by class.
pt1_class_zmumu = df.loc[is_zmumu, 'pt1']
pt1_class_zee = df.loc[is_zee, 'pt1']

# Side-by-side histogram of pt1 for both classes (20 shared bins).
plt.figure(figsize=(8, 6))
plt.hist(
    [pt1_class_zmumu, pt1_class_zee],
    bins=20,
    rwidth=0.95,
    color=['green', 'red'],
    label=['Class=Zmumu', 'Class=Zee'],
)
plt.title("Class Prediction Visualization")
plt.xlabel("pt1")
plt.ylabel("Number Of Class")

# Legend, then render.
plt.legend()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Define the file path
file_path = os.path.join("E:/LHC-CERN/Z_boson.csv")

# Load the CSV file
df = pd.read_csv(file_path)

# Extract pt1 values for both classes
pt1_class_zmumu = df.loc[df['class'] == 'Zmumu', 'pt1']
pt1_class_zee = df.loc[df['class'] == 'Zee', 'pt1']

# Build ONE set of 20 bin edges from the pooled pt1 values of both classes.
# Taking the edges from the Zmumu histogram alone (as before) silently drops
# every Zee event whose pt1 lies outside the Zmumu range, which biases the
# per-bin Zee counts extracted below.
bins = np.histogram_bin_edges(
    np.concatenate([pt1_class_zmumu.to_numpy(), pt1_class_zee.to_numpy()]),
    bins=20,
)

# Plot settings
plt.figure(figsize=(8, 6))  # Set figure size
plt.xlabel("pt1")
plt.ylabel("Number of Class")
plt.title("Class Prediction Visualization")

# Overlaid histograms on the shared edges; keep the per-bin counts
n_zmumu, _, _ = plt.hist(pt1_class_zmumu, bins=bins, rwidth=0.95, color='green', alpha=0.7, label='Class=Zmumu', density=False)
n_zee, _, _ = plt.hist(pt1_class_zee, bins=bins, rwidth=0.95, color='red', alpha=0.7, label='Class=Zee', density=False)

# Add legend and show the plot
plt.legend()
plt.show()

# Compute bin centers
bin_centers = (bins[:-1] + bins[1:]) / 2  # Midpoints of bins

# One row per bin: center plus the count of each class
data = pd.DataFrame({
    'pt1_bin_center': bin_centers,
    'Zmumu_counts': n_zmumu,
    'Zee_counts': n_zee
})

# Keep only bins populated by BOTH classes (needed later for log-odds)
filtered_data = data[(data['Zmumu_counts'] > 0) & (data['Zee_counts'] > 0)]

# Print the filtered lists horizontally
print("\nFiltered pt1_bin_center:", list(filtered_data['pt1_bin_center']))
print("Filtered Zmumu_counts:   ", list(filtered_data['Zmumu_counts']))
print("Filtered Zee_counts:     ", list(filtered_data['Zee_counts']))

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Small shift applied to the bin centers before taking ln()
epsilon = 1e-2

# Bin centers of the single feature used in this cell (named "tenure" here)
tenure_bins = np.array([10.1041075, 23.3849225, 36.6657375, 49.946552499999996, 63.2273675, 76.5081825, 89.7889975,
                     103.06981249999998, 116.3506275, 129.6314425, 142.91225749999995, 156.19307249999997, 169.4738875,
                     182.75470249999995, 196.03551749999997, 209.3163325, 222.59714749999995, 235.87796249999997,
                     249.15877749999999, 262.4395925])

# Churn=Yes and Churn=No counts per bin
churn_yes_counts = np.array([592, 1870, 4286, 2632, 387, 112, 69, 26, 8, 6, 4, 2, 2, 1, 2, 0, 0, 0, 0, 1])
churn_no_counts = np.array([0, 1877, 5033, 2455, 421, 133, 45, 20, 8, 1, 3, 1, 0, 2, 0, 0, 0, 0, 1, 0])

# Use only bins populated by BOTH classes. The log-odds ln(yes/no) is undefined
# when either count is zero; padding zeros with epsilon produced artificial
# log-odds (|ln(count / 0.01)|, or exactly 0 for empty bins) that dominated the
# fit, and the empirical probability yes / (yes + no) was 0/0 for empty bins.
valid_bins = (churn_yes_counts > 0) & (churn_no_counts > 0)
yes_valid = churn_yes_counts[valid_bins]
no_valid = churn_no_counts[valid_bins]

# Natural log of the (strictly positive) counts
ln_churn_yes = np.log(yes_valid)
ln_churn_no = np.log(no_valid)

# Compute log-odds
log_odds = ln_churn_yes - ln_churn_no

# Apply natural logarithm to the retained bin centers
ln_tenure = np.log(tenure_bins[valid_bins] + epsilon)

# Single feature as a 2-D column for sklearn
X_data = ln_tenure.reshape(-1, 1)

# Polynomial degree: a degree of n_points - 1 already interpolates every point,
# so anything above it makes the least-squares problem under-determined.
# Cap the requested degree accordingly (a small degree is still the safer choice).
requested_degree = 301
degree = max(1, min(requested_degree, len(X_data) - 1))
poly = PolynomialFeatures(degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Fit linear regression model
model = LinearRegression()
model.fit(X_poly, log_odds)

# Get polynomial coefficients
params = model.coef_
intercept = model.intercept_

# Dense grid over the observed ln(tenure) range for drawing the fitted curve
tenure_fit = np.linspace(ln_tenure.min(), ln_tenure.max(), 100).reshape(-1, 1)

# Transform fitted features
X_fit_poly = poly.transform(tenure_fit)
y_fit_log_odds = model.predict(X_fit_poly)

# Convert to probability using logistic function
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Plot log-odds fit
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(ln_tenure, log_odds, color='blue', label="Log-Odds (Actual)")
plt.plot(tenure_fit, y_fit_log_odds, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn")
plt.legend()

# Plot logistic probability fit against the empirical per-bin probability
plt.subplot(1, 2, 2)
plt.scatter(ln_tenure, yes_valid / (yes_valid + no_valid), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, y_fit_prob, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability")
plt.legend()

plt.show()

# Print logistic equation
equation_terms = " + ".join([
    f"{coef:.6f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, poly.get_feature_names_out(['ln_tenure']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.6f} + {equation_terms})))")

# In-sample R² of the polynomial fit to the log-odds
log_odds_pred = model.predict(X_poly)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Small epsilon to prevent log(0)
epsilon = 1e-10

# Sample bin centers for Tenure and Monthly Charges
tenure_bins = np.array([5, 15, 25, 35, 45, 55, 65])  # 7 tenure bins
monthly_charges_bins = np.array([20, 40, 60, 80, 100, 110, 120])  # 7 monthly charge bins

# Churn=Yes and Churn=No counts (must match bin lengths)
churn_yes_counts = np.array([900, 250, 150, 120, 100, 90, 80])  # 7 values
churn_no_counts = np.array([850, 450, 400, 380, 350, 400, 450])  # 7 values

# Apply natural logarithm (ln) to absolute values of churn counts
ln_churn_yes = np.log(np.abs(churn_yes_counts) + epsilon)
ln_churn_no = np.log(np.abs(churn_no_counts) + epsilon)

# Compute log-odds using log-transformed churn values
log_odds = ln_churn_yes - ln_churn_no

# Apply natural logarithm to absolute feature values
ln_tenure = np.log(np.abs(tenure_bins) + epsilon)
ln_charges = np.log(np.abs(monthly_charges_bins) + epsilon)

# Stack transformed features: one row per bin, columns (ln_tenure, ln_charges)
X_data = np.vstack((ln_tenure, ln_charges)).T

# Fit a polynomial model to the transformed features
degree = 2  # Set polynomial degree
poly = PolynomialFeatures(degree, include_bias=False)
X_poly = poly.fit_transform(X_data)

# Fit a linear regression model to the polynomial-transformed features
model = LinearRegression()
model.fit(X_poly, log_odds)

# Get polynomial coefficients
params = model.coef_
intercept = model.intercept_

# Full 100 x 100 grid over both features (kept for surface-style inspection)
tenure_fit = np.linspace(ln_tenure.min(), ln_tenure.max(), 100)
charges_fit = np.linspace(ln_charges.min(), ln_charges.max(), 100)
X_fit = np.array(np.meshgrid(tenure_fit, charges_fit)).T.reshape(-1, 2)

# Predictions on the grid
X_fit_poly = poly.transform(X_fit)
y_fit_log_odds = model.predict(X_fit_poly)

# Convert to probability using logistic function
y_fit_prob = 1 / (1 + np.exp(-y_fit_log_odds))

# Curve to draw against ln(Tenure). The first 100 grid rows hold tenure fixed
# at its minimum while sweeping charges, so plotting y_fit_log_odds[:100]
# against tenure_fit showed the wrong quantity. Instead, evaluate the model
# along the path the data follows: for each tenure value, take the charges
# value interpolated from the observed (ln_tenure, ln_charges) pairs.
tenure_order = np.argsort(ln_tenure)  # np.interp needs increasing x
charges_along_tenure = np.interp(tenure_fit, ln_tenure[tenure_order], ln_charges[tenure_order])
X_curve = np.column_stack((tenure_fit, charges_along_tenure))
y_curve_log_odds = model.predict(poly.transform(X_curve))
y_curve_prob = 1 / (1 + np.exp(-y_curve_log_odds))

# Plot log-odds fit
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(ln_tenure, log_odds, color='blue', label="Log-Odds (Actual)")
plt.plot(tenure_fit, y_curve_log_odds, color='black', linestyle="dashed", label="Log-Odds (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Log-Odds of Churn")
plt.title("Polynomial Fit to Log-Odds of Churn (Log-Transformed Features & Churn)")
plt.legend()

# Plot logistic probability fit
plt.subplot(1, 2, 2)
plt.scatter(ln_tenure, churn_yes_counts / (churn_yes_counts + churn_no_counts), color='green', label="Churn Probability (Actual)")
plt.plot(tenure_fit, y_curve_prob, color='red', linestyle="dashed", label="Churn Probability (Fitted)")
plt.xlabel("ln(Tenure)")
plt.ylabel("Probability of Churn=Yes")
plt.title("Logistic Curve Fit for Churn Probability (Log-Transformed Features & Churn)")
plt.legend()

plt.show()

# Print the final logistic equation with explicit multiplication signs
equation_terms = " + ".join([
    f"{coef:.5f} * {feature.replace(' ', ' * ')}"
    for coef, feature in zip(params, poly.get_feature_names_out(['ln_[tenure]', 'ln_[charges]']))
])
print(f"\nFinal Logistic Equation:\nP(Churn=Yes) = 1 / (1 + e^(-({intercept:.5f} + {equation_terms})))")

# In-sample R² of the polynomial fit to the log-odds
log_odds_pred = model.predict(X_poly)
r2 = r2_score(log_odds, log_odds_pred)
print(f"R² Score for Polynomial Fit: {r2:.4f}")


Main logistic

In [None]:
#@title Non-linear-logistic-main
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Tiny positive offset added to each feature magnitude below.
epsilon = 1e-10

# Toy data: three features and one continuous response.
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 2, 3, 4])
x3 = np.array([1, 3, 2, 4])
y = np.array([-9, -8836, -1079, -67776])

# |x| + epsilon per feature, so every input is strictly positive.
x1_transformed, x2_transformed, x3_transformed = (
    np.abs(feature) + epsilon for feature in (x1, x2, x3)
)

# Binary target: 1 where the response is below -5000, otherwise 0.
y_binary = np.less(y, -5000).astype(int)

# Design matrix, one column per feature.
X = np.stack([x1_transformed, x2_transformed, x3_transformed], axis=1)

# All monomials up to `degree`, without the constant column.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X)

# Names of the generated terms (printed for debugging).
feature_names = poly.get_feature_names_out(["x1", "x2", "x3"])
print("Generated polynomial features:", feature_names)

# Fit the classifier (max_iter raised to help convergence) and score it in-sample.
model = LogisticRegression(max_iter=1000).fit(X_poly, y_binary)
y_pred = model.predict(X_poly)
accuracy = accuracy_score(y_binary, y_pred)

# Parameters of the single decision function.
coefficients, intercept = model.coef_[0], model.intercept_[0]

# Human-readable equation; spaces inside a term name become explicit products.
equation_terms = "".join(
    f" + {coef:.5f} * {name.replace(' ', ' * ')}"
    for coef, name in zip(coefficients, feature_names)
)
equation = f"sigmoid(y) = 1 / (1 + exp(-({intercept:.5f}" + equation_terms + ")))"

# Report.
print("\nLogistic Regression Equation (with polynomial features):")
print(equation)
print(f"Model Accuracy: {accuracy:.5f}")

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score

# Tiny positive offset added to the feature magnitude below.
epsilon = 1e-10

# Toy data: one feature and one continuous response.
x1 = np.array([30, 45, 60])
y = np.array([0.5, 0.71, 0.867])

# Binary target: 1 where the response exceeds 0.7, otherwise 0.
y_binary = np.greater(y, 0.7).astype(int)

# |x| + epsilon, then a single-column 2-D matrix for PolynomialFeatures.
x1_transformed = np.abs(x1) + epsilon
X = x1_transformed[:, np.newaxis]

# All powers up to `degree`, without the constant column.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X)

# Fit the classifier (max_iter raised for stability) and score it in-sample.
model = LogisticRegression(max_iter=1000).fit(X_poly, y_binary)
y_pred = model.predict(X_poly)
accuracy = accuracy_score(y_binary, y_pred)

# Parameters of the single decision function.
coefficients, intercept = model.coef_[0], model.intercept_[0]

# Human-readable equation, one term per generated feature.
feature_names = poly.get_feature_names_out(["x1"])
equation_terms = "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)
equation = f"sigmoid(y) = 1 / (1 + exp(-({intercept:.5f}" + equation_terms + ")))"

# Report.
print("\nLogistic Regression Equation (with polynomial features):")
print(equation)
print(f"Model Accuracy: {accuracy:.5f}")


import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Offset that keeps the argument of ln() strictly positive.
epsilon = 1e-10

# Toy data: three features and one continuous response.
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 2, 3, 4])
x3 = np.array([1, 3, 2, 4])
y = np.array([-9, -8836, -1079, -67776])

# ln(|x| + epsilon) for each feature.
x1_ln, x2_ln, x3_ln = (np.log(np.abs(feature) + epsilon) for feature in (x1, x2, x3))

# Design matrix in log space, one column per feature.
X_ln = np.stack([x1_ln, x2_ln, x3_ln], axis=1)

# All monomials up to `degree`, without the constant column.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X_ln)

# Term names, with the space between factors replaced by an explicit product.
feature_names = [
    raw_name.replace(" ", " * ")
    for raw_name in poly.get_feature_names_out(["ln(x1)", "ln(x2)", "ln(x3)"])
]

# Binary target: 1 where the response is below -5000, otherwise 0.
target = np.less(y, -5000).astype(int)

# Fit the classifier.
model = LogisticRegression(max_iter=1000).fit(X_poly, target)

# Parameters of the single decision function.
coefficients, intercept = model.coef_[0], model.intercept_[0]

# Human-readable equation, one term per generated feature.
equation_terms = "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)
equation = f"sigmoid(y) = 1 / (1 + exp(-({intercept:.5f}" + equation_terms + ")))"

# Report the equation and the in-sample accuracy.
print("\nCorrected Logistic Regression Equation:")
print(equation)
print(f"Model Accuracy: {accuracy_score(target, model.predict(X_poly)):.5f}")


import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score

# Offset that keeps the argument of ln() strictly positive.
epsilon = 1e-10

# Toy data: one feature and one continuous response.
x1 = np.array([30, 45, 60])
y = np.array([0.5, 0.71, 0.867])

# Binary target: 1 where the response exceeds 0.7, otherwise 0.
y_binary = np.greater(y, 0.7).astype(int)

# ln(|x| + epsilon), then a single-column 2-D matrix for PolynomialFeatures.
x1_ln = np.log(np.abs(x1) + epsilon)
X_ln = x1_ln[:, np.newaxis]

# All powers up to `degree`, without the constant column.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X_ln)

# Fit the classifier (max_iter raised for stability) and score it in-sample.
model = LogisticRegression(max_iter=1000).fit(X_poly, y_binary)
y_pred = model.predict(X_poly)
accuracy = accuracy_score(y_binary, y_pred)

# Parameters of the single decision function.
coefficients, intercept = model.coef_[0], model.intercept_[0]

# Human-readable equation, one term per generated feature.
feature_names = poly.get_feature_names_out(["ln(x1)"])
equation_terms = "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)
equation = f"sigmoid(y) = 1 / (1 + exp(-({intercept:.5f}" + equation_terms + ")))"

# Report.
print("\nLogistic Regression Equation (with ln-transformed polynomial features):")
print(equation)
print(f"Model Accuracy: {accuracy:.5f}")

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Toy data: three features and one continuous response.
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 2, 3, 4])
x3 = np.array([1, 3, 2, 4])
y = np.array([-9, -8836, -1079, -67776])

# Binary target: 1 where the response is below -5000, otherwise 0.
y_binary = np.less(y, -5000).astype(int)

# Design matrix from the raw (untransformed) features, one column each.
X = np.stack([x1, x2, x3], axis=1)

# All monomials up to `degree`, without the constant column.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X)

# Term names, with the space between factors replaced by an explicit product.
feature_names = [
    raw_name.replace(" ", " * ")
    for raw_name in poly.get_feature_names_out(["x1", "x2", "x3"])
]

# Fit the classifier (max_iter raised to help convergence) and score it in-sample.
model = LogisticRegression(max_iter=1000).fit(X_poly, y_binary)
y_pred = model.predict(X_poly)
accuracy = accuracy_score(y_binary, y_pred)

# Parameters of the single decision function.
coefficients, intercept = model.coef_[0], model.intercept_[0]

# Human-readable equation, one term per generated feature.
equation_terms = "".join(
    f" + {coef:.5f} * {name}" for coef, name in zip(coefficients, feature_names)
)
equation = f"sigmoid(y) = 1 / (1 + exp(-({intercept:.5f}" + equation_terms + ")))"

# Report.
print("\nLogistic Regression Equation (with polynomial features):")
print(equation)
print(f"Model Accuracy: {accuracy:.5f}")


import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score

# Toy data: one feature and one continuous response.
x1 = np.array([30, 45, 60])
y = np.array([0.5, 0.71, 0.867])

# Binary target: 1 where the response exceeds 0.7, otherwise 0.
y_binary = np.greater(y, 0.7).astype(int)

# Raw feature as a single-column 2-D matrix for PolynomialFeatures.
X = x1[:, np.newaxis]

# All powers up to `degree`, without the constant column.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X)

# Fit the classifier (max_iter raised for stability) and score it in-sample.
model = LogisticRegression(max_iter=1000).fit(X_poly, y_binary)
y_pred = model.predict(X_poly)
accuracy = accuracy_score(y_binary, y_pred)

# Parameters of the single decision function.
coefficients, intercept = model.coef_[0], model.intercept_[0]

# Human-readable equation; spaces inside a term name become explicit products.
feature_names = poly.get_feature_names_out(["x1"])
equation_terms = "".join(
    f" + {coef:.5f} * {name.replace(' ', ' * ')}"
    for coef, name in zip(coefficients, feature_names)
)
equation = f"sigmoid(y) = 1 / (1 + exp(-({intercept:.5f}" + equation_terms + ")))"

# Report.
print("\nLogistic Regression Equation (with polynomial features):")
print(equation)
print(f"Model Accuracy: {accuracy:.5f}")

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Tiny offset; defined here but not used by the statements in this cell.
epsilon = 1e-10

# Inputs: x1 holds the values previously extracted as phi2_bins, x2 those
# extracted as Q2_bins, and y those extracted as class_bins.
x1 = np.array([-3.0, -3.0, -2.0, -2.0, -2.0, -1.0, -1.0, -1.0, -0.0, -0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0])
x2 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])
y = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

# Boolean target: True exactly where the class value equals 1.
y_binary = np.equal(y, 1)

# Design matrix from the raw features, one column each.
X = np.stack([x1, x2], axis=1)

# All monomials up to `degree`, without the constant column.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X)

# Names of the generated terms (printed for debugging).
feature_names = poly.get_feature_names_out(["x1", "x2"])
print("Generated polynomial features:", feature_names)

# Fit the classifier (max_iter raised to help convergence) and score it in-sample.
model = LogisticRegression(max_iter=1000).fit(X_poly, y_binary)
y_pred = model.predict(X_poly)
accuracy = accuracy_score(y_binary, y_pred)

# Parameters of the single decision function.
coefficients, intercept = model.coef_[0], model.intercept_[0]

# Human-readable equation at 15 decimals; spaces in a term name become products.
equation_terms = "".join(
    f" + {coef:.15f} * {name.replace(' ', ' * ')}"
    for coef, name in zip(coefficients, feature_names)
)
equation = f"sigmoid(y) = 1 / (1 + exp(-({intercept:.15f}" + equation_terms + ")))"

# Report.
print("\nLogistic Regression Equation (with polynomial features):")
print(equation)
print(f"Model Accuracy: {accuracy:.5f}")
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Tiny offset; defined here but not used by the statements in this cell.
epsilon = 1e-10

# Inputs: x1 holds the values previously extracted as phi2_bins and y those
# extracted as class_bins (the Q2 feature is deliberately left out here).
x1 = np.array([-3.0, -3.0, -2.0, -2.0, -2.0, -1.0, -1.0, -1.0, -0.0, -0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0])
y = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

# Boolean target: True exactly where the class value equals 1.
y_binary = np.equal(y, 1)

# Raw feature as a single-column 2-D matrix for PolynomialFeatures.
X = x1[:, np.newaxis]

# All powers up to `degree`, without the constant column.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly = poly.fit_transform(X)

# Names of the generated terms (printed for debugging).
feature_names = poly.get_feature_names_out(["x1"])
print("Generated polynomial features:", feature_names)

# Fit the classifier (max_iter raised to help convergence) and score it in-sample.
model = LogisticRegression(max_iter=1000).fit(X_poly, y_binary)
y_pred = model.predict(X_poly)
accuracy = accuracy_score(y_binary, y_pred)

# Parameters of the single decision function.
coefficients, intercept = model.coef_[0], model.intercept_[0]

# Human-readable equation at 15 decimals; spaces in a term name become products.
equation_terms = "".join(
    f" + {coef:.15f} * {name.replace(' ', ' * ')}"
    for coef, name in zip(coefficients, feature_names)
)
equation = f"sigmoid(y) = 1 / (1 + exp(-({intercept:.15f}" + equation_terms + ")))"

# Report.
print("\nLogistic Regression Equation (with polynomial features):")
print(equation)
print(f"Model Accuracy: {accuracy:.5f}")

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
data = pd.read_csv('E:/LHC-CERN/Z_boson.csv', encoding='latin1')

# Cast every non-numeric (object) column to str so each one holds a single type
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Replace the string categories with integer codes (encoder is refit per column)
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

# Mean-impute missing values in the feature columns (everything except "class")
imputer = SimpleImputer(strategy='mean')
feature_frame = data.drop("class", axis=1)
features_imputed = imputer.fit_transform(feature_frame)

# Rebuild a DataFrame for plotting. The column labels come from the frame that
# was actually imputed: data.columns[:-1] is only correct when "class" happens
# to be the LAST column, otherwise every feature would be mislabelled.
df_plot = pd.DataFrame(features_imputed, columns=feature_frame.columns)
df_plot['class'] = data['class']

# Features to bin (all columns, including the encoded class)
selected_features = df_plot.columns

# Number of bins per feature (adjust as needed)
num_bins = 7

# Bin edges per feature
feature_bins = {}

for feature in selected_features:
    min_val, max_val = df_plot[feature].min(), df_plot[feature].max()

    # num_bins equal-width bins need num_bins + 1 edges
    bins = np.linspace(min_val, max_val, num_bins + 1)

    # Round to 4 decimals: rounding to whole numbers collapses neighbouring
    # edges of narrow-range features into duplicates.
    feature_bins[feature] = np.round(bins, decimals=4)

    # Draw the histogram on the SAME edges as the overlay so bars and red lines
    # agree; a constant column has zero-width edges, so fall back to a count.
    hist_bins = bins if max_val > min_val else num_bins
    plt.figure(figsize=(8, 5))
    sns.histplot(df_plot[feature], bins=hist_bins, kde=True, color="blue", alpha=0.6)

    # Overlay bin edges as vertical lines
    for bin_edge in bins:
        plt.axvline(bin_edge, color='red', linestyle='dashed', alpha=0.7)

    plt.title(f"Histogram of {feature} with Bins")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

# Print extracted bins as ready-to-paste NumPy array literals
for feature, bins in feature_bins.items():
    print(f"{feature}_bins = np.array({bins.tolist()})")  # Convert to np.array format

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
data = pd.read_csv('E:/LHC-CERN/Z_boson.csv', encoding='latin1')

# Cast every non-numeric (object) column to str so each one holds a single type
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Replace the string categories with integer codes (encoder is refit per column)
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

# Mean-impute missing values in the feature columns (everything except "class")
imputer = SimpleImputer(strategy='mean')
feature_frame = data.drop("class", axis=1)
features_imputed = imputer.fit_transform(feature_frame)

# Rebuild a DataFrame for plotting. The column labels come from the frame that
# was actually imputed: data.columns[:-1] is only correct when "class" happens
# to be the LAST column, otherwise every feature would be mislabelled.
df_plot = pd.DataFrame(features_imputed, columns=feature_frame.columns)
df_plot['class'] = data['class']

# Function to calculate Freedman-Diaconis bin width
def freedman_diaconis_bins(data):
    """Return a histogram bin count for *data* using the Freedman-Diaconis rule.

    The bin width is ``2 * IQR / n ** (1/3)`` and the bin count is the data
    range divided by that width, truncated to an int and floored at 5.

    Args:
        data: Any 1-D numeric array-like (list, NumPy array, pandas Series).
            Non-finite entries (NaN, +/-inf) are ignored.

    Returns:
        int: Number of bins, never less than 5. Returns 5 when no finite
        values remain or when the interquartile range is zero.
    """
    # Work on a flat float array so lists and Series behave alike, and drop
    # non-finite values, which would otherwise turn the width into NaN and
    # make int() raise.
    values = np.asarray(data, dtype=float).ravel()
    values = values[np.isfinite(values)]
    if values.size == 0:
        return 5  # Nothing to measure: fall back to the minimum bin count

    q25, q75 = np.percentile(values, [25, 75])
    iqr = q75 - q25  # Interquartile range
    bin_width = 2 * iqr / (values.size ** (1 / 3))  # Freedman-Diaconis rule
    if bin_width == 0:  # Prevent division by zero
        return 5  # Default minimum bin count
    return max(5, int((values.max() - values.min()) / bin_width))  # At least 5 bins

# Largest Freedman-Diaconis bin count over the feature columns ("class" excluded);
# default=5 keeps max() from raising when there are no feature columns.
num_bins = max(
    (freedman_diaconis_bins(df_plot[col]) for col in df_plot.columns if col != "class"),
    default=5,
)

# Bin edges per feature
feature_bins = {}

# Apply the same bin count to every feature
for feature in df_plot.columns:
    if feature == "class":
        continue  # Skip the target variable

    min_val, max_val = df_plot[feature].min(), df_plot[feature].max()
    # num_bins equal-width bins need num_bins + 1 edges; linspace(min, max, num_bins)
    # produced only num_bins - 1 bins while the title below reported num_bins.
    bins = np.linspace(min_val, max_val, num_bins + 1)
    feature_bins[feature] = np.round(bins, decimals=4)  # Round for cleaner output

    # Plot histogram on the shared edges
    plt.figure(figsize=(8, 5))
    sns.histplot(df_plot[feature], bins=bins, kde=True, color="blue", alpha=0.6)

    # Overlay bin edges as vertical lines
    for bin_edge in bins:
        plt.axvline(bin_edge, color='red', linestyle='dashed', alpha=0.7)

    plt.title(f"Histogram of {feature} with Uniform Bins ({num_bins} bins)")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

# Print extracted bins as ready-to-paste NumPy array literals
for feature, bins in feature_bins.items():
    print(f"{feature}_bins = np.array({bins.tolist()})")  # Convert to np.array format
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
data = pd.read_csv('E:/LHC-CERN/Z_boson.csv', encoding='latin1')

# Cast every non-numeric (object) column to str so each one holds a single type
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Replace the string categories with integer codes (encoder is refit per column)
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

# Mean-impute missing values in the feature columns (everything except "class")
imputer = SimpleImputer(strategy='mean')
feature_frame = data.drop("class", axis=1)
features_imputed = imputer.fit_transform(feature_frame)

# Rebuild a DataFrame for plotting. The column labels come from the frame that
# was actually imputed: data.columns[:-1] is only correct when "class" happens
# to be the LAST column, otherwise every feature would be mislabelled.
df_plot = pd.DataFrame(features_imputed, columns=feature_frame.columns)
df_plot['class'] = data['class']

# Function to calculate the Freedman-Diaconis bin count
def freedman_diaconis_bins(data):
    """Return a histogram bin count from the Freedman-Diaconis rule.

    The bin width is ``2 * IQR / n ** (1/3)`` and the count is the data range
    divided by that width, truncated to an int and never below 5.

    Args:
        data: 1-D array-like of numbers (Series, ndarray or list). NaN and
            infinite entries are ignored.

    Returns:
        int: number of bins, at least 5. 5 is also returned when no finite
        values remain or when the IQR is zero.
    """
    values = np.asarray(data, dtype=float)
    # Non-finite entries would turn the percentiles into NaN and make int() raise.
    values = values[np.isfinite(values)]
    if values.size == 0:
        return 5  # Nothing to measure: fall back to the minimum bin count

    q25, q75 = np.percentile(values, [25, 75])
    iqr = q75 - q25  # Interquartile range
    bin_width = 2 * iqr / (values.size ** (1 / 3))  # Freedman-Diaconis rule
    if bin_width == 0:  # Prevent division by zero
        return 5  # Default minimum bin count
    return max(5, int((values.max() - values.min()) / bin_width))  # At least 5 bins

# Determine max bin count across all features (including class)
num_bins = max(freedman_diaconis_bins(df_plot[col]) for col in df_plot.columns)

# Store uniform bins for each feature
feature_bins = {}

# Apply consistent binning for each feature
for feature in df_plot.columns:
    min_val, max_val = df_plot[feature].min(), df_plot[feature].max()
    # np.linspace returns bin *edges*: num_bins bins need num_bins + 1 edges,
    # otherwise the histogram has one bin fewer than the title reports.
    bins = np.linspace(min_val, max_val, num_bins + 1)
    feature_bins[feature] = np.round(bins, decimals=4)  # Round for cleaner output

    # Plot histogram using the explicit, evenly spaced edges
    plt.figure(figsize=(8, 5))
    sns.histplot(df_plot[feature], bins=bins, kde=True, color="blue", alpha=0.6)

    # Overlay bin edges as vertical lines
    for bin_edge in bins:
        plt.axvline(bin_edge, color='red', linestyle='dashed', alpha=0.7)

    plt.title(f"Histogram of {feature} with Uniform Bins ({num_bins} bins)")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

# Print extracted bins in NumPy array format
for feature, bins in feature_bins.items():
    print(f"{feature}_bins = np.array({bins.tolist()})")


Non linear logistic main

In [None]:
#@title Curve+value extraction
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
data = pd.read_csv('E:/LHC-CERN/Z_boson.csv', encoding='latin1')

# Label-encode every text (object-dtype) column; values are cast to str first
# so missing entries become the literal category 'nan'.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Replace missing feature values with the column mean
feature_frame = data.drop("class", axis=1)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(feature_frame)

# Rebuild a DataFrame for plotting. The labels come from the frame that was
# actually imputed, so they stay correct even when "class" is not the last
# column of the CSV (data.columns[:-1] would mislabel the features then).
df_plot = pd.DataFrame(features_imputed, columns=feature_frame.columns)
df_plot['class'] = data['class']

# Select a subset of features for binning
selected_features = df_plot.columns  # Use all features

# Number of bins (adjust as needed)
num_bins = 7

# Store bin edges for each feature
feature_bins = {}

for feature in selected_features:
    min_val, max_val = df_plot[feature].min(), df_plot[feature].max()

    # num_bins bins need num_bins + 1 evenly spaced edges between min and max
    bins = np.linspace(min_val, max_val, num_bins + 1)

    # Keep four decimals: rounding to whole numbers collapses the edges of
    # narrow-range features onto each other.
    feature_bins[feature] = np.round(bins, decimals=4)

    # Plot the histogram on the same edges that are drawn and printed, so the
    # red markers line up with the actual bar boundaries.
    plt.figure(figsize=(8, 5))
    sns.histplot(df_plot[feature], bins=bins, kde=True, color="blue", alpha=0.6)

    # Overlay bin edges as vertical lines
    for bin_edge in bins:
        plt.axvline(bin_edge, color='red', linestyle='dashed', alpha=0.7)

    plt.title(f"Histogram of {feature} with Bins")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

# Print the rounded bin edges as pasteable NumPy array literals
for feature, bins in feature_bins.items():
    print(f"{feature}_bins = np.array({bins.tolist()})")

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
data = pd.read_csv('E:/LHC-CERN/Z_boson.csv', encoding='latin1')

# Label-encode every text (object-dtype) column; values are cast to str first
# so missing entries become the literal category 'nan'.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Replace missing feature values with the column mean
feature_frame = data.drop("class", axis=1)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(feature_frame)

# Rebuild a DataFrame for plotting. The labels come from the frame that was
# actually imputed, so they stay correct even when "class" is not the last
# column of the CSV (data.columns[:-1] would mislabel the features then).
df_plot = pd.DataFrame(features_imputed, columns=feature_frame.columns)
df_plot['class'] = data['class']

# Function to calculate the Freedman-Diaconis bin count
def freedman_diaconis_bins(data):
    """Return a histogram bin count from the Freedman-Diaconis rule.

    The bin width is ``2 * IQR / n ** (1/3)`` and the count is the data range
    divided by that width, truncated to an int and never below 5.

    Args:
        data: 1-D array-like of numbers (Series, ndarray or list). NaN and
            infinite entries are ignored.

    Returns:
        int: number of bins, at least 5. 5 is also returned when no finite
        values remain or when the IQR is zero.
    """
    values = np.asarray(data, dtype=float)
    # Non-finite entries would turn the percentiles into NaN and make int() raise.
    values = values[np.isfinite(values)]
    if values.size == 0:
        return 5  # Nothing to measure: fall back to the minimum bin count

    q25, q75 = np.percentile(values, [25, 75])
    iqr = q75 - q25  # Interquartile range
    bin_width = 2 * iqr / (values.size ** (1 / 3))  # Freedman-Diaconis rule
    if bin_width == 0:  # Prevent division by zero
        return 5  # Default minimum bin count
    return max(5, int((values.max() - values.min()) / bin_width))  # At least 5 bins

# Determine max bin count across all features
num_bins = max(freedman_diaconis_bins(df_plot[col]) for col in df_plot.columns if col != "class")

# Store uniform bins for each feature
feature_bins = {}

# Apply consistent binning for each feature
for feature in df_plot.columns:
    if feature == "class":
        continue  # Skip the target variable

    min_val, max_val = df_plot[feature].min(), df_plot[feature].max()
    # np.linspace returns bin *edges*: num_bins bins need num_bins + 1 edges,
    # otherwise the histogram has one bin fewer than the title reports.
    bins = np.linspace(min_val, max_val, num_bins + 1)
    feature_bins[feature] = np.round(bins, decimals=4)  # Round for cleaner output

    # Plot histogram using the explicit, evenly spaced edges
    plt.figure(figsize=(8, 5))
    sns.histplot(df_plot[feature], bins=bins, kde=True, color="blue", alpha=0.6)

    # Overlay bin edges as vertical lines
    for bin_edge in bins:
        plt.axvline(bin_edge, color='red', linestyle='dashed', alpha=0.7)

    plt.title(f"Histogram of {feature} with Uniform Bins ({num_bins} bins)")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

# Print the rounded bin edges as pasteable NumPy array literals
for feature, bins in feature_bins.items():
    print(f"{feature}_bins = np.array({bins.tolist()})")


curve+value extraction

In [None]:
# @title Sea-sns-curve
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('E:/LHC-CERN/Z_boson.csv', encoding='latin1')

# Convert categorical columns to string type
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Encode categorical variables if any (the list is empty, so nothing is encoded)
columns_to_encode = []
encoder = LabelEncoder()
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Handle missing values by replacing them with the column mean
feature_frame = data.drop("class", axis=1)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(feature_frame)

# Convert back to a DataFrame. The labels come from the frame that was actually
# imputed, so they stay correct even when "class" is not the last CSV column.
df_plot = pd.DataFrame(features_imputed, columns=feature_frame.columns)
df_plot['class'] = data['class']

# Get all features (every column except the target)
selected_features = df_plot.columns.drop("class")

# Downsample dataset for visualization (keep original data for modeling).
# The sample size is capped at the row count: DataFrame.sample raises when
# asked for more rows than exist (without replacement).
df_sample = df_plot.sample(min(2000, len(df_plot)), random_state=42)

# Plot each feature separately on a new page
for feature in selected_features:
    plt.figure(figsize=(8, 6))  # Create a new figure for each feature

    # Scatter plot of feature vs. class
    sns.scatterplot(x=df_sample[feature], y=df_sample["class"], alpha=0.5)

    plt.xlabel(feature)
    plt.ylabel("Class")
    plt.title(f"{feature} vs Class")

    plt.show()  # Show one figure at a time (separate page)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Load dataset
data = pd.read_csv('E:/LHC-CERN/Z_boson.csv', encoding='latin1')

# Convert categorical columns to string type
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Encode categorical variables if any (the list is empty, so nothing is encoded)
columns_to_encode = []
encoder = LabelEncoder()
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Handle missing values by replacing them with the column mean
feature_frame = data.drop("class", axis=1)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(feature_frame)

# Convert back to a DataFrame. The labels come from the frame that was actually
# imputed, so they stay correct even when "class" is not the last CSV column.
df = pd.DataFrame(features_imputed, columns=feature_frame.columns)
df['class'] = data['class']

# Get all features (every column except the target)
selected_features = df.columns.drop("class")

# Train a separate logistic regression model for each feature and show each plot separately
for feature in selected_features:
    plt.figure(figsize=(8, 6))  # Create a new figure for each feature

    # Extract single feature
    X = df[[feature]]  # Keep it as DataFrame
    y = df["class"]

    # Train Logistic Regression Model
    model = LogisticRegression()
    model.fit(X, y)

    # Evenly spaced grid over the observed range, for a smooth curve. The
    # prediction input is a DataFrame with the training column name so the
    # model does not see a feature-name mismatch between fit and predict.
    grid = np.linspace(X[feature].min(), X[feature].max(), 100)
    X_test = pd.DataFrame({feature: grid})
    y_prob = model.predict_proba(X_test)[:, 1]  # Probability of the second class

    # Scatter plot of actual data
    sns.scatterplot(x=X[feature], y=y, alpha=0.5, label="Actual Data")

    # Plot predicted probability curve
    plt.plot(grid, y_prob, color='red', linestyle="dashed", label="Predicted Probability")

    plt.xlabel(feature)
    plt.ylabel("Class")
    plt.title(f"Prediction for {feature}")
    plt.legend()

    plt.show()  # Show one figure at a time (separate page)


sea-sns-curve

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, r2_score

# Read the CSV file
FILENAME='/kaggle/input/aircrashes-and-fatalities/air_crashes and fatalities-3.csv'
data = pd.read_csv(FILENAME, encoding='latin1')

# One encoder is reused for every column; it is refitted on each iteration,
# so the per-column mapping is captured right after fitting.
label_encoder = LabelEncoder()

# Track encoded columns and their mappings
encoded_mappings = {}

# Encode categorical variables
for column in data.select_dtypes(include=['object']):
    # Display the unique values before encoding
    print(f"Original values in '{column}': {data[column].unique()}")

    # Fit the encoder and transform the column
    data[column] = label_encoder.fit_transform(data[column].astype(str))

    # A class's code is its position in classes_, so the mapping can be read
    # off directly (plain ints keep the printed dict free of NumPy scalar noise).
    encoded_mappings[column] = {cls: code for code, cls in enumerate(label_encoder.classes_)}

    # Display the mapping and the encoded values
    print(f"Mapping for '{column}': {encoded_mappings[column]}")
    print(f"Encoded values in '{column}': {data[column].unique()}\n")

# Define features and target variable
features = data.drop("Fatalities", axis=1)
target = data["Fatalities"]

# Display the first few rows of the dataframe (head(), to match the message)
print("First few rows of the encoded data:")
print(data.head())
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, r2_score

# Read the CSV file
FILENAME = '/kaggle/input/wikipedia-molecule-sorted/wikipedia molecules-sorted.csv'
data = pd.read_csv(FILENAME, encoding='latin1')

# One encoder is reused for every column; it is refitted on each iteration,
# so the per-column mapping is captured right after fitting.
label_encoder = LabelEncoder()

# Track encoded columns and their mappings
encoded_mappings = {}

# Encode categorical variables
for column in data.select_dtypes(include=['object']):
    # Display the unique values before encoding
    print(f"Original values in '{column}': {data[column].unique()}")

    # Fit the encoder and transform the column, adding 1 to each encoded value
    data[column] = label_encoder.fit_transform(data[column].astype(str)) + 1

    # A class's code is its position in classes_; start counting at 1 to
    # mirror the +1 shift applied to the column above.
    encoded_mappings[column] = {cls: code for code, cls in enumerate(label_encoder.classes_, start=1)}

    # Display the mapping and the encoded values
    print(f"Mapping for '{column}': {encoded_mappings[column]}")
    print(f"Encoded values in '{column}': {data[column].unique()}\n")

# Define features and target variable
features = data.drop("Molecule", axis=1)
target = data["Molecule"]

# Display the first few rows of the dataframe (head(), to match the message)
print("First few rows of the encoded data:")
print(data.head())
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error, r2_score

# Read the CSV file
FILENAME = 'E:/MARS-ROVER-RSR/SCIENCE/structural protien synthesis/wikipedia molecules-sorted.csv'
data = pd.read_csv(FILENAME, encoding='latin1')

# One encoder is reused for every column; it is refitted on each iteration,
# so the per-column mapping is captured right after fitting.
label_encoder = LabelEncoder()

# Track encoded columns and their mappings
encoded_mappings = {}

# Encode categorical variables
for column in data.select_dtypes(include=['object']):
    # Display the unique values before encoding
    print(f"Original values in '{column}': {data[column].unique()}")

    # Fit the encoder and transform the column, adding 1 to each encoded value
    data[column] = label_encoder.fit_transform(data[column].astype(str)) + 1

    # A class's code is its position in classes_; start counting at 1 to
    # mirror the +1 shift applied to the column above.
    encoded_mappings[column] = {cls: code for code, cls in enumerate(label_encoder.classes_, start=1)}

    # Display the mapping and the encoded values
    print(f"Mapping for '{column}': {encoded_mappings[column]}")
    print(f"Encoded values in '{column}': {data[column].unique()}\n")

# Define features and target variable
features = data.drop("Molecule", axis=1)
target = data["Molecule"]

# Display the first few rows of the dataframe (head(), to match the message)
print("First few rows of the encoded data:")
print(data.head())

# Save the full encoded data to a new CSV file
output_filename = 'encoded_data1.csv'
data.to_csv(output_filename, index=False)
print(f"Encoded data saved to {output_filename}")

encoded-every-value


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, export_text

# Read the CSV file; abort the run with a non-zero status if it is missing.
try:
    data = pd.read_csv('E:/AIRCRAFT/DAMAGE.csv', encoding='latin1')
except FileNotFoundError:
    print("The file was not found. Please check the file path.")
    # exit() is a helper meant for the interactive shell and reports success
    # (status 0); raise SystemExit explicitly with a failure status instead.
    raise SystemExit(1)
else:
    print("Data loaded successfully.")

# Cast every object-dtype column to str so each column holds a single type
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Columns that are ordinal-encoded (this list includes the target column)
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make',
    'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model',
    'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position',
    'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State',
    'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility',
    'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID',
    'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities',
    'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage',
    'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike',
    'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# Fit one OrdinalEncoder over all listed columns and write the codes back
encoder = OrdinalEncoder()
encoded_values = encoder.fit_transform(data[columns_to_encode])
data[columns_to_encode] = encoded_values
print("Categorical columns encoded.")

# Separate the target column from the feature columns
target = data["Aircraft Damage"]
features = data.drop(columns="Aircraft Damage")

# Fill missing values with the most frequent value: features first ...
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features)
print("Missing values in features handled.")

# ... then the target (the same imputer object is refitted on the target)
target_as_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_as_column).flatten()
print("Missing values in target handled.")

# Hold out 10% of the rows for testing; the seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed,
    target_imputed,
    test_size=0.1,
    random_state=42,
)
print("Data split into training and testing sets.")

# Build and fit the decision tree on the training rows
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(x_train, y_train)
print("Model training completed.")

# Score the held-out rows
y_pred = decision_tree.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy)

# Text dump of the learned rules, labelled with the feature column names
tree_rules = export_text(decision_tree, feature_names=list(features.columns))
print(tree_rules)

# Graphical view of the same tree on a large canvas so node text stays legible
plt.figure(figsize=(40,20))
plot_tree(
    decision_tree,
    filled=True,
    feature_names=features.columns,
    class_names=True,
    fontsize=10,
    proportion=True,
    rounded=True,
)
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, export_text
import numpy as np

# Read the CSV file; abort the run with a non-zero status if it is missing.
try:
    data = pd.read_csv('/content/DAMAGE.csv', encoding='latin1')
except FileNotFoundError:
    print("The file was not found. Please check the file path.")
    # exit() is a helper meant for the interactive shell and reports success
    # (status 0); raise SystemExit explicitly with a failure status instead.
    raise SystemExit(1)
else:
    print("Data loaded successfully.")

# Cast every object-dtype column to str so each column holds a single type
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Columns that are ordinal-encoded (this list includes the target column)
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make',
    'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model',
    'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position',
    'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State',
    'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility',
    'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID',
    'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities',
    'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage',
    'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike',
    'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# Fit one OrdinalEncoder over all listed columns and write the codes back
encoder = OrdinalEncoder()
encoded_values = encoder.fit_transform(data[columns_to_encode])
data[columns_to_encode] = encoded_values
print("Categorical columns encoded.")

# Separate the target column from the feature columns
target = data["Aircraft Damage"]
features = data.drop(columns="Aircraft Damage")

# Fill missing values with the most frequent value: features first ...
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features)
print("Missing values in features handled.")

# ... then the target (the same imputer object is refitted on the target)
target_as_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_as_column).flatten()
print("Missing values in target handled.")

# Generate limited new features using arithmetic operations
def add_limited_arithmetic_features(features):
    """Append pairwise arithmetic combinations of nearby columns.

    Each column ``i`` is paired with the next (up to) four columns ``j``. For
    every pair, four columns are appended in this order: ``i + j``, ``i - j``,
    ``i * j`` and ``i / j`` (the quotient is 0 wherever column ``j`` is 0).

    Args:
        features: 2-D numeric array-like of shape (n_rows, n_cols). It is
            converted to a float array, so integer input is accepted.

    Returns:
        numpy.ndarray: float array holding the original columns followed by
        the derived ones, pairs ordered by ``i`` and then ``j``.
    """
    base = np.asarray(features, dtype=float)
    n_cols = base.shape[1]

    # Collect the pieces and stack once at the end; calling column_stack inside
    # the loop would copy the whole growing matrix for every pair.
    pieces = [base]
    for i in range(n_cols):
        left = base[:, i]
        for j in range(i + 1, min(i + 5, n_cols)):  # Limit to a small number of combinations
            right = base[:, j]
            # Float output buffer: zeros stay where the divisor is 0.
            quotient = np.divide(left, right, out=np.zeros_like(left), where=right != 0)
            pieces.extend((left + right, left - right, left * right, quotient))
    return np.column_stack(pieces)

# Extend the imputed feature matrix with the pairwise arithmetic columns
features_augmented = add_limited_arithmetic_features(features_imputed)
print("Limited arithmetic features added.")

# Hold out 10% of the rows for testing; the seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features_augmented,
    target_imputed,
    test_size=0.1,
    random_state=42,
)
print("Data split into training and testing sets.")

# Build and fit the decision tree on the training rows
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(x_train, y_train)
print("Model training completed.")

# Score the held-out rows
y_pred = decision_tree.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy)

# Names for the derived columns, generated in the same pair order used above:
# each column is paired with up to the next four, four names per pair.
feature_names = features.columns.tolist()
augmented_feature_names = feature_names.copy()
for i, left_name in enumerate(feature_names):
    for right_name in feature_names[i + 1:i + 5]:
        augmented_feature_names.append(f"{left_name} + {right_name}")
        augmented_feature_names.append(f"{left_name} - {right_name}")
        augmented_feature_names.append(f"{left_name} * {right_name}")
        augmented_feature_names.append(f"{left_name} / {right_name}")

# Text dump of the learned rules
tree_rules = export_text(decision_tree, feature_names=augmented_feature_names)
print(tree_rules)

# Graphical view of the same tree on a large canvas so node text stays legible
plt.figure(figsize=(40, 20))
plot_tree(
    decision_tree,
    filled=True,
    feature_names=augmented_feature_names,
    class_names=True,
    fontsize=10,
    proportion=True,
    rounded=True,
)
plt.show()
LN transformation 
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error, r2_score

# Read the CSV file
FILENAME = 'E:/MARS-ROVER-RSR/SCIENCE/structural protien synthesis/wikipedia molecules-sorted.csv'
data = pd.read_csv(FILENAME, encoding='latin1')

# One encoder is reused for every column; it is refitted on each iteration,
# so the per-column mapping is captured right after fitting.
label_encoder = LabelEncoder()

# Track encoded columns and their mappings
encoded_mappings = {}

# Encode categorical variables
for column in data.select_dtypes(include=['object']):
    # Display the unique values before encoding
    print(f"Original values in '{column}': {data[column].unique()}")

    # Fit the encoder and transform the column, adding 1 to each encoded value
    # (codes therefore start at 1, which keeps them valid inputs for np.log).
    data[column] = label_encoder.fit_transform(data[column].astype(str)) + 1

    # A class's code is its position in classes_; start counting at 1 to
    # mirror the +1 shift applied to the column above.
    encoded_mappings[column] = {cls: code for code, cls in enumerate(label_encoder.classes_, start=1)}

    # Display the mapping and the encoded values
    print(f"Mapping for '{column}': {encoded_mappings[column]}")
    print(f"Encoded values in '{column}': {data[column].unique()}\n")

# Apply natural logarithm transformation to the target feature
target_column = "Molecule"  # Update this to match the name of your target feature
# np.log turns zeros into -inf and negatives into NaN; fail loudly instead of
# silently writing those values to the output CSV.
if (data[target_column] <= 0).any():
    raise ValueError(
        f"Cannot log-transform '{target_column}': it contains non-positive values."
    )
data[target_column] = np.log(data[target_column])

# Define features and target variable
features = data.drop(target_column, axis=1)
target = data[target_column]

# Display the first few rows of the dataframe (head(), to match the message)
print("First few rows of the encoded and transformed data:")
print(data.head())

# Save the encoded data with transformed target to a new CSV file
output_filename = 'encoded_log_transformed_data.csv'
data.to_csv(output_filename, index=False)
print(f"Encoded and log-transformed data saved to {output_filename}")


TREE

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the aircraft damage data set
data = pd.read_csv('E:/AIRCRAFT/DAMAGE.csv', encoding='latin1')

# Cast every object-dtype column to str so each column holds a single type
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Columns that are label-encoded (this list includes the target column)
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make',
    'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model',
    'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position',
    'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State',
    'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility',
    'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID',
    'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities',
    'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage',
    'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike',
    'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# A single LabelEncoder is refitted for each listed column in turn
encoder = LabelEncoder()
for column in columns_to_encode:
    encoded_column = encoder.fit_transform(data[column])
    data[column] = encoded_column

# Handle missing values in the features (column mean)
feature_frame = data.drop("Aircraft Damage", axis=1)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(feature_frame)

# Handle missing values in the target variable (the imputer is refitted on it)
target = data["Aircraft Damage"].values
target_imputed = imputer.fit_transform(target.reshape(-1, 1)).flatten()

# Combine features and target into a DataFrame for plotting. The labels come
# from the frame that was imputed: "Aircraft Damage" is not guaranteed to be
# the last column (it sits mid-list in columns_to_encode), and in that case
# data.columns[:-1] shifts every later feature label by one and lets the
# target assignment below overwrite a feature column.
df_plot = pd.DataFrame(features_imputed, columns=feature_frame.columns)

# Add the target variable to the DataFrame (appended as a new last column)
df_plot['Aircraft Damage'] = target_imputed

# Perform bootstrap sampling (with replacement) to increase sample size
bootstrap_sample_size = len(df_plot) * 5  # Adjust multiplier to increase sample size
bootstrap_sample_indices = np.random.choice(df_plot.index, size=bootstrap_sample_size, replace=True)
df_bootstrap = df_plot.iloc[bootstrap_sample_indices]

# Plot each feature against the target variable separately
for feature in df_bootstrap.columns.drop('Aircraft Damage'):  # Exclude the target itself
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=df_bootstrap[feature], y=df_bootstrap['Aircraft Damage'])
    plt.title(f'Scatter Plot of {feature} vs Aircraft Damage')
    plt.xlabel(feature)
    plt.ylabel('Aircraft Damage')
    plt.show()


GRAPH-SNS

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import shap
import os

# Path of the input CSV (os.path.join with a single argument was a no-op)
file_path = "/kaggle/input/air-damage/DAMAGE.csv"

# Columns kept for modelling
selected_columns = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make',
    'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model',
    'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position',
    'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State',
    'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility',
    'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID',
    'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities',
    'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage',
    'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike',
    'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# Read the CSV file and select desired columns. The explicit copy makes the
# column assignments below write to an independent frame instead of a slice
# of the frame returned by read_csv (avoids SettingWithCopyWarning).
data = pd.read_csv(file_path)
data = data[selected_columns].copy()

# Cast every object-dtype column to str so each column holds a single type
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Apply OrdinalEncoder to encode all columns
encoder = OrdinalEncoder()
data_encoded = encoder.fit_transform(data)

# Define features (encoded) and target variable.
# NOTE(review): the target is read from the un-encoded frame, so missing
# target values are not handled here - confirm the column has none.
features = pd.DataFrame(data_encoded, columns=data.columns).drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Handle missing values using SimpleImputer (column mean)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Split the data into training and testing sets (10% test, unseeded)
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Create an instance of the Logistic Regression model
logistic = LogisticRegression()

# Fit the model on the training data
logistic.fit(x_train, y_train)

# Get predictions on the test set
y_pred = logistic.predict(x_test)

# Count how often each class was predicted
value_counts = pd.Series(y_pred).value_counts()

print("Count of Predicted Values:")
print(value_counts)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras import layers

# Location of the aircraft-damage dataset
file_path = "/kaggle/input/air-damage/DAMAGE.csv"

# Columns kept for modelling (the target "Aircraft Damage" is among them)
selected_columns = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region',
    'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation',
    'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity',
    'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage',
    'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# Load the CSV and narrow it down to the selected columns
data = pd.read_csv(file_path, low_memory=False)
data = data[selected_columns]

# Make every object-typed column uniformly string-valued before encoding
for text_column in data.select_dtypes(include=['object']).columns:
    data[text_column] = data[text_column].astype(str)

# Ordinal-encode the whole frame (every column, including the target)
encoder = OrdinalEncoder()
data_encoded = encoder.fit_transform(data)

# Features come from the encoded matrix; the target is the un-encoded column
encoded_frame = pd.DataFrame(data_encoded, columns=data.columns)
features = encoded_frame.drop(columns="Aircraft Damage")
target = data["Aircraft Damage"]

# Fill any remaining gaps in the features with the column mean
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Hold out 10% of the rows for testing (split is not seeded)
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed,
    target,
    test_size=0.1,
)

# Build a simple neural network.
# The input shape is declared with an explicit Input layer (as the other ANN
# cells in this file do) instead of the `input_shape=` keyword on the first
# Dense layer, which newer Keras releases warn about.
model = tf.keras.Sequential([
    layers.Input(shape=(features_imputed.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # single probability output
])

# Compile the model for binary classification
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model; 10% of the training rows are held out for validation
history = model.fit(x_train, y_train, epochs=10, batch_size=64, validation_split=0.1)

# Evaluate the model on test data
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test accuracy:', test_acc)

# Get predictions on the test set: round each probability to 0 or 1
y_pred_proba = model.predict(x_test)
y_pred = np.round(y_pred_proba).flatten()

# Count the predicted values
value_counts = pd.Series(y_pred).value_counts()

# Share of each predicted class; .get(..., 0) covers a class that was never predicted
total_predictions = len(y_pred)
percentage_0 = (value_counts.get(0, 0) / total_predictions) * 100
percentage_1 = (value_counts.get(1, 0) / total_predictions) * 100

print("Percentage of Predicted Values:")
print("Class 0: {:.2f}%".format(percentage_0))
print("Class 1: {:.2f}%".format(percentage_1))

TENSOR-PREDICTION-PROB

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

# Load the di-electron dataset
file_path = '/kaggle/input/di-electron/DI-ELECTRON-2.csv'
data = pd.read_csv(file_path, encoding='latin1')

# 'M' is treated as the target, so it is excluded from the clustering inputs
features = data.drop(columns="M")

# Fit a Gaussian mixture with k components and assign each row to a component
k = 20  # Number of clusters
gmm = GaussianMixture(n_components=k, random_state=42)
clusters = gmm.fit_predict(features)

# Attach the component label to every row of the original frame
data['Cluster'] = clusters

# Per-cluster mean of every column
cluster_means = data.groupby('Cluster').mean()

# Report the mean profile of each cluster (numbered from 1 in display order)
for position, (_, mean_row) in enumerate(cluster_means.iterrows(), start=1):
    print(f"Cluster {position}:\n{mean_row}\n")

# Plot characteristics of each cluster: one row per cluster, one column per feature
num_clusters = len(cluster_means)
num_features = len(features.columns)

# squeeze=False keeps `axs` two-dimensional even when there is a single row or
# column, so axs[row, col] below is always valid. The figure is never smaller
# than the original 20x10 inches but grows with the grid so that each panel
# stays readable.
fig, axs = plt.subplots(
    num_clusters,
    num_features,
    figsize=(max(20, 2 * num_features), max(10, 2 * num_clusters)),
    squeeze=False,
)

for i, (cluster_idx, cluster_mean) in enumerate(cluster_means.iterrows(), 1):
    for j, feature in enumerate(features.columns):
        ax = axs[i - 1, j]
        # One bar per panel: the cluster's mean value for this feature
        ax.bar(x=feature, height=cluster_mean[feature])
        ax.set_title(f"Cluster {i}")

plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the di-electron dataset
file_path = '/kaggle/input/di-electron/DI-ELECTRON-2.csv'
data = pd.read_csv(file_path, encoding='latin1')

# 'M' is treated as the target, so it is excluded from the clustering inputs
features = data.drop(columns="M")

# Partition the rows into k groups with k-means
k = 20  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(features)

# Attach the cluster label to every row of the original frame
data['Cluster'] = clusters

# Per-cluster mean of every column
cluster_means = data.groupby('Cluster').mean()

# Report the mean profile of each cluster (numbered from 1 in display order)
for position, (_, mean_row) in enumerate(cluster_means.iterrows(), start=1):
    print(f"Cluster {position}:\n{mean_row}\n")

# Plot characteristics of each cluster: one row per cluster, one column per feature
num_clusters = len(cluster_means)
num_features = len(features.columns)

# squeeze=False keeps `axs` two-dimensional even when there is a single row or
# column, so axs[row, col] below is always valid. The figure is never smaller
# than the original 20x10 inches but grows with the grid so that each panel
# stays readable.
fig, axs = plt.subplots(
    num_clusters,
    num_features,
    figsize=(max(20, 2 * num_features), max(10, 2 * num_clusters)),
    squeeze=False,
)

for i, (cluster_idx, cluster_mean) in enumerate(cluster_means.iterrows(), 1):
    for j, feature in enumerate(features.columns):
        ax = axs[i - 1, j]
        # One bar per panel: the cluster's mean value for this feature
        ax.bar(x=feature, height=cluster_mean[feature])
        ax.set_title(f"Cluster {i}")

plt.tight_layout()
plt.show()

CLUSTER

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the di-electron dataset
file_path = "/kaggle/input/di-electron/DI-ELECTRON-2.csv"
data = pd.read_csv(file_path, encoding='latin1')

# Make every object-typed column uniformly string-valued
for text_column in data.select_dtypes(include=['object']).columns:
    data[text_column] = data[text_column].astype(str)

# Map each distinct "Run" value to an integer code and store it alongside
label_encoder = LabelEncoder()
run_codes = label_encoder.fit_transform(data['Run'])
data['Run_encoded'] = run_codes

# Show the original and encoded "Run" values side by side
print(data[['Run', 'Run_encoded']].head())

# Look up the encoded value for one example row
row_index = 0  # Example row index
encoded_value = data.at[row_index, 'Run_encoded']

print(f"Label Encoded Value of 'Run' at Row {row_index}: {encoded_value}")
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Read the CSV file
file_path = "/kaggle/input/di-electron/DI-ELECTRON-2.csv"
data = pd.read_csv(file_path, encoding='latin1')

# Convert non-numeric columns to string type
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Initialize OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

# Encode the "Run" column. OrdinalEncoder takes 2-D input (hence the double
# brackets) and returns an (n_rows, 1) array; flatten it to 1-D so the
# assignment to a single column is unambiguous.
data['Run_encoded'] = ordinal_encoder.fit_transform(data[['Run']]).ravel()

# Display the encoded values of the "Run" column
print(data[['Run', 'Run_encoded']].head())

# Specify the row index
row_index = 0  # Example row index

# Retrieve the encoded value of "Run" at the specified row index
encoded_value = data.at[row_index, 'Run_encoded']

# Display the encoded value
print(f"Label Encoded Value of 'Run' at Row {row_index}: {encoded_value}")


LABEL-ENCODED

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
data = pd.read_csv('E:/AIRCRAFT/DAMAGE.csv', encoding='latin1')

# Convert non-numeric columns to string type so each column is uniformly typed
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode (the target "Aircraft Damage" is among them)
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region',
    'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation',
    'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity',
    'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage',
    'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# Initialize LabelEncoder
encoder = LabelEncoder()

# Replace every listed column by integer codes. The single encoder is refit per
# column, so afterwards it only remembers the classes of the last column.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Display the transformed dataframe
print(data.head())

# Handle missing values in the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# The target was label-encoded above, so it holds integer codes and has nothing
# to impute. Convert it to a float array directly rather than refitting the
# feature imputer on it, which would discard the statistics learned from the
# features. The resulting values, dtype and shape match a no-op imputation.
target_imputed = target.to_numpy(dtype=float)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1, random_state=42)

# Apply polynomial features transformation. The transformer is fitted on the
# training split only; the test split is merely transformed with it.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Create an instance of the LinearRegression model
regressor = LinearRegression()

# Fit the model on the polynomial features
regressor.fit(x_train_poly, y_train)

# Predict on test data
y_pred = regressor.predict(x_test_poly)

# Binarize the predicted and actual values at 0.5 so the regression output can
# be scored like a classifier
y_pred_binary = (y_pred > 0.5).astype(int)
y_test_binary = (y_test > 0.5).astype(int)

# Compute the confusion matrix (rows: actual class, columns: predicted class)
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])  # Provide labels for clarity
conf_matrix_df = pd.DataFrame(conf_matrix, columns=['Predicted 0', 'Predicted 1'], index=['Actual 0', 'Actual 1'])
print("Confusion Matrix:")
print(conf_matrix_df)

# Calculate accuracy
accuracy = accuracy_score(y_test_binary, y_pred_binary)
print("\nAccuracy:", accuracy)

# Print detailed analysis for each feature
print("\nDetailed Analysis of Confusion Matrix by Feature Impact:")

# Fit one single-feature degree-2 polynomial regression per feature and report
# its confusion matrix
for feature in features.columns:
    # Train-test split for this feature alone (same seed as the full-model split)
    x_feat_train, x_feat_test, y_feat_train, y_feat_test = train_test_split(
        features[[feature]].values, target, test_size=0.1, random_state=42)

    # Use a fresh transformer and model per feature so the module-level `poly`
    # and `regressor` fitted on the full feature set are not overwritten.
    feat_poly = PolynomialFeatures(degree=2, include_bias=False)
    x_feat_train_poly = feat_poly.fit_transform(x_feat_train)
    x_feat_test_poly = feat_poly.transform(x_feat_test)

    # Fit the model on the polynomial features
    feat_regressor = LinearRegression()
    feat_regressor.fit(x_feat_train_poly, y_feat_train)

    # Predict on test data
    y_feat_pred = feat_regressor.predict(x_feat_test_poly)

    # Binarize the predicted and actual values
    y_feat_pred_binary = (y_feat_pred > 0.5).astype(int)
    y_feat_test_binary = (y_feat_test > 0.5).astype(int)

    # Compute confusion matrix for this feature
    conf_matrix_feat = confusion_matrix(y_feat_test_binary, y_feat_pred_binary, labels=[0, 1])

    # Print feature header
    print(f"\nFeature: {feature}")
    print("Confusion Matrix:")
    print(pd.DataFrame(conf_matrix_feat, columns=['Predicted 0', 'Predicted 1'], index=['Actual 0', 'Actual 1']))

    # Print interpretation of confusion matrix for this feature
    print("Interpretation of Confusion Matrix:")
    print("True Negative (TN): Predicted 0 (No Damage) and Actual 0 (No Damage)")
    print("False Positive (FP): Predicted 1 (Damage) but Actual 0 (No Damage)")
    print("False Negative (FN): Predicted 0 (No Damage) but Actual 1 (Damage)")
    print("True Positive (TP): Predicted 1 (Damage) and Actual 1 (Damage)")

    # Plot confusion matrix for this feature
    plt.figure(figsize=(6, 4))
    sns.heatmap(conf_matrix_feat, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
    plt.title(f'Confusion Matrix for Feature: {feature}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

CONFUSION-MATRIX

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
import matplotlib.pyplot as plt

# Read the CSV file
file_name='/kaggle/input/air-damage/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Convert non-numeric columns to string type so each column is uniformly typed
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode (the target "Aircraft Damage" is among them)
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region',
    'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation',
    'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity',
    'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage',
    'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# Initialize LabelEncoder
encoder = LabelEncoder()

# Replace every listed column by integer codes. The single encoder is refit per
# column, so afterwards it only remembers the classes of the last column.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Convert target variable to binary (0 and 1): any code above 0 counts as 1
target_binary = (target > 0).astype(int)  # Example: 0 if no damage, 1 if damage

# Handle missing values in the features
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# target_binary is the result of a comparison cast to int, so it cannot hold
# missing values. Convert it to a float array directly rather than refitting
# the feature imputer on it, which would discard the statistics learned from
# the features. Values, dtype and shape match a no-op imputation.
target_imputed = target_binary.to_numpy(dtype=float)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1, random_state=42)

# Standardize the data: statistics come from the training split only
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Feed-forward binary classifier: three ReLU hidden layers and a sigmoid output
hidden_units = (64, 32, 16)
model = keras.Sequential(
    [layers.Input(shape=(x_train.shape[1],))]
    + [layers.Dense(units, activation='relu') for units in hidden_units]
    + [layers.Dense(1, activation='sigmoid')]
)

# Adam optimizer with binary cross-entropy, tracking accuracy
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit for 100 epochs; 20% of the training rows are held out for validation
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    verbose=1,
)

# Threshold the predicted probabilities at 0.5 and score them on the test split
test_probabilities = model.predict(x_test)
y_pred = (test_probabilities > 0.5).astype(int).flatten()
accuracy = accuracy_score(y_test, y_pred)

print(f"ANN Accuracy: {accuracy}")

# Fit a decision tree to the same (standardized) training data
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)

# Predict with the decision tree
y_tree_pred = tree.predict(x_test)
accuracy_tree = accuracy_score(y_test, y_tree_pred)

print(f"Decision Tree Accuracy: {accuracy_tree}")

# Plot the tree
from sklearn.tree import plot_tree

# Pass the names as a plain list to both helpers: some scikit-learn releases
# validate `feature_names` strictly and reject a pandas Index.
feature_names = list(features.columns)

plt.figure(figsize=(20, 10))
plot_tree(tree, filled=True, feature_names=feature_names)
plt.show()

# Print the decision tree rules as text. The tree was fitted on scaled inputs,
# so the printed thresholds are in standardized units, not original units.
tree_rules = export_text(tree, feature_names=feature_names)
print(tree_rules)

BINARY-SEARCH

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, _tree
import matplotlib.pyplot as plt

# Read the CSV file
data = pd.read_csv('/content/DAMAGE.csv', encoding='latin1')

# Convert non-numeric columns to string type so each column is uniformly typed
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode (the target "Aircraft Damage" is among them)
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region',
    'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation',
    'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity',
    'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage',
    'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# Initialize LabelEncoder
encoder = LabelEncoder()

# Replace every listed column by integer codes. The single encoder is refit per
# column, so afterwards it only remembers the classes of the last column.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Convert target variable to binary (0 and 1): any code above 0 counts as 1
target_binary = (target > 0).astype(int)  # Example: 0 if no damage, 1 if damage

# Handle missing values in the features
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# target_binary is the result of a comparison cast to int, so it cannot hold
# missing values. Convert it to a float array directly rather than refitting
# the feature imputer on it, which would discard the statistics learned from
# the features. Values, dtype and shape match a no-op imputation.
target_imputed = target_binary.to_numpy(dtype=float)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1, random_state=42)

# Standardize the data: statistics come from the training split only
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Feed-forward binary classifier: three ReLU hidden layers and a sigmoid output
hidden_units = (64, 32, 16)
model = keras.Sequential(
    [layers.Input(shape=(x_train.shape[1],))]
    + [layers.Dense(units, activation='relu') for units in hidden_units]
    + [layers.Dense(1, activation='sigmoid')]
)

# Adam optimizer with binary cross-entropy, tracking accuracy
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit for 100 epochs; 20% of the training rows are held out for validation
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    verbose=1,
)

# Threshold the predicted probabilities at 0.5 and score them on the test split
test_probabilities = model.predict(x_test)
y_pred = (test_probabilities > 0.5).astype(int).flatten()
accuracy = accuracy_score(y_test, y_pred)

print(f"ANN Accuracy: {accuracy}")

# Train a decision tree on the same standardized training split
tree = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# DFS function to predict using the decision tree
def dfs_predict(tree, sample):
    """Classify one sample by walking a fitted decision tree from root to leaf.

    Args:
        tree: Fitted scikit-learn tree estimator; only its ``tree_`` arrays
            (``children_left``, ``children_right``, ``feature``, ``threshold``
            and ``value``) are read.
        sample: One-dimensional sequence of feature values, indexed by the
            feature indices stored in the tree.

    Returns:
        The position of the largest entry in the reached leaf's ``value``
        array, i.e. the predicted class index (0 or 1 for a binary tree).
    """
    structure = tree.tree_
    left = structure.children_left
    right = structure.children_right

    node = 0
    # A leaf stores the same sentinel id for both children, while a split node
    # always has two distinct children. This is the leaf test shown in the
    # scikit-learn documentation and avoids the private `_tree` module.
    while left[node] != right[node]:
        # scikit-learn converts inputs to float32 before comparing them with
        # the thresholds; do the same so the walk agrees with tree.predict.
        if np.float32(sample[structure.feature[node]]) <= structure.threshold[node]:
            node = left[node]
        else:
            node = right[node]
    return structure.value[node].argmax()

# plot_tree and export_text are used below but are not part of this cell's
# import list; import them here so the cell runs on its own.
from sklearn.tree import export_text, plot_tree

# Predict with DFS for each test sample
dfs_predictions = [dfs_predict(tree, x) for x in x_test]

# Calculate accuracy for DFS predictions
accuracy_dfs = accuracy_score(y_test, dfs_predictions)

print(f"DFS Decision Tree Accuracy: {accuracy_dfs}")

# Pass the names as a plain list to both helpers: some scikit-learn releases
# validate `feature_names` strictly and reject a pandas Index.
feature_names = list(features.columns)

# Plot the tree
plt.figure(figsize=(20, 10))
plot_tree(tree, filled=True, feature_names=feature_names)
plt.show()

# Print the decision tree rules as text. The tree was fitted on scaled inputs,
# so the printed thresholds are in standardized units, not original units.
tree_rules = export_text(tree, feature_names=feature_names)
print(tree_rules)


DFS

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, _tree, plot_tree, export_text
import matplotlib.pyplot as plt

# Read the CSV file
file_name='/kaggle/input/air-damage/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Convert non-numeric columns to string type so each column is uniformly typed
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode (the target "Aircraft Damage" is among them)
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region',
    'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation',
    'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity',
    'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage',
    'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# Initialize LabelEncoder
encoder = LabelEncoder()

# Replace every listed column by integer codes. The single encoder is refit per
# column, so afterwards it only remembers the classes of the last column.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Convert target variable to binary (0 and 1): any code above 0 counts as 1
target_binary = (target > 0).astype(int)  # Example: 0 if no damage, 1 if damage

# Handle missing values in the features
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# target_binary is the result of a comparison cast to int, so it cannot hold
# missing values. Convert it to a float array directly rather than refitting
# the feature imputer on it, which would discard the statistics learned from
# the features. Values, dtype and shape match a no-op imputation.
target_imputed = target_binary.to_numpy(dtype=float)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1, random_state=42)

# Standardize the data: statistics come from the training split only
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Feed-forward binary classifier: three ReLU hidden layers and a sigmoid output
hidden_units = (64, 32, 16)
model = keras.Sequential(
    [layers.Input(shape=(x_train.shape[1],))]
    + [layers.Dense(units, activation='relu') for units in hidden_units]
    + [layers.Dense(1, activation='sigmoid')]
)

# Adam optimizer with binary cross-entropy, tracking accuracy
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit for 100 epochs; 20% of the training rows are held out for validation
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    verbose=1,
)

# Threshold the predicted probabilities at 0.5 and score them on the test split
test_probabilities = model.predict(x_test)
y_pred = (test_probabilities > 0.5).astype(int).flatten()
accuracy = accuracy_score(y_test, y_pred)

print(f"ANN Accuracy: {accuracy}")

# Train a decision tree on the same standardized training split
tree = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# BFS function to predict using the decision tree
def bfs_predict(tree, sample):
    """Classify one sample by traversing a fitted decision tree with a queue.

    Only the child selected by the split test is enqueued at each step, so the
    queue never holds more than one node and the traversal follows a single
    root-to-leaf path.

    Args:
        tree: Fitted scikit-learn tree estimator; only its ``tree_`` arrays
            (``children_left``, ``children_right``, ``feature``, ``threshold``
            and ``value``) are read.
        sample: One-dimensional sequence of feature values, indexed by the
            feature indices stored in the tree.

    Returns:
        The position of the largest entry in the reached leaf's ``value``
        array, i.e. the predicted class index (0 or 1 for a binary tree).
    """
    from collections import deque

    structure = tree.tree_
    left = structure.children_left
    right = structure.children_right

    queue = deque([0])  # Start with the root node
    while queue:
        node = queue.popleft()  # O(1) dequeue of the first element
        # A leaf stores the same sentinel id for both children, while a split
        # node always has two distinct children. This is the leaf test shown in
        # the scikit-learn documentation and avoids the private `_tree` module.
        if left[node] == right[node]:
            return structure.value[node].argmax()
        # scikit-learn converts inputs to float32 before comparing them with
        # the thresholds; do the same so the walk agrees with tree.predict.
        if np.float32(sample[structure.feature[node]]) <= structure.threshold[node]:
            queue.append(left[node])  # Go left
        else:
            queue.append(right[node])  # Go right

# Predict with BFS for each test sample
bfs_predictions = [bfs_predict(tree, x) for x in x_test]

# Calculate accuracy for BFS predictions
accuracy_bfs = accuracy_score(y_test, bfs_predictions)

print(f"BFS Decision Tree Accuracy: {accuracy_bfs}")

# Pass the names as a plain list to both helpers: some scikit-learn releases
# validate `feature_names` strictly and reject a pandas Index.
feature_names = list(features.columns)

# Plot the tree
plt.figure(figsize=(20, 10))
plot_tree(tree, filled=True, feature_names=feature_names)
plt.show()

# Print the decision tree rules as text. The tree was fitted on scaled inputs,
# so the printed thresholds are in standardized units, not original units.
tree_rules = export_text(tree, feature_names=feature_names)
print(tree_rules)

BFS

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Load the aircraft-damage dataset
file_name = '/kaggle/input/air-damage/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Make every object-typed column uniformly string-valued before encoding
for text_column in data.select_dtypes(include=['object']).columns:
    data[text_column] = data[text_column].astype(str)

# Columns that are replaced by integer codes (the target is among them)
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
    'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region',
    'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation',
    'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity',
    'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage',
    'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage',
    'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage',
    'Other Strike', 'Other Damage',
]

# One encoder instance, refit on each listed column in turn
encoder = LabelEncoder()
for column in columns_to_encode:
    encoded_values = encoder.fit_transform(data[column])
    data[column] = encoded_values

# Separate the inputs from the column to predict
features = data.drop(columns="Aircraft Damage")
target = data["Aircraft Damage"]

# Binary label: 1 where the encoded target is above 0, otherwise 0
target_binary = target.gt(0).astype(int)

# Fill gaps in the features with the column median
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# Seeded 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed,
    target_binary,
    test_size=0.1,
    random_state=42,
)

# Standardize with statistics learned from the training split only
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Per-feature mean of the training rows in each class
positive_means = np.mean(x_train[y_train == 1], axis=0)
negative_means = np.mean(x_train[y_train == 0], axis=0)

# One rule per feature: the threshold is the midpoint between the two class
# means, and the operator points towards the side of the positive-class mean
rules = [
    (
        i,
        '>' if positive_means[i] > negative_means[i] else '<',
        (positive_means[i] + negative_means[i]) / 2,
    )
    for i in range(x_train.shape[1])
]

# Majority vote over per-feature threshold rules
def greedy_predict(sample, rules):
    """Return 1 when more than half of the rules hold for the sample, else 0.

    Each rule is a ``(feature_index, operator, threshold)`` triple. A rule with
    operator ``'>'`` holds when the feature value is strictly above the
    threshold; any other operator is treated as strictly below.
    """
    satisfied = sum(
        1
        for idx, op, cutoff in rules
        if (sample[idx] > cutoff if op == '>' else sample[idx] < cutoff)
    )
    return int(satisfied > len(rules) / 2)

# Predict with the greedy algorithm for each test sample
greedy_predictions = [greedy_predict(x, rules) for x in x_test]

# Calculate accuracy for greedy predictions
accuracy_greedy = accuracy_score(y_test, greedy_predictions)

print(f"Greedy Algorithm Accuracy: {accuracy_greedy}")

# Print the rules. Rule indices refer to the columns of `features` (the frame
# without "Aircraft Damage"), so names must be looked up there; `data.columns`
# still contains the target and is shifted by one from that column onwards.
for feature_index, operator, threshold in rules:
    feature_name = features.columns[feature_index]
    print(f"Feature {feature_name} {operator} {threshold}")


GREEDY

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import shap
import tempfile
import webbrowser

# Load the strike records (latin1-encoded CSV) and keep the modelling columns
file_path = "/kaggle/input/air-damage/DAMAGE.csv"
data = pd.read_csv(file_path,encoding='latin1')
data = data[[
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase',
    'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries',
    'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage',
    'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage',
]]

# Cast each object-typed column to str (missing values become the string
# 'nan') and replace it with integer codes in a single pass
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Integer-encode every column of the frame, numeric ones included
data_encoded = data.apply(label_encoder.fit_transform)

# "Aircraft Damage" is the value to predict; everything else is an input
target = data_encoded["Aircraft Damage"]
features = data_encoded.drop("Aircraft Damage", axis=1)

# Mean-impute any gaps left in the inputs
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Unseeded 90/10 split, so the partition differs from run to run
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Degree-1 polynomial expansion with no bias column; fitted on the training
# split and reused for the test split
poly = PolynomialFeatures(degree=1, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Names of the expanded columns, derived from the original column names
feature_names = poly.get_feature_names_out(features.columns)

# Ordinary least-squares fit on the expanded training features
regressor = LinearRegression()
regressor.fit(x_train_poly, y_train)

# Model output on both splits
y_train_pred = regressor.predict(x_train_poly)
y_test_pred = regressor.predict(x_test_poly)

# SHAP explainer built from the fitted model with the training data as
# background, then evaluated on the test rows
explainer = shap.Explainer(regressor, x_train_poly)
shap_values = explainer.shap_values(x_test_poly)

# Sanity check: the test matrix width should match the number of names
print("Shape of x_test_poly:", x_test_poly.shape)
print("Length of feature_names:", len(feature_names))

# List every feature name next to its column position
for idx, name in enumerate(feature_names):
    print(f"Feature {idx}: {name}")

# Render the SHAP summary plot to an image file and open it in a browser.
# shap.summary_plot draws onto the current matplotlib figure rather than
# producing an HTML object, so with show=False the figure is left open and
# is written out here with plt.savefig.
try:
    shap.summary_plot(shap_values, features=x_test_poly, feature_names=feature_names, show=False)

    # Reserve a temporary file name; delete=False keeps the file around for
    # the viewer, and the handle is closed before matplotlib writes to it.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
        temp_plot_file = f.name

    plt.savefig(temp_plot_file, bbox_inches='tight')
    plt.close()

    # Open the saved plot in a web browser
    webbrowser.open('file://' + temp_plot_file)
except Exception as e:
    # Best effort: plotting problems are reported but must not stop the cell
    print("Error generating SHAP summary plot:", str(e))

# Print a subset of feature names
print("Printing a subset of feature names:")
for idx, name in enumerate(feature_names[:10]):  # Print first 10 feature names
    print(f"Feature {idx}: {name}")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the strike records (latin1-encoded CSV) and keep the modelling columns
file_path = "/kaggle/input/air-damage/DAMAGE.csv"
data = pd.read_csv(file_path,encoding='latin1')
data = data[[
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase',
    'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries',
    'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage',
    'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage',
]]

# Cast each object-typed column to str (missing values become the string
# 'nan') and replace it with integer codes in a single pass
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column].astype(str))
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import numpy as np
# No file is read here: this section keeps working on the `data` frame that
# was loaded and encoded above.

# Cast any remaining object-typed columns to str
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)
# Integer-encode any remaining object-typed columns
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

# Every column of the selected frame, numeric ones included
columns_to_encode = ['Record ID','Incident Year','Incident Month','Incident Day','Operator ID','Operator','Aircraft','Aircraft Type','Aircraft Make','Aircraft Model','Aircraft Mass','Engine Make','Engine Model','Engines','Engine Type','Engine1 Position','Engine2 Position','Engine3 Position','Engine4 Position','Airport ID','Airport','State','FAA Region','Warning Issued','Flight Phase','Visibility','Precipitation','Height','Speed','Distance','Species ID','Species Name','Species Quantity','Flight Impact','Fatalities','Injuries','Aircraft Damage','Radome Strike','Radome Damage','Windshield Strike','Windshield Damage','Nose Strike','Nose Damage','Engine1 Strike','Engine1 Damage','Engine2 Strike','Engine2 Damage','Engine3 Strike','Engine3 Damage','Engine4 Strike','Engine4 Damage','Engine Ingested','Propeller Strike','Propeller Damage','Wing or Rotor Strike','Wing or Rotor Damage','Fuselage Strike','Fuselage Damage','Landing Gear Strike','Landing Gear Damage','Tail Strike','Tail Damage','Lights Strike','Lights Damage','Other Strike','Other Damage']

# A single encoder instance, refitted on each column in turn
encoder = LabelEncoder()

# Replace every listed column with integer codes
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# First definition of features/target, taken from `data`.
# Both names are rebound from `data_encoded` at the end of this section.
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Display the transformed dataframe
print(data.head())
# Encode every column once more into a separate frame
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)
# Re-encode the first six columns of `data` in place. `data_encoded` was
# already built above, so these assignments do not change it.
# NOTE(review): this looks redundant with the loop over columns_to_encode — confirm.
label_encoder = LabelEncoder()
data["Record ID"] = label_encoder.fit_transform(data["Record ID"])
data["Incident Year"] = label_encoder.fit_transform(data["Incident Year"])
data["Incident Month"] = label_encoder.fit_transform(data["Incident Month"])
data["Incident Day"] = label_encoder.fit_transform(data["Incident Day"])
data["Operator ID"] = label_encoder.fit_transform(data["Operator ID"])
data["Operator"] = label_encoder.fit_transform(data["Operator"])

# (the remaining columns are not re-encoded one by one here)
# Final features/target used by the modelling code below, taken from data_encoded
features = data_encoded.drop("Aircraft Damage", axis=1)
target = data_encoded["Aircraft Damage"]

# Mean-impute any gaps left in the inputs
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Unseeded 90/10 split, so the partition differs from run to run
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Degree-2 polynomial expansion with no bias column; fitted on the training
# split and reused for the test split
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Ordinary least-squares fit on the expanded training features
regressor = LinearRegression()
regressor.fit(x_train_poly, y_train)

# Predicted 'Aircraft Damage' for both splits
y_train_pred = regressor.predict(x_train_poly)
y_test_pred = regressor.predict(x_test_poly)

# Column names of the un-expanded inputs, used as x-axis labels
column_names = features.columns

# One figure per input column: predicted damage against that column's values,
# training rows in blue and test rows in red
for column_index, column in enumerate(column_names):
    for inputs, predictions, colour, series_label in (
        (x_train, y_train_pred, 'blue', 'Training Data'),
        (x_test, y_test_pred, 'red', 'Testing Data'),
    ):
        plt.scatter(inputs[:, column_index], predictions, color=colour, label=series_label)

    plt.xlabel(column)
    plt.ylabel('Predicted Aircraft Damage')
    plt.title(f'Predicted Aircraft Damage vs {column}')
    plt.legend()
    plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from gplearn.genetic import SymbolicRegressor
from gplearn.functions import make_function
from sklearn.metrics import mean_squared_error

# Load the strike records (latin1-encoded CSV) and keep the modelling columns
file_path = "/kaggle/input/air-damage/DAMAGE.csv"
data = pd.read_csv(file_path, encoding='latin1')
data = data[[
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase',
    'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries',
    'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage',
    'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage',
]]

# Cast each object-typed column to str (missing values become the string
# 'nan') and replace it with integer codes in a single pass
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# "Aircraft Damage" is the value to predict; everything else is an input
target = data["Aircraft Damage"]
features = data.drop("Aircraft Damage", axis=1)

# Mean-impute missing values in the inputs
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Seeded 90/10 split so the partition is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target, test_size=0.1, random_state=42
)

# Define improved custom functions for genetic programming
def safe_log(x):
    """Return the natural log of *x* with inputs floored at 1e-10.

    Anything below 1e-10 (zero and negative values included) is replaced by
    1e-10 before the log is taken, so finite input gives a finite result.
    """
    floored = np.maximum(x, 1e-10)
    return np.log(floored)

def safe_exp(x):
    """Return ``exp(x)`` element-wise, capped at 1e100 for non-negative input.

    Negative inputs use ``expm1(x) + 1``; non-negative inputs use ``exp(x)``
    limited to at most 1e100.
    """
    below_zero = np.expm1(x) + 1
    capped = np.minimum(np.exp(x), 1e100)
    return np.where(x < 0, below_zero, capped)

def safe_pow(x, y):
    """Return a protected ``x ** y`` that is finite for every input.

    Element-wise rules:

    * exponent is (numerically) an integer and the base is negative:
      the true signed power ``x ** round(y)``;
    * exponent is an integer and the base is non-negative: ``|x| ** y``;
    * otherwise: ``exp(log(max(|x|, 1e-10)) * y)``, i.e. a power of the
      magnitude with the base floored at 1e-10.

    Results that are already finite are returned unchanged. NaN becomes 0
    and +/-inf becomes the largest finite float of matching sign, so cases
    such as ``0 ** -1`` or an overflowing power no longer leak inf/NaN.

    Parameters
    ----------
    x, y : array-like
        Bases and exponents; converted to float arrays and broadcast.

    Returns
    -------
    numpy.ndarray
        Finite float results with the broadcast shape of the inputs.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    magnitude = np.abs(x)

    # Overflow, division by zero and invalid values are expected on some
    # branches; they are repaired below instead of warned about.
    with np.errstate(all='ignore'):
        # Keep the rounded exponent as a float: casting NaN/inf to int is invalid
        y_rounded = np.round(y)
        is_int = np.isclose(y, y_rounded)

        signed_power = np.sign(x) ** y_rounded * magnitude ** y_rounded
        int_result = np.where(x < 0, signed_power, np.power(magnitude, y))

        # Log is inlined so this function does not depend on what the
        # module-level name `safe_log` happens to be bound to.
        float_result = np.exp(np.log(np.maximum(magnitude, 1e-10)) * y)

        result = np.where(is_int, int_result, float_result)

    largest = np.finfo(float).max
    return np.nan_to_num(result, nan=0.0, posinf=largest, neginf=-largest)

def safe_sqrt(x):
    """Return the square root of the absolute value of *x*, element-wise."""
    magnitude = np.abs(x)
    return np.sqrt(magnitude)

def safe_abs(x):
    """Return the element-wise absolute value of *x*."""
    return np.absolute(x)

# Wrap each protected helper with gplearn's make_function, giving it the
# display name used in printed programs and its argument count (arity).
# NOTE: every assignment rebinds the helper's own name to whatever
# make_function returns, so any code that refers to these names after this
# point gets the wrapper object, not the plain Python function defined above.
safe_log = make_function(function=safe_log, name='log', arity=1)
safe_exp = make_function(function=safe_exp, name='exp', arity=1)
safe_pow = make_function(function=safe_pow, name='pow', arity=2)
safe_sqrt = make_function(function=safe_sqrt, name='sqrt', arity=1)
safe_abs = make_function(function=safe_abs, name='abs', arity=1)

# Configure the symbolic regressor.
# The four p_* probabilities below add up to 0.95.
# random_state=42 fixes the seed; verbose=1 turns on progress output.
# function_set mixes functions named by string with the custom wrappers
# created above.
gp = SymbolicRegressor(population_size=1000,
                       generations=20,
                       stopping_criteria=0.01,
                       p_crossover=0.7,
                       p_subtree_mutation=0.1,
                       p_hoist_mutation=0.05,
                       p_point_mutation=0.1,
                       max_samples=0.9,
                       verbose=1,
                       parsimony_coefficient=0.01,
                       random_state=42,
                       const_range=(-1, 1),
                       function_set=['add', 'sub', 'mul', 'div', 'sin', 'cos', 'tan',
                                     safe_log, safe_exp, safe_pow, safe_sqrt, safe_abs])

# Evolve the symbolic expression on the training split
gp.fit(x_train, y_train)

# Read the evolved program from the regressor's `_program` attribute
symbolic_eq = gp._program

print("\nSymbolic Equation:")
print(symbolic_eq)

# Predicted 'Aircraft Damage' for both splits
y_train_pred = gp.predict(x_train)
y_test_pred = gp.predict(x_test)

# Column names of the inputs, used as x-axis labels
column_names = features.columns

# One figure per input column: predicted damage against that column's values,
# training rows in blue and test rows in red, both semi-transparent
for column_index, column in enumerate(column_names):
    plt.figure(figsize=(10, 6))
    for inputs, predictions, colour, series_label in (
        (x_train, y_train_pred, 'blue', 'Training Data'),
        (x_test, y_test_pred, 'red', 'Testing Data'),
    ):
        plt.scatter(inputs[:, column_index], predictions, color=colour, label=series_label, alpha=0.5)

    plt.xlabel(column)
    plt.ylabel('Predicted Aircraft Damage')
    plt.title(f'Predicted Aircraft Damage vs {column}')
    plt.legend()
    plt.show()

ML-PLOT-SHAP

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the full strike-record CSV (latin1-encoded)
file_path = "/kaggle/input/air-damage/DAMAGE.csv"
data = pd.read_csv(file_path, encoding='latin1')

# Cast every object-typed column to str (missing values become 'nan')
for text_column in data.select_dtypes(include=['object']):
    data[text_column] = data[text_column].astype(str)

# Columns that get replaced by integer codes, target included
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase',
    'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries',
    'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage',
    'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage',
]

# A single encoder instance, refitted on each column in turn
encoder = LabelEncoder()
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# "Aircraft Damage" is the value to predict; everything else is an input
target = data["Aircraft Damage"]
features = data.drop("Aircraft Damage", axis=1)

# Preview the encoded frame
print(data.head())

# Mean-impute the inputs, then reuse the same imputer (refitted) on the
# target, which has to be shaped as a single column first
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)
target_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_column).flatten()

# Seeded 90/10 split so the partition is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1, random_state=42
)

# Degree-2 polynomial expansion with no bias column. The transformer is
# fitted on the training split only and then reused, unchanged, on the test
# split so both matrices share exactly the same column layout.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Ordinary least-squares fit on the expanded training features
regressor = LinearRegression()
regressor.fit(x_train_poly, y_train)

# One coefficient per expanded column, plus the constant term
coefficients = regressor.coef_
intercept = regressor.intercept_

# Column names of the un-expanded inputs
original_feature_names = features.columns

# Build a readable name for every expanded column: the plain names first,
# then one "name^power*name^power" entry for each row of poly.powers_ whose
# total degree is above 1.
feature_names = list(original_feature_names)
for feature_idx in poly.powers_:
    if np.sum(feature_idx) > 1:
        feature_name = "*".join(
            [
                f"{name}^{power}"
                for name, power in zip(original_feature_names, feature_idx)
                if power > 0
            ]
        )
        feature_names.append(feature_name)

# Write the fitted model as text: the intercept followed by one
# "coefficient * feature" term for EVERY coefficient, the first included.
equation_terms = [f"{intercept:.9f}"]
equation_terms.extend(
    f"{coefficient:.9f} * {name}"
    for coefficient, name in zip(coefficients, feature_names)
)
equation = "Aircraft Damage = " + " + ".join(equation_terms)
print('Coefficients:', coefficients)
print('Intercept:', intercept)
print("Equation:", equation)

# Model output on the held-out rows
y_pred = regressor.predict(x_test_poly)

# Threshold both the predictions and the true values at 0.5
y_pred_binary = (y_pred > 0.5).astype(int)
y_test_binary = (y_test > 0.5).astype(int)

# 2x2 confusion matrix with the label order fixed to [0, 1]
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])
print("Confusion Matrix:")
print(conf_matrix)

# Share of held-out rows whose thresholded prediction matches
accuracy = accuracy_score(y_test_binary, y_pred_binary)
print("Accuracy:", accuracy)

# Divide each row by its own total so every true-label row sums to 1
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_norm = conf_matrix.astype('float') / row_totals
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_norm, annot=True, cmap='Blues', fmt=".2f")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Normalized Confusion Matrix')
plt.show()

# Correlation heatmap restricted to the first 10 input columns
selected_features = features.columns[:10]
correlations = features[selected_features].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix Heatmap of Selected Features')
plt.show()

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# No file is read here: this section keeps working on the existing `data` frame

# Cast every object-typed column to str (missing values become 'nan')
for text_column in data.select_dtypes(include=['object']):
    data[text_column] = data[text_column].astype(str)

# Columns that get replaced by integer codes, target included
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase',
    'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries',
    'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage',
    'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage',
]

# A single encoder instance, refitted on each column in turn
encoder = LabelEncoder()
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# "Aircraft Damage" is the value to predict; everything else is an input
target = data["Aircraft Damage"]
features = data.drop("Aircraft Damage", axis=1)

# Preview the encoded frame
print(data.head())

# Mean-impute the inputs, then reuse the same imputer (refitted) on the
# target, which has to be shaped as a single column first
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)
target_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_column).flatten()

# Seeded 90/10 split so the partition is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1, random_state=42
)

# Degree-2 polynomial expansion with no bias column. The transformer is
# fitted on the training split only and then reused, unchanged, on the test
# split so both matrices share exactly the same column layout.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Ordinary least-squares fit on the expanded training features
regressor = LinearRegression()
regressor.fit(x_train_poly, y_train)

# One coefficient per expanded column, plus the constant term
coefficients = regressor.coef_
intercept = regressor.intercept_

# Column names of the un-expanded inputs
original_feature_names = features.columns

# Build a readable name for every expanded column: the plain names first,
# then one "name^power*name^power" entry for each row of poly.powers_ whose
# total degree is above 1.
feature_names = list(original_feature_names)
for feature_idx in poly.powers_:
    if np.sum(feature_idx) > 1:
        feature_name = "*".join(
            [
                f"{name}^{power}"
                for name, power in zip(original_feature_names, feature_idx)
                if power > 0
            ]
        )
        feature_names.append(feature_name)

# Write the fitted model as text: the intercept followed by one
# "coefficient * feature" term for EVERY coefficient, the first included.
equation_terms = [f"{intercept:.9f}"]
equation_terms.extend(
    f"{coefficient:.9f} * {name}"
    for coefficient, name in zip(coefficients, feature_names)
)
equation = "Aircraft Damage = " + " + ".join(equation_terms)
print('Coefficients:', coefficients)
print('Intercept:', intercept)
print("Equation:", equation)

# Model output on the held-out rows
y_pred = regressor.predict(x_test_poly)

# Threshold both the predictions and the true values at 0.5
y_pred_binary = (y_pred > 0.5).astype(int)
y_test_binary = (y_test > 0.5).astype(int)

# 2x2 confusion matrix with the label order fixed to [0, 1]
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])
print("Confusion Matrix:")
print(conf_matrix)

# Share of held-out rows whose thresholded prediction matches
accuracy = accuracy_score(y_test_binary, y_pred_binary)
print("Accuracy:", accuracy)

# Divide each row by its own total so every true-label row sums to 1
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_norm = conf_matrix.astype('float') / row_totals
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_norm, annot=True, cmap='Blues', fmt=".2f")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Normalized Confusion Matrix')

# Correlation heatmap over all input columns, drawn on a second figure
correlations = features.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix Heatmap of All Features')

# Repeat the fitted equation before the figures are displayed
print("\nEquation for Aircraft Damage Prediction:")
print(equation)

# Show both figures together
plt.show()

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# No file is read here: this section keeps working on the existing `data` frame

# Cast every object-typed column to str (missing values become 'nan')
for text_column in data.select_dtypes(include=['object']):
    data[text_column] = data[text_column].astype(str)

# Columns that get replaced by integer codes, target included
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase',
    'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries',
    'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage',
    'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage',
]

# A single encoder instance, refitted on each column in turn
encoder = LabelEncoder()
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Mean-impute the inputs (every column except the target)
imputer = SimpleImputer(strategy='mean')
feature_frame = data.drop("Aircraft Damage", axis=1)
features_imputed = imputer.fit_transform(feature_frame)

# Rebuild a labelled frame for plotting. The labels come from the same frame
# that was imputed, so each column keeps its own name; slicing data.columns
# would not remove the target because it is not the last column.
df_plot = pd.DataFrame(features_imputed, columns=feature_frame.columns, index=data.index)

# Add the target variable back as its own column
df_plot['Aircraft Damage'] = data['Aircraft Damage']

# Select a subset of features for the pair plot (adjust as needed)
selected_features = df_plot.columns[:10]  # Example: first 10 columns

# sns.pairplot creates and sizes its own figure, so no figure is opened
# beforehand; the title and layout calls below act on the pair-plot figure.
sns.pairplot(df_plot[selected_features], kind='scatter')
plt.suptitle('Pair Plot of Selected Features', y=1.02, fontsize=18)
plt.tight_layout()
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, export_text



# Cast every object-typed column to str (missing values become 'nan')
for text_column in data.select_dtypes(include=['object']):
    data[text_column] = data[text_column].astype(str)

# Columns that get replaced by ordinal codes, target included
columns_to_encode = [
    'Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
    'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
    'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
    'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
    'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase',
    'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance',
    'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries',
    'Aircraft Damage',
    'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage',
    'Nose Strike', 'Nose Damage',
    'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
    'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
    'Engine Ingested', 'Propeller Strike', 'Propeller Damage',
    'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage',
    'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage',
    'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage',
]

# Encode all listed columns in one call
encoder = OrdinalEncoder()
encoded_block = encoder.fit_transform(data[columns_to_encode])
data[columns_to_encode] = encoded_block
print("Categorical columns encoded.")

# "Aircraft Damage" is the value to predict; everything else is an input
target = data["Aircraft Damage"]
features = data.drop("Aircraft Damage", axis=1)

# Fill gaps in the inputs with each column's most frequent value
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features)
print("Missing values in features handled.")

# Reuse the same imputer (refitted) on the target, shaped as a single column
target_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_column).flatten()
print("Missing values in target handled.")

# Seeded 90/10 split so the partition is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1, random_state=42
)
print("Data split into training and testing sets.")

# Decision tree with a fixed seed so repeated runs build the same tree
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit the model on the training data
decision_tree.fit(x_train, y_train)
print("Model training completed.")

# Make predictions on the test data
y_pred = decision_tree.predict(x_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy)

# Both tree exporters get the same plain list of column names; a pandas
# Index is not accepted for feature_names by every scikit-learn release.
tree_feature_names = features.columns.tolist()

# Print the decision tree in text format for better readability
tree_rules = export_text(decision_tree, feature_names=tree_feature_names)
print(tree_rules)

# Optionally, visualize the tree; the large canvas keeps node text readable
plt.figure(figsize=(40,20))
plot_tree(decision_tree, filled=True, feature_names=tree_feature_names, class_names=True, fontsize=10, proportion=True, rounded=True)
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, export_text



# Cast every object-dtype column to str. Note this yields strings, not a
# pandas categorical dtype.
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator', 'Aircraft',
                     'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model',
                     'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position',
                     'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility',
                     'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity',
                     'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage',
                     'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage', 'Engine1 Strike', 'Engine1 Damage',
                     'Engine2 Strike', 'Engine2 Damage', 'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
                     'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike', 'Wing or Rotor Damage',
                     'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike',
                     'Tail Damage', 'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage']

# Turn each listed column into ordinal codes, writing the result back in place.
encoder = OrdinalEncoder()
encoded_block = encoder.fit_transform(data[columns_to_encode])
data[columns_to_encode] = encoded_block
print("Categorical columns encoded.")

# Label column vs. predictor columns.
target = data["Aircraft Damage"]
features = data.drop(columns=["Aircraft Damage"])

# Most-frequent-value imputation for the predictors.
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit(features).transform(features)
print("Missing values in features handled.")

# The imputer is refit on the label as a single-column array, then the
# output is flattened to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()
print("Missing values in target handled.")

# 90/10 train/test partition with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed,
    target_imputed,
    random_state=42,
    test_size=0.1,
)
print("Data split into training and testing sets.")

# max_depth=5 caps the tree so the printed rules and the plot stay readable.
decision_tree = DecisionTreeClassifier(max_depth=5, random_state=42)
decision_tree.fit(x_train, y_train)
print("Model training completed.")

# Accuracy on the held-out rows.
y_pred = decision_tree.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Decision Tree Accuracy:", accuracy)

# Rules as indented text.
tree_rules = export_text(decision_tree, feature_names=list(features.columns))
print(tree_rules)

# Rules as a diagram.
plt.figure(figsize=(20, 10))
plot_tree(
    decision_tree,
    feature_names=features.columns,
    class_names=True,
    filled=True,
    rounded=True,
    proportion=True,
    fontsize=10,
)
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer


# Pass 1: cast each object-dtype column to str (strings, not a categorical dtype).
for name in data.select_dtypes(include=['object']).columns:
    data[name] = data[name].astype(str)

# Pass 2: replace each column that is still object-dtype with integer labels.
# A single LabelEncoder is reused and refit on every column.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])
columns_to_encode = ['Record ID','Incident Year','Incident Month','Incident Day','Operator ID','Operator','Aircraft','Aircraft Type','Aircraft Make','Aircraft Model','Aircraft Mass','Engine Make','Engine Model','Engines','Engine Type','Engine1 Position','Engine2 Position','Engine3 Position','Engine4 Position','Airport ID','Airport','State','FAA Region','Warning Issued','Flight Phase','Visibility','Precipitation','Height','Speed','Distance','Species ID','Species Name','Species Quantity','Flight Impact','Fatalities','Injuries','Aircraft Damage','Radome Strike','Radome Damage','Windshield Strike','Windshield Damage','Nose Strike','Nose Damage','Engine1 Strike','Engine1 Damage','Engine2 Strike','Engine2 Damage','Engine3 Strike','Engine3 Damage','Engine4 Strike','Engine4 Damage','Engine Ingested','Propeller Strike','Propeller Damage','Wing or Rotor Strike','Wing or Rotor Damage','Fuselage Strike','Fuselage Damage','Landing Gear Strike','Landing Gear Damage','Tail Strike','Tail Damage','Lights Strike','Lights Damage','Other Strike','Other Damage']

# Label column and predictor frame.
target = data["Aircraft Damage"]
features = data.drop(columns=["Aircraft Damage"])


# Ordinal-encode the whole predictor frame; the result is a NumPy array.
encoder = OrdinalEncoder()
features_encoded = encoder.fit(features).transform(features)
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Read the CSV file

# Stringify all object-dtype columns (the result is str values, not a
# categorical dtype).
for column in list(data.select_dtypes(include=['object'])):
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID','Incident Year','Incident Month','Incident Day','Operator ID','Operator','Aircraft','Aircraft Type','Aircraft Make','Aircraft Model','Aircraft Mass','Engine Make','Engine Model','Engines','Engine Type','Engine1 Position','Engine2 Position','Engine3 Position','Engine4 Position','Airport ID','Airport','State','FAA Region','Warning Issued','Flight Phase','Visibility','Precipitation','Height','Speed','Distance','Species ID','Species Name','Species Quantity','Flight Impact','Fatalities','Injuries','Aircraft Damage','Radome Strike','Radome Damage','Windshield Strike','Windshield Damage','Nose Strike','Nose Damage','Engine1 Strike','Engine1 Damage','Engine2 Strike','Engine2 Damage','Engine3 Strike','Engine3 Damage','Engine4 Strike','Engine4 Damage','Engine Ingested','Propeller Strike','Propeller Damage','Wing or Rotor Strike','Wing or Rotor Damage','Fuselage Strike','Fuselage Damage','Landing Gear Strike','Landing Gear Damage','Tail Strike','Tail Damage','Lights Strike','Lights Damage','Other Strike','Other Damage']

# Ordinal-encode the listed columns in place.
encoder = OrdinalEncoder()
data[columns_to_encode] = encoder.fit_transform(data[columns_to_encode])

# Define features and target variable
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Display the transformed dataframe
print(data.head())

# Impute the `features` frame built just above. The previous version imputed
# `features_encoded`, an array left behind by an earlier cell, so the model
# could be trained on stale data that did not match this cell's `data`.
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features)

# The imputer is refit on the label (as a single column) and flattened to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Seeded split, matching the other cells in this notebook, so that the
# reported accuracy and coefficients are reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1, random_state=42
)

# Create an instance of the LogisticRegression model
logistic_reg = LogisticRegression()

# Fit the model on the training data
logistic_reg.fit(x_train, y_train)

# Mean accuracy on the held-out rows.
accuracy = logistic_reg.score(x_test, y_test)
print("Accuracy:", accuracy)

# Only the first row of coef_ / first intercept is reported.
# NOTE(review): that is the complete model only when the label is binary.
coefficients = logistic_reg.coef_[0].tolist()
intercept = logistic_reg.intercept_[0]

# Print the coefficients and intercept
print("Coefficients:", coefficients)
print("Intercept:", intercept)
import numpy as np

# Define the logistic equation function
def logistic_equation(features, coefficients, intercept):
    """Return the logistic (sigmoid) of ``features . coefficients + intercept``.

    Args:
        features: Feature vector, or 2-D array with one row per sample.
        coefficients: Weight vector aligned with the last axis of ``features``.
        intercept: Scalar bias added to the linear combination.

    Returns:
        A NumPy scalar for a single sample, or an array with one probability
        per row, each in the range [0, 1].
    """
    z = np.asarray(np.dot(features, coefficients) + intercept, dtype=float)
    # exp() is only ever evaluated at -|z|, so it cannot overflow; the naive
    # 1 / (1 + exp(-z)) overflows for large negative z.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where yields a 0-d array for scalar input; unwrap it to a scalar.
    return result[()] if result.ndim == 0 else result

# Train a fresh logistic-regression model on the current training split.
logistic_reg = LogisticRegression().fit(x_train, y_train)

# First intercept and first coefficient row of the fitted model.
intercept = logistic_reg.intercept_[0]
coefficients = logistic_reg.coef_[0]

# Predictor names, in the column order of `data` minus the label column.
feature_names = data.drop(columns=["Aircraft Damage"]).columns

# Assemble the printable formula: intercept first, then one
# " + coef*name" term per predictor, then the closing brackets.
logistic_eq_str = (
    f"P(Aircraft Damage=1) = 1 / (1 + e^(-({intercept}"
    + "".join(
        f" + {coef:.9f}*{feature}"
        for feature, coef in zip(feature_names, coefficients)
    )
    + "))"
)

print("Logistic Equation:", logistic_eq_str, sep="\n")
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file

# Convert each object-dtype column to str values (no categorical dtype is
# created here, despite what the old comment said).
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model', 'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage', 'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage', 'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage', 'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage']

# One LabelEncoder instance is reused and refit for every listed column.
encoder = LabelEncoder()

# Replace each listed column with integer class labels.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Display the transformed dataframe
print(data.head())

# Select a specific feature for analysis (change 'Aircraft Type' to your desired feature)
selected_feature = 'Aircraft Type'

# Create a temporary dataframe with the selected feature and target
temp_df = pd.DataFrame(features[selected_feature])
temp_df['Aircraft Damage'] = target

# Mean-impute the selected feature; the result is a 2-D single-column array.
imputer = SimpleImputer(strategy='mean')
selected_feature_imputed = imputer.fit_transform(temp_df[[selected_feature]])

# Split the data into training and testing sets for the selected feature
x_feat_train, x_feat_test, y_feat_train, y_feat_test = train_test_split(selected_feature_imputed, target, test_size=0.1, random_state=42)

# Apply polynomial features transformation. The transformer is fitted on the
# training split only; the test split is merely transformed with it (it was
# previously refitted on the test data).
poly = PolynomialFeatures(degree=2, include_bias=False)
x_feat_train_poly = poly.fit_transform(x_feat_train)
x_feat_test_poly = poly.transform(x_feat_test)

# Create an instance of the LinearRegression model
regressor = LinearRegression()

# Fit the model on the polynomial features for the selected feature
regressor.fit(x_feat_train_poly, y_feat_train)

# Predict on test data for the selected feature
y_feat_pred = regressor.predict(x_feat_test_poly)

# Threshold the continuous predictions and the labels at 0.5 to get 0/1 classes.
y_feat_pred_binary = (y_feat_pred > 0.5).astype(int)
y_feat_test_binary = (y_feat_test > 0.5).astype(int)

# Compute confusion matrix for the selected feature
conf_matrix_feat = confusion_matrix(y_feat_test_binary, y_feat_pred_binary, labels=[0, 1])

# Display confusion matrix for the selected feature
conf_matrix_df = pd.DataFrame(conf_matrix_feat, columns=['Predicted 0', 'Predicted 1'], index=['Actual 0', 'Actual 1'])
print(f"\nConfusion Matrix for Feature '{selected_feature}':")
print(conf_matrix_df)

# Calculate accuracy for the selected feature
accuracy = accuracy_score(y_feat_test_binary, y_feat_pred_binary)
print("\nAccuracy:", accuracy)

# Row-normalise the confusion matrix. A true class with no test samples has a
# zero row total; such rows are left as zeros instead of dividing by zero
# (which produced NaN cells and a runtime warning).
row_totals = conf_matrix_feat.sum(axis=1, keepdims=True)
conf_matrix_norm = np.divide(
    conf_matrix_feat,
    row_totals,
    out=np.zeros(conf_matrix_feat.shape, dtype=float),
    where=row_totals != 0,
)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_norm, annot=True, cmap='Blues', fmt=".2f")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title(f'Normalized Confusion Matrix for Feature \'{selected_feature}\'')
plt.show()

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file

# Every object-dtype column is rewritten as str values (this is a string
# cast, not a conversion to a categorical dtype).
for column in list(data.select_dtypes(include=['object'])):
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model', 'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage', 'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage', 'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage', 'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage']

# One LabelEncoder instance is reused and refit for every listed column.
encoder = LabelEncoder()

# Replace each listed column with integer class labels.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Display the transformed dataframe
print(data.head())

# Handle missing values in the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# The imputer is refit on the label (as a single column) and flattened to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1, random_state=42)

# Apply polynomial features transformation. The transformer is fitted on the
# training split only; the test split is merely transformed with it.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Create an instance of the LinearRegression model
regressor = LinearRegression()

# Fit the model on the polynomial features
regressor.fit(x_train_poly, y_train)

# Predict on test data
y_pred = regressor.predict(x_test_poly)

# Threshold the continuous predictions and the labels at 0.5 to get 0/1 classes.
y_pred_binary = (y_pred > 0.5).astype(int)
y_test_binary = (y_test > 0.5).astype(int)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])  # Provide labels for clarity
conf_matrix_df = pd.DataFrame(conf_matrix, columns=['Predicted 0', 'Predicted 1'], index=['Actual 0', 'Actual 1'])
print("Confusion Matrix:")
print(conf_matrix_df)

# Calculate accuracy
accuracy = accuracy_score(y_test_binary, y_pred_binary)
print("\nAccuracy:", accuracy)

# Print detailed analysis for each feature
print("\nDetailed Analysis of Confusion Matrix by Feature Impact:")

# Loop through each feature
for feature in features.columns:
    # Create a temporary dataframe with only the feature and target
    temp_df = pd.DataFrame(features[feature])
    temp_df['Aircraft Damage'] = target

    # Train-test split for this feature
    x_feat_train, x_feat_test, y_feat_train, y_feat_test = train_test_split(temp_df[[feature]].values, target, test_size=0.1, random_state=42)

    # Use a dedicated transformer and model for each single-feature fit.
    # Refitting the shared `poly` / `regressor` here would overwrite the
    # all-features model trained above, leaving those globals fitted on
    # whichever column happened to come last.
    feature_poly = PolynomialFeatures(degree=2, include_bias=False)
    x_feat_train_poly = feature_poly.fit_transform(x_feat_train)
    x_feat_test_poly = feature_poly.transform(x_feat_test)

    # Fit the per-feature model on the polynomial features
    feature_regressor = LinearRegression()
    feature_regressor.fit(x_feat_train_poly, y_feat_train)

    # Predict on test data
    y_feat_pred = feature_regressor.predict(x_feat_test_poly)

    # Binarize the predicted and actual values
    y_feat_pred_binary = (y_feat_pred > 0.5).astype(int)
    y_feat_test_binary = (y_feat_test > 0.5).astype(int)

    # Compute confusion matrix for this feature
    conf_matrix_feat = confusion_matrix(y_feat_test_binary, y_feat_pred_binary, labels=[0, 1])

    # Print feature header
    print(f"\nFeature: {feature}")
    print("Confusion Matrix:")
    print(pd.DataFrame(conf_matrix_feat, columns=['Predicted 0', 'Predicted 1'], index=['Actual 0', 'Actual 1']))

    # Print interpretation of confusion matrix for this feature
    print("Interpretation of Confusion Matrix:")
    print("True Negative (TN): Predicted 0 (No Damage) and Actual 0 (No Damage)")
    print("False Positive (FP): Predicted 1 (Damage) but Actual 0 (No Damage)")
    print("False Negative (FN): Predicted 0 (No Damage) but Actual 1 (Damage)")
    print("True Positive (TP): Predicted 1 (Damage) and Actual 1 (Damage)")

plt.show()

ML-PATTERN

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
import numpy as np

# Load the pupil dataset; the file is decoded as latin1.
file_path = "/kaggle/input/pupil-csv/pupil-mat.csv"
data = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')

# Pass 1: cast each object-dtype column to str (strings, not a categorical dtype).
for name in data.select_dtypes(include=['object']).columns:
    data[name] = data[name].astype(str)

# Pass 2: swap each column that is still object-dtype for integer labels.
# The single LabelEncoder is refit on every column.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define columns to encode
columns_to_encode = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

# One LabelEncoder instance is reused and refit for every listed column.
encoder = LabelEncoder()

# Replace each listed column with integer class labels.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("G1", axis=1)
target = data["G1"]

# Display the transformed dataframe
print(data.head())

# Handle missing values in the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# The imputer is refit on the target (as a single column) and flattened to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1)

# Degree-2 expansion, fitted on the training split and applied to the test split.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Create an instance of the Lasso model
lasso_regressor = Lasso(alpha=0.1)  # You can adjust the alpha parameter for tuning the sparsity

# Fit the model on the polynomial features
lasso_regressor.fit(x_train_poly, y_train)

# Retrieve the coefficients and intercept
coefficients = lasso_regressor.coef_
intercept = lasso_regressor.intercept_

# Retrieve the original feature names
original_feature_names = features.columns

# Name every expanded column: the original names come first, then one
# "a^p*b^q" label for each row of poly.powers_ whose total degree exceeds 1.
feature_names = list(original_feature_names)
for feature_idx in poly.powers_:
    if np.sum(feature_idx) > 1:
        feature_name = "*".join(
            [
                f"{name}^{power}"
                for name, power in zip(original_feature_names, feature_idx)
                if power > 0
            ]
        )
        feature_names.append(feature_name)

# Build the equation: the intercept followed by one term per coefficient.
# The previous loop printed the intercept in place of coefficient 0, so the
# first feature's term was silently missing from the equation.
equation = f"G1 = {intercept:.2f}" + "".join(
    f" + {coefficient:.2f} * {name}"
    for coefficient, name in zip(coefficients, feature_names)
)
print('Coefficients:', coefficients)
print('Intercept:', intercept)
print("Equation:", equation)

import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Read the CSV file and select desired columns

# Integer-encode every object-dtype column; the encoder is refit per column.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("G1", axis=1)
target = data["G1"]

# Ordinal-encode the whole feature frame; the result is a NumPy array.
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Handle missing values in the encoded features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features_encoded)

# The imputer is refit on the target (as a single column) and flattened to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1)

# Create an instance of the Lasso regression model
lasso = Lasso(alpha=0.1)  # Adjust regularization strength with alpha

# Fit the model on the training data
lasso.fit(x_train, y_train)

# For a regressor, score() is the coefficient of determination (R^2), not a
# classification accuracy, so the printed label now says so. The variable
# name `acc` is kept for any later cell that reads it.
acc = lasso.score(x_test, y_test)
print("R^2 score:", acc)








import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Read the CSV file and select desired columns

# Keep only the modelling columns. The explicit copy makes `data` an
# independent frame, so the column assignments below do not write into a
# slice of the previous frame (pandas SettingWithCopyWarning).
data = data[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']].copy()

# Cast object-dtype columns to str so each column holds a single value type.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Integer-encode every object-dtype column (the encoder is refit per column).
# The earlier version repeated this loop and then re-encoded seventeen
# columns by name. After this pass those columns already hold codes
# 0..k-1, which LabelEncoder maps back to themselves, so the repeats were
# removed as dead code.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("G1", axis=1)
target = data["G1"]

# Ordinal-encode the whole feature frame; the result is a NumPy array.
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Handle missing values in the encoded features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features_encoded)

# The imputer is refit on the target (as a single column) and flattened to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1)

# Create an instance of the Lasso Regression model
model = Lasso(alpha=0.1)  # Adjust regularization strength with alpha

# Fit the model on the training data
model.fit(x_train, y_train)

# For a regressor, score() is the coefficient of determination (R^2), not a
# classification accuracy. The variable names are kept for later cells; only
# the printed labels are corrected.
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)

print("Train R^2:", train_accuracy)
print("Test R^2:", test_accuracy)

import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Read the CSV file and select desired columns

# Integer-encode every object-dtype column; the encoder is refit per column.
label_encoder = LabelEncoder()
for column in data.columns[data.dtypes == object]:
    data[column] = label_encoder.fit_transform(data[column])

# Define features and target variable. This step was an empty placeholder,
# so the cell silently used whatever `features` / `target` an earlier cell
# had left behind.
features = data.drop("G1", axis=1)
target = data["G1"]

# Handle missing values in the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Degree-2 expansion, fitted on the training split and applied to the test split.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Create an instance of the Lasso model
lasso_regressor = Lasso(alpha=0.1)  # You can adjust the alpha value for desired sparsity

# Fit the model on the polynomial features
lasso_regressor.fit(x_train_poly, y_train)

# Retrieve the coefficients and intercept
coefficients = lasso_regressor.coef_
intercept = lasso_regressor.intercept_

# Retrieve the original feature names
original_feature_names = features.columns

# Name every expanded column: the original names come first, then one
# "a^p*b^q" label for each row of poly.powers_ whose total degree exceeds 1.
feature_names = list(original_feature_names)
for feature_idx in poly.powers_:
    if np.sum(feature_idx) > 1:
        feature_name = "*".join(
            [
                f"{name}^{power}"
                for name, power in zip(original_feature_names, feature_idx)
                if power > 0
            ]
        )
        feature_names.append(feature_name)

# Build the equation: the intercept followed by one term per coefficient.
# The previous loop printed the intercept in place of coefficient 0, so the
# first feature's term was silently missing from the equation.
equation = f"G1 = {intercept:.2f}" + "".join(
    f" + {coefficient:.2f} * {name}"
    for coefficient, name in zip(coefficients, feature_names)
)

print("Equation:", equation)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# Read the CSV file and select desired columns

# Keep only the modelling columns. The explicit copy makes `data` an
# independent frame, so the column assignments below do not write into a
# slice of the previous frame (pandas SettingWithCopyWarning).
data = data[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']].copy()
# Cast object-dtype columns to str so each column holds a single value type.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Integer-encode every object-dtype column (the encoder is refit per column).
# The earlier version then re-encoded seventeen columns by name. After this
# pass those columns already hold codes 0..k-1, which LabelEncoder maps back
# to themselves, so the repeats were removed as dead code.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])
# Handle missing values in the data
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data)

# Separate features and target variable. Every message and axis label in this
# cell refers to G1, but the old slicing took the LAST column (G3) as the
# target and left G1 among the predictors. Select G1 by name instead.
# NOTE(review): assumes the imputer kept all columns (no all-NaN column), so
# positions in `data_imputed` still match `data.columns`.
g1_position = data.columns.get_loc("G1")
X = np.delete(data_imputed, g1_position, axis=1)
y = data_imputed[:, g1_position]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Degree-2 expansion, fitted on the training split and applied to the test split.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Create an instance of the ElasticNet regression model
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)  # Adjust regularization strength with alpha and mix ratio with l1_ratio

# Fit the model on the polynomial features
elastic_net.fit(x_train_poly, y_train)

# Predict 'G1' values for training and testing sets
y_train_pred = elastic_net.predict(x_train_poly)
y_test_pred = elastic_net.predict(x_test_poly)

# Print the predicted 'G1' values
print("Predicted 'G1' values for training set:", y_train_pred)
print("Predicted 'G1' values for test set:", y_test_pred)

# Plot the actual G1 values and the predicted G1 values
import matplotlib.pyplot as plt

plt.scatter(y_test, y_test_pred)
# Dashed diagonal = perfect prediction (predicted == actual).
plt.plot([np.min(y_test), np.max(y_test)], [np.min(y_test), np.max(y_test)], color='red', linestyle='--')
plt.xlabel("Actual G1")
plt.ylabel("Predicted G1")
# The model fitted above is ElasticNet, not Lasso; the title now matches.
plt.title("ElasticNet Regression: Actual vs Predicted G1")
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Lasso regression on 'G1', followed by one scatter plot of 'G1' per feature.
# `data` must already be loaded; nothing is read from disk in this cell.

# Label-encode every column of `data` into integer codes
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# 'G1' is the target, all remaining columns are features
target_column = "G1"
X = data_encoded.drop(columns=[target_column])
y = data_encoded[target_column]

# Mean-impute the features
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.1)

# Lasso model; alpha controls the regularization strength
lasso = Lasso(alpha=0.1)
lasso.fit(x_train, y_train)

# Feature names, in the same order as the fitted coefficients
feature_names = X.columns

print("Coefficients:", lasso.coef_)

# One scatter plot of the encoded 'G1' against each encoded feature column
for column in feature_names:
    plt.scatter(X[column], y)
    plt.xlabel(column)
    plt.ylabel('G1')
    plt.title(f'G1 vs {column}')
    plt.show()

import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Lasso regression predicting 'G1' from standardized features of the loaded `data`.

# Feature columns. 'G1' is deliberately NOT listed: it is the target, and keeping it
# among the inputs would let the model read the answer straight from its features.
feature_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
    'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
    'absences', 'G2',
]
X = data[feature_columns]
y = data["G1"]

# Mean-impute missing target values (the imputer needs a 2-D column, hence the reshape)
imputer = SimpleImputer(strategy='mean')
y_imputed = imputer.fit_transform(y.values.reshape(-1, 1)).flatten()

# Label-encode the non-numeric feature columns on a copy, leaving `data` untouched
non_numeric_cols = ['school','sex','address','famsize','Pstatus','Mjob','Fjob','reason','guardian','schoolsup','famsup','paid','activities','nursery','higher','internet','romantic']
encoder = LabelEncoder()
X_encoded = X.copy()
for col in non_numeric_cols:
    X_encoded[col] = encoder.fit_transform(X[col])

# Mean-impute the features (fit_transform refits the same imputer object on X)
X_imputed = imputer.fit_transform(X_encoded)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Lasso model; alpha controls the regularization strength
model = Lasso(alpha=0.1)

# Fit on the full scaled data set (there is no train/test split in this cell)
model.fit(X_scaled, y_imputed)

# Predictions for the same rows the model was fitted on
predictions = model.predict(X_scaled)
print("PREDICTIONS", predictions)
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Lasso regression on 'G1': prints the test score, the coefficients and the fitted equation.

# Select the modelling columns. `.copy()` makes the result an independent frame, so
# the in-place column assignments below do not trigger SettingWithCopyWarning.
data = data[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']].copy()

# Label-encode every object-dtype column in place
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("G1", axis=1)
target = data["G1"]

# Ordinal-encode all feature columns
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Mean-impute the encoded features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features_encoded)

# Mean-impute the target (fit_transform refits the same imputer object on the target)
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1)

# Lasso model; alpha controls the regularization strength
lasso = Lasso(alpha=0.1)
lasso.fit(x_train, y_train)

# score() on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, even though the label below says "Accuracy".
acc = lasso.score(x_test, y_test)
print("Accuracy:", acc)

# Print the coefficients and intercept
print("Coefficients:", lasso.coef_)
print("Intercept:", lasso.intercept_)

# Build "G1 = (c1) * f1 + (c2) * f2 + ... + (intercept)" with a single join
features_names = features.columns
equation_terms = [
    f"({coefficient:.2f}) * {feature}"
    for feature, coefficient in zip(features_names, lasso.coef_)
]
equation_terms.append(f"({lasso.intercept_:.2f})")
linear_equation = "G1 = " + " + ".join(equation_terms)
print("Linear Equation:", linear_equation)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Logistic regression treating 'G1' as a class label; works on the already loaded `data`.

# Cast object-dtype columns to plain strings
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Ordinal-encode the whole frame; this result feeds the feature matrix below
encoder = OrdinalEncoder()
data_encoded = encoder.fit_transform(data)

# Additionally label-encode the object-dtype columns of `data` in place
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Features come from the ordinal-encoded array, the target from `data` itself
encoded_frame = pd.DataFrame(data_encoded, columns=data.columns)
features = encoded_frame.drop(columns=["G1"])
target = data["G1"]

# Mean-impute the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Logistic regression with default settings
logistic = LogisticRegression()
logistic.fit(x_train, y_train)

# Predictions on the held-out rows
y_pred = logistic.predict(x_test)

# Mean accuracy on the held-out rows
accuracy = logistic.score(x_test, y_test)
print("Test Accuracy:", accuracy)



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# ElasticNet regression on the encoded 'sex' column: prints score, coefficients and equation.

# Select the modelling columns. `.copy()` makes the result an independent frame, so
# the in-place column assignments below do not trigger SettingWithCopyWarning.
data = data[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']].copy()

# Label-encode the categorical columns in place (one loop instead of 17 copy-pasted lines)
categorical_columns = [
    "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob", "reason",
    "guardian", "schoolsup", "famsup", "paid", "activities", "nursery", "higher",
    "internet", "romantic",
]
label_encoder = LabelEncoder()
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("sex", axis=1)
target = data["sex"]

# Ordinal-encode all feature columns
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Impute missing feature values with the most frequent value of each column
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features_encoded)

# Impute the target the same way (fit_transform refits the same imputer object)
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1)

# alpha sets the regularization strength, l1_ratio the L1/L2 mix
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(x_train, y_train)

# score() on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, even though the label below says "Accuracy".
acc = elastic_net.score(x_test, y_test)
print("Accuracy:", acc)

# Print the coefficients and intercept
print("Coefficients:", elastic_net.coef_)
print("Intercept:", elastic_net.intercept_)

# Build "sex =(c1) * f1 + (c2) * f2 + ... + (intercept)" with a single join
features_names = features.columns
equation_terms = [
    f"({coefficient:.2f}) * {feature}"
    for feature, coefficient in zip(features_names, elastic_net.coef_)
]
equation_terms.append(f"({elastic_net.intercept_:.2f})")
linear_equation = "sex =" + " + ".join(equation_terms)
print("Linear Equation:", linear_equation)
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# ElasticNet fitted on standardized, ordinal-encoded features to predict encoded 'sex'.
# `data` must already be loaded; nothing is read from disk in this cell.

# Every modelling column except the target 'sex'
feature_columns = [
    'school', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob',
    'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
    'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
    'absences', 'G1', 'G2', 'G3',
]
X = data[feature_columns]
y = data["sex"]

# Ordinal-encode all feature columns
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Fill gaps with the most frequent value of each column
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X_encoded)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Ordinal-encode the target; the encoder needs a 2-D column, the model a flat vector
encoder_y = OrdinalEncoder()
y_column = y.to_numpy().reshape(-1, 1)
y_encoded = encoder_y.fit_transform(y_column).ravel()

# alpha sets the regularization strength, l1_ratio the L1/L2 mix
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_scaled, y_encoded)

# Predictions for the same rows the model was fitted on
predictions = elastic_net.predict(X_scaled)
print("PREDICTIONS:", predictions)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet

# ElasticNet regression on the encoded 'sex' column: reports train and test scores.

# Select the modelling columns. `.copy()` makes the result an independent frame, so
# the in-place column assignments below do not trigger SettingWithCopyWarning.
data = data[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']].copy()

# Label-encode every object-dtype column in place
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define features and target variable
X = data.drop(columns=['sex'])
y = data['sex']

# Impute missing feature values with the most frequent value of each column
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.1)

# alpha sets the regularization strength, l1_ratio the L1/L2 mix
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(x_train, y_train)

# score() on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, even though the labels below say "Accuracy".
train_acc = elastic_net.score(x_train, y_train)
print("Training Accuracy:", train_acc)

test_acc = elastic_net.score(x_test, y_test)
print("Test Accuracy:", test_acc)
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# ElasticNet on degree-2 polynomial features predicting encoded 'sex'; prints the fitted equation.

# Select the modelling columns from the already loaded `data`
data = data[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']]

# Label-encode every column into integer codes
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# Define features and target variable
features = data_encoded.drop("sex", axis=1)
target = data_encoded["sex"]

# Mean-impute the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Expand to degree-2 polynomial features; the expansion is fitted on the training split only
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Create an instance of the ElasticNet model
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000)

# Fit the model on the polynomial features
elastic_net.fit(x_train_poly, y_train)

# Retrieve the coefficients and intercept
coefficients = elastic_net.coef_
intercept = elastic_net.intercept_

# Retrieve the original feature names
original_feature_names = features.columns

# Name every polynomial output feature. The linear terms keep their original names;
# each higher-degree row of poly.powers_ becomes e.g. "age^2" or "age^1*Medu^1".
feature_names = list(original_feature_names)
for feature_idx in poly.powers_:
    if np.sum(feature_idx) > 1:
        feature_name = "*".join(
            [
                f"{name}^{power}"
                for name, power in zip(original_feature_names, feature_idx)
                if power > 0
            ]
        )
        feature_names.append(feature_name)

# Build the equation: the intercept first, then one term per coefficient.
# Every coefficient gets a term, including the one at index 0 (it used to be
# replaced by the intercept and therefore dropped from the equation).
equation = f"sex = {intercept:.2f}"
equation += "".join(
    f" + {coefficient:.2f} * {name}"
    for name, coefficient in zip(feature_names, coefficients)
)

print("Equation:", equation)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet

# ElasticNet regression on encoded 'sex': prints predictions for both splits.
# `data` must already be loaded; nothing is read from disk in this cell.

# Keep the modelling columns, with the target 'sex' placed last
selected_columns = [
    'school', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob',
    'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
    'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
    'absences', 'G1', 'G2', 'G3', 'sex',
]
data = data[selected_columns]

# Label-encode every column into integer codes
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# Mean-impute; the result is a NumPy array
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data_encoded)

# The last column ('sex') is the target, everything before it is a feature
X = data_imputed[:, :-1]
y = data_imputed[:, -1]

# Reproducible 80/20 train/test split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# alpha sets the regularization strength, l1_ratio the L1/L2 mix
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(x_train, y_train)

# Predictions for both splits
y_train_pred = elastic_net.predict(x_train)
y_test_pred = elastic_net.predict(x_test)

print("Predicted 'sex' values for training set:", y_train_pred)
print("Predicted 'sex' values for test set:", y_test_pred)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
import matplotlib.pyplot as plt

# ElasticNet regression on encoded 'sex': scatter plot of actual vs predicted test values.
# `data` must already be loaded; nothing is read from disk in this cell.

# Keep the modelling columns, with the target 'sex' placed last
selected_columns = [
    'school', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob',
    'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
    'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
    'absences', 'G1', 'G2', 'G3', 'sex',
]
data = data[selected_columns]

# Label-encode every column into integer codes
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# Mean-impute; the result is a NumPy array
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data_encoded)

# The last column ('sex') is the target, everything before it is a feature
X = data_imputed[:, :-1]
y = data_imputed[:, -1]

# Reproducible 80/20 train/test split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# alpha sets the regularization strength, l1_ratio the L1/L2 mix
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(x_train, y_train)

# Predictions for both splits
y_train_pred = elastic_net.predict(x_train)
y_test_pred = elastic_net.predict(x_test)

# Actual vs predicted values; the dashed red diagonal marks perfect prediction
plt.scatter(y_test, y_test_pred)
lowest, highest = min(y_test), max(y_test)
plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')
plt.xlabel("Actual sex")
plt.ylabel("Predicted sex")
plt.title("Elastic Net Regression: Actual vs Predicted sex")
plt.show()

import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import matplotlib.pyplot as plt

# ElasticNet on degree-2 polynomial features; scatter of predicted 'sex' against each input column.
# `data` must already be loaded; nothing is read from disk in this cell.

selected_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
    'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
    'absences', 'G1', 'G2', 'G3',
]
data = data[selected_columns]

# Label-encode every column into integer codes
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# 'sex' is the target, all remaining columns are features
features = data_encoded.drop(columns=["sex"])
target = data_encoded["sex"]

# Mean-impute the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Degree-2 polynomial expansion, fitted on the training split only
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# alpha sets the regularization strength, l1_ratio the L1/L2 mix
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(x_train_poly, y_train)

# Predictions for the held-out rows
y_pred = elastic_net.predict(x_test_poly)

# One scatter plot per original feature. Pairing the feature names with the
# leading columns of the polynomial matrix plots column k against feature name k.
for col_name, column_values in zip(features.columns, x_test_poly.T):
    plt.scatter(column_values, y_pred, label="Predicted sex")
    plt.xlabel(col_name)
    plt.ylabel("Predicted sex")
    plt.title(f"Predicted sex vs {col_name}")
    plt.legend()
    plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Elastic-net logistic regression predicting 'sex', then bar charts of the predicted
# class counts for every value of every feature column.

# Select the modelling columns from the already loaded `data`
data = data[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']]

# Label-encode every column into integer codes
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# apply() refits `label_encoder` once per column, so afterwards it only remembers the
# classes of the last column. Fit a dedicated encoder on 'sex' so that the class codes
# 0/1 can be mapped back to the original 'sex' labels for the plot ticks.
sex_encoder = LabelEncoder()
sex_encoder.fit(data["sex"])
sex_tick_labels = sex_encoder.inverse_transform([0, 1])

# Define features and target variable
features = data_encoded.drop("sex", axis=1)
target = data_encoded["sex"]

# Mean-impute the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Logistic regression with elastic-net penalty (requires the 'saga' solver)
classifier = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)  # Adjust the l1_ratio as needed

# Fit the model
classifier.fit(x_train_scaled, y_train)

# Predict 'sex' for every row. The classifier was trained on standardized features,
# so the full feature matrix must go through the same scaler before predicting.
y_pred = classifier.predict(scaler.transform(features_imputed))

# Retrieve the column names
column_names = features.columns

# For each feature column: one subplot per distinct value, showing how many rows with
# that value were predicted as class 0 and as class 1.
for column in column_names:
    unique_values = data[column].unique()
    num_unique = len(unique_values)
    plt.figure(figsize=(12, 6))
    for i, value in enumerate(unique_values):
        # Select rows by the value in `data` (the same frame the unique values come
        # from); `data` and `y_pred` share the same row order.
        value_mask = (data[column] == value).to_numpy()
        plt.subplot(1, num_unique, i+1)
        plt.bar([0, 1], [np.sum(value_mask & (y_pred == 0)),
                         np.sum(value_mask & (y_pred == 1))], color=['blue', 'red'])
        plt.xlabel("sex")
        plt.ylabel("Count")
        plt.title(f"{column} = {value}")
        plt.xticks([0, 1], sex_tick_labels)  # original 'sex' labels for class codes 0 and 1
    plt.tight_layout()
    plt.show()


SPARSE-REG

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import numpy as np
# Load the pupil data set, encode it, and build degree-2 polynomial train/test matrices.
file_path = "/kaggle/input/pupil-csv/pupil-mat.csv"
data = pd.read_csv(file_path, encoding='latin1')

# Cast each object-dtype column to string and label-encode it in place
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Columns that get label-encoded a second time below.
# NOTE(review): this list also contains columns whose names suggest numeric data
# (e.g. 'age', 'absences', 'G1'); label-encoding replaces their values with
# integer codes - confirm this is intended.
columns_to_encode = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
    'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
    'absences', 'G1', 'G2', 'G3',
]

encoder = LabelEncoder()
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# 'G1' is the target, all remaining columns are features
features = data.drop(columns=["G1"])
target = data["G1"]

# Show the first rows of the transformed frame
print(data.head())

# Mean-impute the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Mean-impute the target (fit_transform refits the same imputer object on the target)
target_column = target.to_numpy().reshape(-1, 1)
target_imputed = imputer.fit_transform(target_column).flatten()

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1)

# Degree-2 polynomial expansion, fitted on the training split only
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Fit a linear regression on the polynomial features and print the fitted equation.
regressor = LinearRegression()

# Fit the model on the polynomial features
regressor.fit(x_train_poly, y_train)

# Retrieve the coefficients and intercept
coefficients = regressor.coef_
intercept = regressor.intercept_

# Retrieve the original feature names
original_feature_names = features.columns

# Name every polynomial output feature. The linear terms keep their original names;
# each higher-degree row of poly.powers_ becomes e.g. "age^2" or "age^1*Medu^1".
feature_names = list(original_feature_names)
for feature_idx in poly.powers_:
    if np.sum(feature_idx) > 1:
        feature_name = "*".join(
            [
                f"{name}^{power}"
                for name, power in zip(original_feature_names, feature_idx)
                if power > 0
            ]
        )
        feature_names.append(feature_name)

# Build the equation: the intercept first, then one term per coefficient.
# Every coefficient gets a term, including the one at index 0 (it used to be
# replaced by the intercept and therefore dropped from the equation).
equation = f"G1= {intercept:f}"
equation += "".join(
    f" + {coefficient:f} * {name}"
    for name, coefficient in zip(feature_names, coefficients)
)
print('coefficient',coefficients)
print('intercept',intercept)
print("Equation:", equation)
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Plain linear regression on ordinal-encoded features; prints the test score.
# Nothing is read from disk here. `features` and `target` are not assigned in this
# cell, so they must already exist from an earlier cell.

# Cast each object-dtype column of `data` to string and label-encode it in place
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Ordinal-encode the existing feature frame
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Fill gaps with the most frequent value of each column
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features_encoded)

# Impute the target the same way (fit_transform refits the same imputer object)
target_column = target.to_numpy().reshape(-1, 1)
target_imputed = imputer.fit_transform(target_column).flatten()

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1)

# Ordinary least-squares model
linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)

# Score on the held-out rows
acc = linear.score(x_test, y_test)
print("Accuracy:", acc)


import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Linear regression on 'G1': reports the score on the training and the test split.

# Select the modelling columns. `.copy()` makes the result an independent frame, so
# the in-place column assignments below do not trigger SettingWithCopyWarning.
data = data[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']].copy()

# Cast object-dtype columns to plain strings
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Label-encode every object-dtype column in place. The former second pass that
# re-encoded each categorical column by name was removed: re-encoding integer codes
# that are already 0..k-1 returns the same codes.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define features and target variable
features = data.drop("G1", axis=1)
target = data["G1"]

# Ordinal-encode all feature columns
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Mean-impute the encoded features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features_encoded)

# Mean-impute the target (fit_transform refits the same imputer object on the target)
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1)

# Create an instance of the Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(x_train, y_train)

# score() on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, even though the labels below say "Accuracy".
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# Keep only the modelling columns. The explicit .copy() makes `data` an
# independent frame, so the in-place column assignments below do not raise
# pandas' SettingWithCopyWarning.
data = data[[
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2', 'G3',
]].copy()

# Integer-encode every column into a separate frame used for modelling.
# This was previously computed twice in a row with identical results.
# NOTE(review): the apply covers G1 as well, so the target holds label codes.
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# Also encode the categorical columns of `data` itself, in the same order as
# before, so `data` is left in the same state for whatever uses it next.
label_encoder = LabelEncoder()
for categorical_column in (
    "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob",
    "reason", "guardian", "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic",
):
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# Define features and target variable
features = data_encoded.drop("G1", axis=1)
target = data_encoded["G1"]

# Handle missing values in the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Split the data into training and testing sets (10% held out, unseeded)
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Expand to degree-2 polynomial terms; the expansion is fitted on the
# training rows only and then applied unchanged to the test rows.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Fit a linear regression on the polynomial features
regressor = LinearRegression()
regressor.fit(x_train_poly, y_train)

# Retrieve the coefficients and intercept
coefficients = regressor.coef_
intercept = regressor.intercept_

# Retrieve the original feature names
original_feature_names = features.columns

# Name every column produced by the polynomial expansion: degree-1 columns
# keep the plain feature name, higher-degree columns are written as
# "name^power" factors joined by "*".
feature_names = list(original_feature_names)
for feature_idx in poly.powers_:
    if np.sum(feature_idx) > 1:
        feature_names.append(
            "*".join(
                f"{name}^{power}"
                for name, power in zip(original_feature_names, feature_idx)
                if power > 0
            )
        )

# Build the equation: intercept first, then one term per coefficient.
# The earlier loop substituted the intercept for the term at index 0, which
# silently dropped the first feature's coefficient (the expansion is built
# with include_bias=False, so coefficient 0 belongs to a real feature).
equation = "G1= " + f"{intercept:.2f}" + "".join(
    f" + {coefficient:.2f} * {feature_name}"
    for coefficient, feature_name in zip(coefficients, feature_names)
)

print("Equation:", equation)


# Predict 'G1' values using the trained model
y_train_pred = regressor.predict(x_train_poly)
y_test_pred = regressor.predict(x_test_poly)

# Print the predicted 'G1' values
print("Predicted 'G1' values for training set:", y_train_pred)
print("Predicted 'G1' values for test set:", y_test_pred)

import matplotlib.pyplot as plt

# Test-set predictions under the name the plot uses; reusing y_test_pred
# avoids predicting the same rows a second time.
y_pred = y_test_pred

# Plot the actual G1 values against the predicted G1 values
plt.scatter(y_test, y_pred)

# Dashed "perfect prediction" diagonal. Both axes are in target units, so
# both extents come from y_test; the x-extent used to be taken from the
# feature matrix x_test, which put the line in the wrong place.
target_min, target_max = np.min(y_test), np.max(y_test)
plt.plot([target_min, target_max], [target_min, target_max], color='red', linestyle='--')
plt.xlabel("Actual G1")
plt.ylabel("Predicted G1")
plt.title("Polynomial Regression: Actual vs Predicted G1")
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Restrict the frame to the columns of interest.
selected_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2', 'G3',
]
data = data[selected_columns]

# Integer-encode every column (the encoder is refitted for each column).
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# Names of all encoded columns, including the target.
column_names = data_encoded.columns

# One scatter plot of G1 against every other column.
for column in column_names:
    if column == 'G1':
        continue
    plt.scatter(data_encoded[column], data_encoded['G1'])
    plt.title(f'G1 vs {column}')
    plt.xlabel(column)
    plt.ylabel('G1')
    plt.show()

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Read the CSV file into a pandas DataFrame


# Separate the features (input variables) and the target variable.
# 'G1' is the target and is therefore NOT part of X: the previous column list
# included it, so the model was fitted on the very value it predicts.
X = data[[
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G2',
]]
y = data["G1"]

# Handle missing values in y (mean imputation on a single-column view)
imputer = SimpleImputer(strategy='mean')
y_imputed = imputer.fit_transform(y.values.reshape(-1, 1)).flatten()

# Preprocess and encode non-numeric columns in X using LabelEncoder.
# Work on a copy so `data` itself is left untouched.
non_numeric_cols = ['school','sex','address','famsize','Pstatus','Mjob','Fjob','reason','guardian','schoolsup','famsup','paid','activities','nursery','higher','internet','romantic']
encoder = LabelEncoder()
X_encoded = X.copy()
for col in non_numeric_cols:
    X_encoded[col] = encoder.fit_transform(X[col])

# Handle missing values in X (the same imputer object is refitted on X)
X_imputed = imputer.fit_transform(X_encoded)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Create an instance of the Linear Regression model
model = LinearRegression()

# Fit the model to the scaled data
model.fit(X_scaled, y_imputed)

# Make predictions for the same rows the model was fitted on
predictions = model.predict(X_scaled)
print("PREDICTIONS", predictions)
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Restrict the frame to the columns of interest.
selected_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2', 'G3',
]
data = data[selected_columns]

# Replace each categorical column with integer codes; one LabelEncoder is
# refitted column by column, in this fixed order.
label_encoder = LabelEncoder()
for categorical_column in [
    "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob",
    "reason", "guardian", "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic",
]:
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# G1 is the target; everything else is a predictor.
target = data["G1"]
features = data.drop(columns="G1")

# Ordinal-encode all predictor columns.
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Mean-impute the predictors, then refit the same imputer on the target.
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features_encoded)
target_as_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_as_column).flatten()

# Hold out 10% of the rows for testing (unseeded split).
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1
)

# Fit a linear regression on the training rows.
linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)

# Score on the held-out rows and report it.
acc = linear.score(x_test, y_test)
print("Accuracy:", acc)

# Report the fitted parameters.
print("Coefficients:", linear.coef_)
print("Intercept:", linear.intercept_)

# Spell the model out as "(coef) * feature + ... + (intercept)".
features_names = features.columns
equation_terms = [
    f"({linear.coef_[position]:.2f}) * {feature_name} + "
    for position, feature_name in enumerate(features_names)
]
linear_equation = "G1 = " + "".join(equation_terms) + f"({linear.intercept_:.2f})"
print("Linear Equation:", linear_equation)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Restrict the frame to the columns of interest.
selected_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2', 'G3',
]
data = data[selected_columns]

# First pass: stringify every object-dtype column ...
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# ... then integer-encode whatever is still reported as object dtype.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

# Second pass: encode the known categorical columns by name with a fresh
# encoder, in this fixed order.
label_encoder = LabelEncoder()
for categorical_column in [
    "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob",
    "reason", "guardian", "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic",
]:
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# G1 is the target; everything else is a predictor.
target = data["G1"]
features = data.drop(columns="G1")

# Mean-impute missing predictor values.
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Hold out 10% of the rows for testing (unseeded split).
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target, test_size=0.1
)

# Degree-2 polynomial expansion, fitted on the training rows only.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Fit a linear regression on the expanded features.
regressor = LinearRegression()
regressor.fit(x_train_poly, y_train)

# Predictions for both portions.
y_train_pred = regressor.predict(x_train_poly)
y_test_pred = regressor.predict(x_test_poly)

# Predictor names, in the same order as the columns of x_train / x_test.
column_names = features.columns

# For every predictor, plot predicted G1 against that predictor's values:
# training rows in blue, test rows in red.
for column_index, column in enumerate(column_names):
    train_values = x_train[:, column_index]
    test_values = x_test[:, column_index]

    plt.scatter(train_values, y_train_pred, color='blue', label='Training Data')
    plt.scatter(test_values, y_test_pred, color='red', label='Testing Data')

    plt.title(f'Predicted G1 vs {column}')
    plt.xlabel(column)
    plt.ylabel('Predicted G1')
    plt.legend()
    plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Column whose values are compared across rows.
column_name = "G1"
column_values = data[column_name]

# Express every value as a percentage of the column's maximum.
max_value = column_values.max()
percentage_values = column_values / max_value * 100

# Line plot of the percentages, one marker per row.
plt.figure(figsize=(10, 6))
plt.plot(percentage_values, marker='o')
plt.title(f"Comparison of {column_name} Percentage Across Rows")
plt.xlabel("Row Index")
plt.ylabel(f"{column_name} Percentage")
plt.tight_layout()

plt.show()






import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Restrict the frame to the columns of interest.
selected_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2', 'G3',
]
data = data[selected_columns]

# Replace each categorical column with integer codes; one LabelEncoder is
# refitted column by column, in this fixed order.
label_encoder = LabelEncoder()
for categorical_column in [
    "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob",
    "reason", "guardian", "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic",
]:
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# 'sex' is the target; everything else is a predictor.
target = data["sex"]
features = data.drop(columns="sex")

# Ordinal-encode all predictor columns.
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Fill gaps with the most frequent value: first in the predictors, then
# (refitting the same imputer) in the target.
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features_encoded)
target_as_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_as_column).flatten()

# Hold out 10% of the rows for testing (unseeded split).
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1
)

# Fit a linear regression on the training rows.
linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)

# Score on the held-out rows and report it.
acc = linear.score(x_test, y_test)
print("Accuracy:", acc)

# Report the fitted parameters.
print("Coefficients:", linear.coef_)
print("Intercept:", linear.intercept_)

# Spell the model out as "(coef) * feature + ... + (intercept)".
features_names = features.columns
equation_terms = [
    f"({linear.coef_[position]:.2f}) * {feature_name} + "
    for position, feature_name in enumerate(features_names)
]
linear_equation = "sex =" + "".join(equation_terms) + f"({linear.intercept_:.2f})"
print("Linear Equation:", linear_equation)
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Predictors: every selected column except the 'sex' target.
predictor_columns = [
    'school', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob',
    'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
    'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc',
    'health', 'absences', 'G1', 'G2', 'G3',
]
X = data[predictor_columns]
y = data["sex"]

# Ordinal-encode all predictor columns.
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Fill missing predictor values with the most frequent value per column.
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X_encoded)

# Standardise the predictors.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Ordinal-encode the target through a single-column view, then flatten it
# back to one dimension.
encoder_y = OrdinalEncoder()
y_as_column = y.values.reshape(-1, 1)
y_encoded = encoder_y.fit_transform(y_as_column).flatten()

# Fit a linear regression on the scaled predictors.
model = LinearRegression()
model.fit(X_scaled, y_encoded)

# Predict for the same rows the model was fitted on.
predictions = model.predict(X_scaled)
print("PREDICTIONS:", predictions)
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Restrict the frame to the columns of interest.
selected_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2', 'G3',
]
data = data[selected_columns]

# Replace each categorical column with integer codes; one LabelEncoder is
# refitted column by column, in this fixed order.
label_encoder = LabelEncoder()
for categorical_column in [
    "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob",
    "reason", "guardian", "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic",
]:
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# 'sex' is the target; everything else is a predictor.
target = data["sex"]
features = data.drop(columns="sex")

# Ordinal-encode all predictor columns.
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Fill gaps with the most frequent value: first in the predictors, then
# (refitting the same imputer) in the target.
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features_encoded)
target_as_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_as_column).flatten()

# Hold out 10% of the rows for testing (unseeded split).
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1
)

# Fit a linear regression on the training rows.
linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)

# Score on the rows the model was fitted on.
train_acc = linear.score(x_train, y_train)
print("Training Accuracy:", train_acc)

# Score on the held-out rows.
test_acc = linear.score(x_test, y_test)
print("Test Accuracy:", test_acc)
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# Restrict the frame to the columns of interest.
selected_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2', 'G3',
]
data = data[selected_columns]

# Integer-encode every column into a separate frame used for modelling
# (the encoder is refitted for each column).
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# Additionally encode the categorical columns of `data` in place with a
# fresh encoder, in this fixed order.
label_encoder = LabelEncoder()
for categorical_column in [
    "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob",
    "reason", "guardian", "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic",
]:
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# 'sex' is the target; the remaining encoded columns are predictors.
target = data_encoded["sex"]
features = data_encoded.drop(columns="sex")

# Mean-impute missing predictor values.
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Hold out 10% of the rows for testing (unseeded split).
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target, test_size=0.1
)

# Degree-2 polynomial expansion, fitted on the training rows only.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Fit a linear regression on the expanded features.
regressor = LinearRegression()
regressor.fit(x_train_poly, y_train)

# Fitted parameters, kept for building the equation string.
intercept = regressor.intercept_
coefficients = regressor.coef_

# Retrieve the original feature names
original_feature_names = features.columns

# Name every column produced by the polynomial expansion: degree-1 columns
# keep the plain feature name, higher-degree columns are written as
# "name^power" factors joined by "*".
feature_names = list(original_feature_names)
for feature_idx in poly.powers_:
    if np.sum(feature_idx) > 1:
        feature_names.append(
            "*".join(
                f"{name}^{power}"
                for name, power in zip(original_feature_names, feature_idx)
                if power > 0
            )
        )

# Build the equation: intercept first, then one term per coefficient.
# The earlier loop substituted the intercept for the term at index 0, which
# silently dropped the first feature's coefficient (the expansion is built
# with include_bias=False, so coefficient 0 belongs to a real feature).
equation = "sex = " + f"{intercept:.2f}" + "".join(
    f" + {coefficient:.2f} * {feature_name}"
    for coefficient, feature_name in zip(coefficients, feature_names)
)

print("Equation:", equation)
# Predict 'sex' values using the trained model
y_train_pred = regressor.predict(x_train_poly)
y_test_pred = regressor.predict(x_test_poly)

# Print the predicted 'sex' values
print("Predicted 'sex' values for training set:", y_train_pred)
print("Predicted 'sex' values for test set:", y_test_pred)

# Test-set predictions under the name the plots use; reusing y_test_pred
# avoids predicting the same rows a second time.
y_pred = y_test_pred

# Plot the actual sex values against the predicted sex values, with a dashed
# "perfect prediction" diagonal spanning the range of the actual values.
plt.scatter(y_test, y_pred)
target_min, target_max = np.min(y_test), np.max(y_test)
plt.plot([target_min, target_max], [target_min, target_max], color='red', linestyle='--')
plt.xlabel("Actual sex")  # was "Actualsex" (missing space)
plt.ylabel("Predicted sex")
plt.title("Polynomial Regression: Actual vs Predicted sex")
plt.show()
# Create scatter plots between predicted sex and other columns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import matplotlib.pyplot as plt

# Uses `regressor`, `x_test_poly` and `features` as left behind by the
# polynomial-regression steps that ran before this point.

# Predictions for the held-out rows.
y_pred = regressor.predict(x_test_poly)

# One scatter plot per original feature: the column of x_test_poly at the
# feature's position on the x-axis, the predicted value on the y-axis.
for col_idx, col_name in enumerate(features.columns):
    column_values = x_test_poly[:, col_idx]
    plt.scatter(column_values, y_pred, label="Predicted sex")
    plt.title(f"Predicted sex vs {col_name}")
    plt.xlabel(col_name)
    plt.ylabel("Predicted sex")
    plt.legend()
    plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Restrict the frame to the columns of interest.
data = data[[
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2', 'G3',
]]

# Integer-encode every column. DataFrame.apply refits this one encoder per
# column, so afterwards it only remembers the classes of the LAST column and
# cannot be used to decode 'sex'.
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# Dedicated encoder for the target, used only to turn the codes 0/1 back
# into the values found in data["sex"] for the tick labels.
sex_encoder = LabelEncoder().fit(data["sex"])
sex_tick_labels = sex_encoder.inverse_transform([0, 1])

# Define features and target variable
features = data_encoded.drop("sex", axis=1)
target = data_encoded["sex"]

# Handle missing values in the features
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Split the data into training and testing sets (10% held out, unseeded)
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1)

# Fit a logistic regression on the training rows
classifier = LogisticRegression()
classifier.fit(x_train, y_train)

# Predicted 'sex' code for EVERY row (training and test alike); the variable
# name is kept for compatibility with code that may read it later.
y_train_pred = classifier.predict(features_imputed)

# Retrieve the column names
column_names = features.columns

# For each feature, draw one bar chart per distinct value showing how many
# rows with that value were predicted as each sex.
for column in column_names:
    # Pair each raw value with its integer code. Rows are matched on the
    # code, because `features` holds codes; matching raw values against
    # codes, as was done before, produced wrong or empty counts. The raw
    # value is kept only for the subplot title.
    value_pairs = pd.DataFrame(
        {"raw": data[column].to_numpy(), "code": features[column].to_numpy()}
    ).drop_duplicates()
    unique_values = value_pairs["raw"].to_numpy()
    num_unique = len(unique_values)

    plt.figure(figsize=(12, 6))
    for i, (value, code) in enumerate(value_pairs.itertuples(index=False, name=None)):
        has_value = features[column] == code
        plt.subplot(1, num_unique, i+1)
        plt.bar(
            [0, 1],
            [np.sum(has_value & (y_train_pred == 0)),
             np.sum(has_value & (y_train_pred == 1))],
            color=['blue', 'red'],
        )
        plt.xlabel("sex")
        plt.ylabel("Count")
        plt.title(f"{column} = {value}")
        plt.xticks([0, 1], sex_tick_labels)
    plt.tight_layout()
    plt.show()

import pandas as pd
import matplotlib.pyplot as plt

# Column whose value distribution is plotted.
column_name = "sex"

# Number of rows per distinct value of that column.
value_counts = data[column_name].value_counts()

# Bar chart of the counts.
plt.figure(figsize=(10, 6))
value_counts.plot.bar()
plt.title(f"Distribution of {column_name} Across Rows")
plt.xlabel("sex")
plt.ylabel("Count")
plt.tight_layout()

LINEAR-REG

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Load the landing-facilities CSV (latin1-encoded).
file_path = "/kaggle/input/landing-aircraft/Landing.csv"
data = pd.read_csv(file_path, encoding='latin1')

# Stringify every object-dtype column ...
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# ... then replace whatever is still object dtype with integer codes
# (one LabelEncoder, refitted per column).
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# JOINT_USE is the target; every other column is a predictor.
target = data["JOINT_USE"]
features = data.drop(columns="JOINT_USE")


# Ordinal-encode all predictor columns.
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Stringify every object-dtype column that is still present.
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Columns to ordinal-encode in place (this list includes the JOINT_USE target).
columns_to_encode = [
    'index', 'X', 'Y', 'FID', 'SITE_NO', 'LAN_FA_TY', 'LOCID', 'EFF_DATE',
    'FAA_REGION', 'FAA_DISTRI', 'ST_POSTAL', 'STFIPS', 'FAA_ST',
    'STATE_NAME', 'COUNTY_NAM', 'COUNTY_ST', 'CITY_NAME', 'FULLNAME',
    'OWNER_TYPE', 'FAC_USE', 'FAC_CYSTZP', 'LATITUDE', 'LONGITUDE', 'ELEV',
    'AERO_CHART', 'CBD_DIST', 'CBD_DIR', 'ACT_DATE', 'CERT_TYPE',
    'FED_AGREE', 'INTERNATIO', 'CUST_LNDG', 'JOINT_USE', 'MIL_LNDG_R',
    'CNTL_TWR', 'S_ENG_GA', 'M_ENG_GA', 'JET_EN_GA', 'HELICOPTER',
    'OPER_GLIDE', 'OPER_MIL', 'ULTRALIGHT', 'COMM_SERV', 'AIR_TAXI',
    'LOCAL_OPS', 'ITIN_OPS', 'MIL_OPS', 'Arrivals', 'Departures',
    'Enplanemen', 'Passengers',
]

# Fit one OrdinalEncoder on those columns and write the codes back.
encoder = OrdinalEncoder()
encoded_block = encoder.fit_transform(data[columns_to_encode])
data[columns_to_encode] = encoded_block

# JOINT_USE is the target; every other column is a predictor.
target = data["JOINT_USE"]
features = data.drop(columns="JOINT_USE")

# Show the first rows of the transformed frame.
print(data.head())
# Fill gaps with the most frequent value: first in the encoded predictors,
# then (refitting the same imputer) in the target.
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features_encoded)
target_as_column = target.values.reshape(-1, 1)
target_imputed = imputer.fit_transform(target_as_column).flatten()

# Hold out 10% of the rows for testing (unseeded split).
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1
)

# Fit a logistic regression with default settings on the training rows.
logistic_reg = LogisticRegression()
logistic_reg.fit(x_train, y_train)

# Score on the held-out rows and report it.
accuracy = logistic_reg.score(x_test, y_test)
print("Accuracy:", accuracy)

# First row of the coefficient matrix (as a plain list) and the matching
# intercept.
intercept = logistic_reg.intercept_[0]
coefficients = logistic_reg.coef_[0].tolist()

# Report the fitted parameters.
print("Coefficients:", coefficients)
print("Intercept:", intercept)
import numpy as np

# Logistic (sigmoid) response for a linear model
def logistic_equation(features, coefficients, intercept):
    """Return the sigmoid of ``dot(features, coefficients) + intercept``.

    Args:
        features: Array-like passed to ``np.dot`` as the left operand.
        coefficients: Array-like passed to ``np.dot`` as the right operand.
        intercept: Value added to the dot product.

    Returns:
        ``1 / (1 + exp(-z))`` where ``z`` is the linear score.
    """
    linear_score = intercept + np.dot(features, coefficients)
    denominator = 1 + np.exp(-linear_score)
    return 1 / denominator

# Refit a default LogisticRegression on the training split so this part can
# run on its own.
logistic_reg = LogisticRegression()
logistic_reg.fit(x_train, y_train)

# First row of coef_ / first entry of intercept_ (for a binary target these
# are the model's only parameters).
coefficients = logistic_reg.coef_[0]
intercept = logistic_reg.intercept_[0]

# Column labels of the feature frame.
# NOTE(review): assumes the imputer kept every column, so labels and
# coefficients line up one-to-one - confirm.
feature_names = data.drop("JOINT_USE", axis=1).columns

# Build "intercept + coef*feature + ..." and wrap it in the sigmoid notation.
# The prefix opens three parentheses, so the suffix must close three; the
# intercept is rounded to four decimals like the coefficients.
linear_terms = [f"{intercept:.4f}"]
linear_terms.extend(
    f"{coef:.4f}*{feature}" for feature, coef in zip(feature_names, coefficients)
)
logistic_eq_str = "P(JOINT_USE=1) = 1 / (1 + e^(-(" + " + ".join(linear_terms) + ")))"

print("Logistic Equation:")
print(logistic_eq_str)

LOGISTIC

In [None]:
# @title ROLLING-WINDOWS
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the dataset; the file is decoded as latin1.
file_name='/kaggle/input/air-damage/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Cast object-dtype columns to plain strings so each one holds a single type.
# NOTE(review): missing values presumably become the text 'nan' here - confirm.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model', 'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage', 'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage', 'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage', 'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage']

# A single LabelEncoder is reused for every column.
encoder = LabelEncoder()

# Replace each listed column with integer codes. fit_transform refits the
# encoder on every iteration, so afterwards `encoder` only describes the last
# column processed.
# NOTE(review): columns_to_encode also lists columns that look numeric
# (e.g. 'Height', 'Speed'); they would be replaced by category codes - confirm.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Features are every column except "Aircraft Damage", which is the target.
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Fill missing feature values with the column median.
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# Same imputer object, refit on the target reshaped to one column and
# flattened back to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Standardize the features. The scaler is fit on ALL rows, before the split.
# NOTE(review): test rows therefore influence the scaling statistics - confirm
# this is acceptable for the reported metrics.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Hold out 10% of the rows; fixed seed for a reproducible split.
x_train, x_test, y_train, y_test = train_test_split(features_scaled, target_imputed, test_size=0.1, random_state=42)

# Small fully connected network: two ReLU hidden layers (32 and 16 units) and
# a single output unit with no activation.
model = keras.Sequential([
    layers.Input(shape=(x_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1)
])

# Adam optimizer with mean-squared-error loss (a regression setup).
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train for 20 epochs; 20% of the training split is used for validation.
history = model.fit(x_train, y_train, validation_split=0.2, epochs=20, batch_size=64, verbose=1)

# Predict for every row of the dataset (training rows included), as a 1-D array.
y_pred = model.predict(features_scaled).flatten()

# Rolling window setup
window_size = 100  # number of consecutive predictions averaged per window
step_size = 10  # distance between the starts of consecutive windows
num_windows = (len(features_scaled) - window_size) // step_size + 1

# Start index of every window that fits completely inside y_pred
# (empty when there are fewer than window_size predictions).
window_starts = np.arange(0, len(y_pred) - window_size + 1, step_size)

# Mean prediction of each window.
rolling_predictions = np.array(
    [y_pred[start:start + window_size].mean() for start in window_starts]
)

# Compare each window mean with the target at the centre of that same window.
# Windows advance by step_size rows, so the centres do too; taking consecutive
# target rows would pair every window after the first with the wrong row.
window_centers = window_starts + window_size // 2
true_values = target_imputed[window_centers]

# Evaluate the rolling predictions against the aligned targets.
mse = mean_squared_error(true_values, rolling_predictions)
r2 = r2_score(true_values, rolling_predictions)

print(f"Rolling Window ANN Mean Squared Error: {mse}")
print(f"Rolling Window ANN R-squared Score: {r2}")

# Visualize the aligned true values and the rolling predictions.
plt.figure(figsize=(12, 6))
plt.plot(true_values, label='True Values')
plt.plot(rolling_predictions, label='Rolling Predictions', alpha=0.7)
plt.legend()
plt.title('Rolling Window Predictions')
plt.show()


ROLLING-WINDOWS

In [None]:
# @title  RANDOM-FOREST
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset; the file is decoded as latin1.
file_name='/kaggle/input/air-damage/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Cast object-dtype columns to plain strings so each one holds a single type.
# NOTE(review): missing values presumably become the text 'nan' here - confirm.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model', 'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage', 'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage', 'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage', 'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage']

# A single LabelEncoder is reused for every column.
encoder = LabelEncoder()

# Replace each listed column with integer codes. The encoder is refit per
# column, so afterwards it only describes the last column processed.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Features are every column except "Aircraft Damage", which is the target.
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Fill missing feature values with the column median.
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# Standardize the features; the scaler is fit on all rows, before the split.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Binary target: 1 where the encoded damage code is above 0, else 0.
# NOTE(review): code 0 is whichever class the encoder sorted first - confirm
# that it corresponds to "no damage".
target_binary = (target > 0).astype(int)

# Hold out 10% of the rows; fixed seed for a reproducible split.
x_train, x_test, y_train, y_test = train_test_split(features_scaled, target_binary, test_size=0.1, random_state=42)

# Fit a 100-tree random forest on the training split (seeded).
# The held-out split is not evaluated in this part of the notebook.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)

# Importance per feature, and the feature positions sorted from most to
# least important.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = features.columns

# Print rank, column position, importance and column name for every feature.
print("Feature ranking:")
for f in range(x_train.shape[1]):
    print(f"{f + 1}. feature {indices[f]} ({importances[indices[f]]}) - {feature_names[indices[f]]}")

# Bar chart of the importances in descending order, labelled by column name.
plt.figure(figsize=(15, 10))
plt.title("Feature importances")
plt.bar(range(x_train.shape[1]), importances[indices], align="center")
plt.xticks(range(x_train.shape[1]), feature_names[indices], rotation=90)
plt.xlim([-1, x_train.shape[1]])
plt.show()

# Names of the five most important features.
important_features = feature_names[indices[:5]]  # Top 5 important features

# One scatter plot per top feature: encoded feature value against the encoded
# (non-binarized) damage target.
for feature in important_features:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=data[feature], y=target, alpha=0.5)
    plt.title(f'{feature} vs Aircraft Damage')
    plt.xlabel(feature)
    plt.ylabel('Aircraft Damage')
    plt.show()

# Pairwise correlations over the columns of the encoded dataframe
# (target column included).
corr_matrix = data.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Load the dataset; the file is decoded as latin1.
file_name='/kaggle/input/air-damage/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Cast object-dtype columns to plain strings so each one holds a single type.
# NOTE(review): missing values presumably become the text 'nan' here - confirm.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model', 'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage', 'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage', 'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage', 'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage']

# A single LabelEncoder is reused for every column.
encoder = LabelEncoder()

# Replace each listed column with integer codes. The encoder is refit per
# column, so afterwards it only describes the last column processed.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Features are every column except "Aircraft Damage", which is the target.
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Fill missing feature values with the column median.
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# Standardize the features; the scaler is fit on all rows, before the split.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Binary target: 1 where the encoded damage code is above 0, else 0.
# NOTE(review): code 0 is whichever class the encoder sorted first - confirm
# that it corresponds to "no damage".
target_binary = (target > 0).astype(int)

# Hold out 10% of the rows; fixed seed for a reproducible split.
x_train, x_test, y_train, y_test = train_test_split(features_scaled, target_binary, test_size=0.1, random_state=42)

# Define the flag and pennant detection function
def detect_patterns(data, window_size=20):
    """Flag window starts whose value range exceeds twice the window's std.

    For every start index ``i`` in ``range(len(data) - window_size)`` the
    window ``data[i:i + window_size]`` is inspected; position ``i`` of the
    result is 1 when ``max - min > 2 * std`` for that window and 0 otherwise.
    Positions with no window of their own (the tail of the series) stay 0.

    Args:
        data: One-dimensional numeric sequence or array.
        window_size: Number of consecutive samples per window.

    Returns:
        Integer NumPy array with exactly ``len(data)`` entries. This also
        holds when ``data`` is shorter than ``window_size`` (all zeros), so
        the result always lines up with the input.
    """
    values = np.asarray(data)
    patterns = np.zeros(len(values), dtype=int)

    # A negative count (input shorter than one window) yields an empty range.
    for start in range(len(values) - window_size):
        window = values[start:start + window_size]
        # Simple heuristic: a range that is large relative to the spread
        # marks a strong movement inside the window.
        if window.max() - window.min() > np.std(window) * 2:
            patterns[start] = 1

    return patterns

# Run detect_patterns down every column (axis 0) of the scaled feature matrix.
pattern_features = np.apply_along_axis(detect_patterns, 0, features_scaled)

# Train a random forest on the scaled features against the binary damage target.
# NOTE(review): pattern_features is not passed to the classifier; it is only
# used in the plot at the end of this part - confirm whether it was meant to
# be a model input or the prediction target.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)

# Accuracy on the held-out split. Because y_test is the binary damage target,
# this is damage-classification accuracy despite the printed label.
y_pred = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Pattern Prediction Accuracy: {accuracy}")

# Plot the first scaled feature together with its 0/1 pattern flags.
plt.figure(figsize=(12, 6))
plt.plot(range(len(features_scaled)), features_scaled[:, 0], label='Feature 0')
plt.plot(range(len(features_scaled)), pattern_features[:, 0], label='Detected Patterns', linestyle='--', color='red')
plt.legend()
plt.title('Detected Flag and Pennant Patterns for Feature 0')
plt.show()
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset; the file is decoded as latin1.
file_name = '/kaggle/input/air-damage/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Cast object-dtype columns to plain strings so each one holds a single type.
# NOTE(review): missing values presumably become the text 'nan' here - confirm.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass', 'Engine Make', 'Engine Model', 'Engines', 'Engine Type', 'Engine1 Position', 'Engine2 Position', 'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport', 'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation', 'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity', 'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike', 'Radome Damage', 'Windshield Strike', 'Windshield Damage', 'Nose Strike', 'Nose Damage', 'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage', 'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage', 'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike', 'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike', 'Landing Gear Damage', 'Tail Strike', 'Tail Damage', 'Lights Strike', 'Lights Damage', 'Other Strike', 'Other Damage']

# A single LabelEncoder is reused for every column.
encoder = LabelEncoder()

# Replace each listed column with integer codes. The encoder is refit per
# column, so afterwards it only describes the last column processed.
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Features are every column except "Aircraft Damage", which is the target.
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Fill missing feature values with the column median.
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# Standardize the features; the scaler is fit on all rows, before the split.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Binary target: 1 where the encoded damage code is above 0, else 0.
# NOTE(review): code 0 is whichever class the encoder sorted first - confirm
# that it corresponds to "no damage".
target_binary = (target > 0).astype(int)

# Hold out 10% of the rows; fixed seed for a reproducible split.
x_train, x_test, y_train, y_test = train_test_split(features_scaled, target_binary, test_size=0.1, random_state=42)

# Fit a 100-tree random forest on the training split (seeded).
# The held-out split is not evaluated in this part of the notebook.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)

# Importance per feature, and the feature positions sorted from most to
# least important.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = features.columns

# Print rank, column position, importance and column name for every feature.
print("Feature ranking:")
for f in range(x_train.shape[1]):
    print(f"{f + 1}. feature {indices[f]} ({importances[indices[f]]}) - {feature_names[indices[f]]}")

# Bar chart of the importances in descending order, labelled by column name.
plt.figure(figsize=(15, 10))
plt.title("Feature importances")
plt.bar(range(x_train.shape[1]), importances[indices], align="center")
plt.xticks(range(x_train.shape[1]), feature_names[indices], rotation=90)
plt.xlim([-1, x_train.shape[1]])
plt.show()

# Names of the five most important features.
important_features = feature_names[indices[:5]]  # Top 5 important features

# One scatter plot per top feature: encoded feature value against the encoded
# (non-binarized) damage target.
for feature in important_features:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=data[feature], y=target, alpha=0.5)
    plt.title(f'{feature} vs Aircraft Damage')
    plt.xlabel(feature)
    plt.ylabel('Aircraft Damage')
    plt.show()

# Pairwise correlations over the columns of the encoded dataframe
# (target column included).
corr_matrix = data.corr()

# Dump the matrix as text in addition to the heatmap below.
print("Correlation Matrix:")
print(corr_matrix)

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

RANDOM-FOREST

In [None]:
# @title  POWER-LAW
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns

# Load the dataset; the file is decoded as latin1.
file_path = "/kaggle/input/air-damage/DAMAGE.csv"
data = pd.read_csv(file_path, encoding='latin1')

# Cast object-dtype columns to plain strings (not to a pandas categorical
# dtype) so each column holds a single type before encoding.
# NOTE(review): missing values presumably become the text 'nan' here - confirm.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
                     'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type',
                     'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
                     'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
                     'Engine1 Position', 'Engine2 Position', 'Engine3 Position',
                     'Engine4 Position', 'Airport ID', 'Airport', 'State',
                     'FAA Region', 'Warning Issued', 'Flight Phase',
                     'Visibility', 'Precipitation', 'Height', 'Speed',
                     'Distance', 'Species ID', 'Species Name',
                     'Species Quantity', 'Flight Impact', 'Fatalities',
                     'Injuries', 'Radome Strike', 'Windshield Strike',
                     'Nose Strike', 'Engine1 Strike', 'Engine2 Strike',
                     'Engine3 Strike', 'Engine4 Strike', 'Engine Ingested',
                     'Propeller Strike', 'Wing or Rotor Strike',
                     'Fuselage Strike', 'Landing Gear Strike',
                     'Tail Strike', 'Lights Strike', 'Other Strike']

# Encode all listed columns in one call and write the codes back in place.
# Columns not named in columns_to_encode (the target among them) keep their
# loaded values.
encoder = OrdinalEncoder()
data[columns_to_encode] = encoder.fit_transform(data[columns_to_encode])

# Features are every column except "Aircraft Damage", which is the target.
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Fill missing feature values with each column's most frequent value.
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features)

# Same imputer object, refit on the target reshaped to one column and
# flattened back to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Hold out 10% of the rows for testing. The seed makes the split (and the
# accuracy / coefficients printed below) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1, random_state=42
)

# Logistic regression with scikit-learn's default settings.
# NOTE(review): the features are unscaled, so the default iteration limit may
# trigger a convergence warning - consider raising max_iter.
logistic_reg = LogisticRegression()
logistic_reg.fit(x_train, y_train)

# Mean accuracy on the held-out split.
accuracy = logistic_reg.score(x_test, y_test)
print("Accuracy:", accuracy)

# First row of coef_ / first entry of intercept_ (for a binary target these
# are the model's only parameters).
coefficients = logistic_reg.coef_[0]
intercept = logistic_reg.intercept_[0]

# Column labels of the feature frame.
# NOTE(review): assumes the imputer kept every column, so labels and
# coefficients line up one-to-one - confirm.
feature_names = data.drop("Aircraft Damage", axis=1).columns

# Build "intercept + coef*feature + ..." and wrap it in the sigmoid notation.
# The prefix opens three parentheses, so the suffix must close three.
linear_terms = [f"{intercept:.4f}"]
linear_terms.extend(
    f"{coef:.4f}*{feature}" for feature, coef in zip(feature_names, coefficients)
)
logistic_eq_str = "P(Aircraft Damage=1) = 1 / (1 + e^(-(" + " + ".join(linear_terms) + ")))"

print("Logistic Equation:")
print(logistic_eq_str)

# Density histogram (with KDE overlay) of the imputed target values.
plt.figure(figsize=(10, 6))
sns.histplot(target_imputed, bins=50, kde=True, stat="density")
plt.title('Distribution of Aircraft Damage')
plt.xlabel('Aircraft Damage')
plt.ylabel('Density')
plt.show()

# Log-log tail plot used as a visual check for power-law behaviour
def plot_power_law(data, title='Power Law Distribution'):
    """Scatter a reverse-cumulated 50-bin density histogram on log-log axes.

    Args:
        data: Numeric array-like that supports boolean-mask indexing (the
            calls in this notebook pass NumPy arrays and pandas Series).
            NaN entries are dropped before anything else happens.
        title: Title of the figure.

    Returns:
        None. When no non-NaN value remains a message is printed and no
        figure is created; otherwise a new figure is drawn and shown.
    """
    non_nan = data[~np.isnan(data)]
    if len(non_nan) < 1:
        print("No valid data available for plotting.")
        return

    # Per-bin density, then for every bin the sum from that bin to the last.
    density, edges = np.histogram(non_nan, bins=50, density=True)
    tail_sum = np.flip(np.cumsum(np.flip(density)))

    # NOTE(review): points sit at each bin's upper edge and the densities are
    # summed without the bin width, so the y-axis is a scaled tail sum rather
    # than a normalized probability - confirm this is intended.
    plt.figure(figsize=(10, 6))
    plt.loglog(edges[1:], tail_sum, marker='o', linestyle='None')
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Cumulative Frequency')
    plt.grid(True)
    plt.show()

# Tail plot for the imputed target.
plot_power_law(target_imputed, title='Cumulative Distribution of Aircraft Damage')

# Tail plot for every feature column, one figure per column. The columns come
# from `features` (before imputation); plot_power_law drops NaN itself.
for column in features.columns:
    feature_data = features[column]
    plot_power_law(feature_data, title=f'Cumulative Distribution of {column}')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns

# Load the flights sample from a hard-coded local path (decoded as latin1).
# NOTE(review): this is a machine-specific Windows path; the cell cannot run
# elsewhere without editing it.
data = pd.read_csv('E:/URP-SHIT/transportation/flights_sample_3m.csv/flights_sample_3m_modified-2.csv', encoding='latin1')

# Cast each object-dtype column to strings, then replace it with integer
# codes from a fresh LabelEncoder (the fitted encoders are not kept).
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)
    data[column] = LabelEncoder().fit_transform(data[column])

# Numeric columns that may follow a power law; each one present in the data
# is replaced by a log-transformed copy.
powerlaw_candidates = ['DISTANCE', 'DELAY_DUE_CARRIER', 'DELAY_DUE_LATE_AIRCRAFT',
                       'DELAY_DUE_NAS', 'DELAY_DUE_WEATHER', 'AIR_TIME']

for col in powerlaw_candidates:
    if col in data.columns:
        # log1p(x) = log(1 + x), so zeros map to 0 instead of -inf.
        # NOTE(review): assumes values are >= 0; anything below -1 gives NaN.
        data[f'log_{col}'] = np.log1p(data[col])
        # The original column is always dropped; only log_<col> remains.
        data.drop(columns=col, inplace=True)

# Features are every column except "CANCELLED", which is the target.
features = data.drop("CANCELLED", axis=1)
target = data["CANCELLED"]

# Fill missing values with the most frequent value. The same imputer object
# is refit on the target, so afterwards it holds the target's statistics.
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features)
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Hold out 10% of the rows; fixed seed for a reproducible split.
x_train, x_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.1, random_state=42)

# Logistic regression with a raised iteration limit.
logistic_reg = LogisticRegression(max_iter=1000)
logistic_reg.fit(x_train, y_train)

# Mean accuracy on the held-out split.
accuracy = logistic_reg.score(x_test, y_test)
print("Accuracy:", accuracy)

# First row of coef_ / first entry of intercept_, plus the feature labels.
# NOTE(review): assumes the imputer kept every column, so labels and
# coefficients line up one-to-one - confirm.
coefficients = logistic_reg.coef_[0]
intercept = logistic_reg.intercept_[0]
feature_names = features.columns

# Human-readable equation: the prefix opens three parentheses and the suffix
# closes all three.
logistic_eq_str = f"P(CANCELLED=1) = 1 / (1 + e^(-({intercept:.4f}"
for feature, coef in zip(feature_names, coefficients):
    logistic_eq_str += f" + {coef:.4f}*{feature}"
logistic_eq_str += ")))"

print("\nLogistic Equation:")
print(logistic_eq_str)

# Density histogram (with KDE overlay) of the imputed target values.
plt.figure(figsize=(10, 6))
sns.histplot(target_imputed, bins=50, kde=True, stat="density")
plt.title('Distribution of CANCELLED')
plt.xlabel('CANCELLED')
plt.ylabel('Density')
plt.show()

# Log-log tail plot used as a visual check for power-law behaviour
def plot_power_law(data, title='Power Law Distribution'):
    """Scatter a reverse-cumulated 50-bin density histogram on log-log axes.

    Args:
        data: Numeric array-like that supports boolean-mask indexing (the
            calls in this notebook pass pandas Series). NaN entries are
            dropped before anything else happens.
        title: Title of the figure.

    Returns:
        None. When no non-NaN value remains a message is printed and no
        figure is created; otherwise a new figure is drawn and shown.
    """
    non_nan = data[~np.isnan(data)]
    if len(non_nan) < 1:
        print("No valid data available for plotting.")
        return

    # Per-bin density, then for every bin the sum from that bin to the last.
    density, edges = np.histogram(non_nan, bins=50, density=True)
    tail_sum = np.flip(np.cumsum(np.flip(density)))

    # NOTE(review): points sit at each bin's upper edge and the densities are
    # summed without the bin width, so the y-axis is a scaled tail sum rather
    # than a normalized probability - confirm this is intended.
    plt.figure(figsize=(10, 6))
    plt.loglog(edges[1:], tail_sum, marker='o', linestyle='None')
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Cumulative Frequency')
    plt.grid(True)
    plt.show()

# Tail plot for each log-transformed candidate column that exists in the
# data. Only the log_<col> columns are plotted here.
for col in powerlaw_candidates:
    if f'log_{col}' in data.columns:
        plot_power_law(data[f'log_{col}'], f'Log-Transformed Power Law: {col}')


POWER-LAW

In [None]:
# @title  LONG-TAIL
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns

# Load the dataset; the file is decoded as latin1.
file_path = "/kaggle/input/air-damage/DAMAGE.csv"
data = pd.read_csv(file_path, encoding='latin1')

# Cast object-dtype columns to plain strings (not to a pandas categorical
# dtype) so each column holds a single type before encoding.
# NOTE(review): missing values presumably become the text 'nan' here - confirm.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Define columns to encode
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day',
                     'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type',
                     'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
                     'Engine Make', 'Engine Model', 'Engines', 'Engine Type',
                     'Engine1 Position', 'Engine2 Position', 'Engine3 Position',
                     'Engine4 Position', 'Airport ID', 'Airport', 'State',
                     'FAA Region', 'Warning Issued', 'Flight Phase',
                     'Visibility', 'Precipitation', 'Height', 'Speed',
                     'Distance', 'Species ID', 'Species Name',
                     'Species Quantity', 'Flight Impact', 'Fatalities',
                     'Injuries', 'Radome Strike', 'Windshield Strike',
                     'Nose Strike', 'Engine1 Strike', 'Engine2 Strike',
                     'Engine3 Strike', 'Engine4 Strike', 'Engine Ingested',
                     'Propeller Strike', 'Wing or Rotor Strike',
                     'Fuselage Strike', 'Landing Gear Strike',
                     'Tail Strike', 'Lights Strike', 'Other Strike']

# Encode all listed columns in one call and write the codes back in place.
# Columns not named in columns_to_encode (the target among them) keep their
# loaded values.
encoder = OrdinalEncoder()
data[columns_to_encode] = encoder.fit_transform(data[columns_to_encode])

# Features are every column except "Aircraft Damage", which is the target.
features = data.drop("Aircraft Damage", axis=1)
target = data["Aircraft Damage"]

# Fill missing feature values with each column's most frequent value.
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = imputer.fit_transform(features)

# Same imputer object, refit on the target reshaped to one column and
# flattened back to 1-D.
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).flatten()

# Hold out 10% of the rows for testing. The seed makes the split (and the
# accuracy / coefficients printed below) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    features_imputed, target_imputed, test_size=0.1, random_state=42
)

# Logistic regression with scikit-learn's default settings.
# NOTE(review): the features are unscaled, so the default iteration limit may
# trigger a convergence warning - consider raising max_iter.
logistic_reg = LogisticRegression()
logistic_reg.fit(x_train, y_train)

# Mean accuracy on the held-out split.
accuracy = logistic_reg.score(x_test, y_test)
print("Accuracy:", accuracy)

# First row of coef_ / first entry of intercept_ (for a binary target these
# are the model's only parameters).
coefficients = logistic_reg.coef_[0]
intercept = logistic_reg.intercept_[0]

# Column labels of the feature frame.
# NOTE(review): assumes the imputer kept every column, so labels and
# coefficients line up one-to-one - confirm.
feature_names = data.drop("Aircraft Damage", axis=1).columns

# Build "intercept + coef*feature + ..." and wrap it in the sigmoid notation.
# The prefix opens three parentheses, so the suffix must close three.
linear_terms = [f"{intercept:.4f}"]
linear_terms.extend(
    f"{coef:.4f}*{feature}" for feature, coef in zip(feature_names, coefficients)
)
logistic_eq_str = "P(Aircraft Damage=1) = 1 / (1 + e^(-(" + " + ".join(linear_terms) + ")))"

print("Logistic Equation:")
print(logistic_eq_str)

# Density histogram (with KDE overlay) of the imputed target values.
plt.figure(figsize=(10, 6))
sns.histplot(target_imputed, bins=50, kde=True, stat="density")
plt.title('Distribution of Aircraft Damage')
plt.xlabel('Aircraft Damage')
plt.ylabel('Density')
plt.show()

# Log-log tail plot used as a visual check for power-law behaviour
def plot_power_law(data, title='Power Law Distribution'):
    """Scatter a reverse-cumulated 50-bin density histogram on log-log axes.

    Args:
        data: Numeric array-like that supports boolean-mask indexing (the
            calls in this notebook pass NumPy arrays and pandas Series).
            NaN entries are dropped before anything else happens.
        title: Title of the figure.

    Returns:
        None. When no non-NaN value remains a message is printed and no
        figure is created; otherwise a new figure is drawn and shown.
    """
    non_nan = data[~np.isnan(data)]
    if len(non_nan) < 1:
        print("No valid data available for plotting.")
        return

    # Per-bin density, then for every bin the sum from that bin to the last.
    density, edges = np.histogram(non_nan, bins=50, density=True)
    tail_sum = np.flip(np.cumsum(np.flip(density)))

    # NOTE(review): points sit at each bin's upper edge and the densities are
    # summed without the bin width, so the y-axis is a scaled tail sum rather
    # than a normalized probability - confirm this is intended.
    plt.figure(figsize=(10, 6))
    plt.loglog(edges[1:], tail_sum, marker='o', linestyle='None')
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Cumulative Frequency')
    plt.grid(True)
    plt.show()

# Tail plot for the imputed target.
plot_power_law(target_imputed, title='Cumulative Distribution of Aircraft Damage')

# Tail plot for every feature column, one figure per column. The columns come
# from `features` (before imputation); plot_power_law drops NaN itself.
for column in features.columns:
    feature_data = features[column]
    plot_power_law(feature_data, title=f'Cumulative Distribution of {column}')

LONG-TAIL

In [None]:
# @title HIERARCHICAL
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Read the CSV file
file_name = '/kaggle/input/air-damage/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Cast object-dtype columns to str so that missing entries become the literal
# string 'nan' and each of these columns holds a single type for the encoder.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Columns that are replaced by integer codes (includes the target column)
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
                     'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
                     'Engine Make', 'Engine Model', 'Engines', 'Engine Type', 'Engine1 Position',
                     'Engine2 Position', 'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport',
                     'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation',
                     'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity',
                     'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike',
                     'Radome Damage', 'Windshield Strike', 'Windshield Damage', 'Nose Strike',
                     'Nose Damage', 'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
                     'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
                     'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike',
                     'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike',
                     'Landing Gear Damage', 'Tail Strike', 'Tail Damage', 'Lights Strike',
                     'Lights Damage', 'Other Strike', 'Other Damage']

# One LabelEncoder is refit for every column; once the loop finishes,
# `encoder` only holds the mapping of the last column.
encoder = LabelEncoder()
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Bin the encoded 'Aircraft Damage' codes into six ordinal levels
# (right-inclusive bins): 0 -> level 0, 1-2 -> level 1, 3-4 -> level 2,
# 5-6 -> level 3, 7-8 -> level 4, 9-10 -> level 5.
# NOTE(review): codes above 10 fall outside the bins and become NaN, which the
# astype(int) calls below cannot convert - confirm the column never has more
# than 11 distinct values.
data['Damage Level'] = pd.cut(data['Aircraft Damage'],
                              bins=[-1, 0, 2, 4, 6, 8, 10],
                              labels=[0, 1, 2, 3, 4, 5])

# Features/target over the full frame. They are not used by the model below;
# the names are kept because other notebook cells may read them.
features = data.drop(["Aircraft Damage", "Damage Level"], axis=1)
target = data["Damage Level"].astype(int)

# Work on a reproducible 10% sample to reduce memory usage
sampled_data = data.sample(frac=0.1, random_state=42)
features_sampled = sampled_data.drop(["Aircraft Damage", "Damage Level"], axis=1)
target_sampled = sampled_data["Damage Level"].astype(int)

# Split BEFORE imputing and scaling so that both preprocessing steps learn
# their statistics from the training rows only; fitting them on the whole
# sample would leak information from the test rows into training.
x_train, x_test, y_train, y_test = train_test_split(
    features_sampled, target_sampled, test_size=0.1, random_state=42
)

# Median imputation: fitted on the training rows, applied unchanged to the test rows
imputer = SimpleImputer(strategy='median')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

# Imputed copy of the whole sample, kept under its previous name for any
# later cell that still reads `features_imputed`.
features_imputed = imputer.transform(features_sampled)

# Standardize with the training mean/std
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Build the ANN model: three ReLU hidden layers and a 6-way softmax,
# one output unit per damage level.
model = keras.Sequential([
    layers.Input(shape=(x_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(6, activation='softmax')  # Predicting hierarchical classes
])

# Integer class labels -> sparse categorical cross-entropy
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model; 20% of the training rows are held out for validation
history = model.fit(x_train, y_train, validation_split=0.2, epochs=20, batch_size=64, verbose=1)

# Evaluate on the test rows: the predicted class is the most probable level
y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)
accuracy = accuracy_score(y_test, y_pred)

print(f"ANN Accuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Plot training history: one figure for accuracy, one for loss
for metric, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.figure(figsize=(12, 6))
    plt.plot(history.history[metric], label=f'Train {metric_label}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric_label}')
    plt.title(f'Model {metric_label}')
    plt.ylabel(metric_label)
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()

# Print hierarchical structure (list position == level number)
print("\nHierarchical Structure of Damage Levels:")
level_names = (
    "No Damage",
    "Minor Damage",
    "Moderate Damage",
    "Significant Damage",
    "Severe Damage",
    "Catastrophic Damage",
)
for level, level_name in enumerate(level_names):
    print(f"Level {level}: {level_name}")


HIERARCHICAL

In [None]:
# @title  BIFURCATIONS
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Read the CSV file
file_name = '/content/DAMAGE.csv'
data = pd.read_csv(file_name, encoding='latin1')

# Cast object-dtype columns to str so that missing entries become the literal
# string 'nan' and each of these columns holds a single type for the encoder.
for column in data.select_dtypes(include=['object']):
    data[column] = data[column].astype(str)

# Columns that are replaced by integer codes (includes the target column)
columns_to_encode = ['Record ID', 'Incident Year', 'Incident Month', 'Incident Day', 'Operator ID', 'Operator',
                     'Aircraft', 'Aircraft Type', 'Aircraft Make', 'Aircraft Model', 'Aircraft Mass',
                     'Engine Make', 'Engine Model', 'Engines', 'Engine Type', 'Engine1 Position',
                     'Engine2 Position', 'Engine3 Position', 'Engine4 Position', 'Airport ID', 'Airport',
                     'State', 'FAA Region', 'Warning Issued', 'Flight Phase', 'Visibility', 'Precipitation',
                     'Height', 'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity',
                     'Flight Impact', 'Fatalities', 'Injuries', 'Aircraft Damage', 'Radome Strike',
                     'Radome Damage', 'Windshield Strike', 'Windshield Damage', 'Nose Strike',
                     'Nose Damage', 'Engine1 Strike', 'Engine1 Damage', 'Engine2 Strike', 'Engine2 Damage',
                     'Engine3 Strike', 'Engine3 Damage', 'Engine4 Strike', 'Engine4 Damage',
                     'Engine Ingested', 'Propeller Strike', 'Propeller Damage', 'Wing or Rotor Strike',
                     'Wing or Rotor Damage', 'Fuselage Strike', 'Fuselage Damage', 'Landing Gear Strike',
                     'Landing Gear Damage', 'Tail Strike', 'Tail Damage', 'Lights Strike',
                     'Lights Damage', 'Other Strike', 'Other Damage']

# One LabelEncoder is refit for every column; once the loop finishes,
# `encoder` only holds the mapping of the last column.
encoder = LabelEncoder()
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# Bin the encoded 'Aircraft Damage' codes into six ordinal levels
# (right-inclusive bins): 0 -> level 0, 1-2 -> level 1, 3-4 -> level 2,
# 5-6 -> level 3, 7-8 -> level 4, 9-10 -> level 5.
# NOTE(review): codes above 10 fall outside the bins and become NaN, which the
# astype(int) calls below cannot convert - confirm the column never has more
# than 11 distinct values.
data['Damage Level'] = pd.cut(data['Aircraft Damage'],
                              bins=[-1, 0, 2, 4, 6, 8, 10],
                              labels=[0, 1, 2, 3, 4, 5])

# Features/target over the full frame. They are not used by the model below;
# the names are kept because other notebook cells may read them.
features = data.drop(["Aircraft Damage", "Damage Level"], axis=1)
target = data["Damage Level"].astype(int)

# Work on a reproducible 10% sample to reduce memory usage
sampled_data = data.sample(frac=0.1, random_state=42)
features_sampled = sampled_data.drop(["Aircraft Damage", "Damage Level"], axis=1)
target_sampled = sampled_data["Damage Level"].astype(int)

# Split BEFORE imputing and scaling so that both preprocessing steps learn
# their statistics from the training rows only; fitting them on the whole
# sample would leak information from the test rows into training.
x_train, x_test, y_train, y_test = train_test_split(
    features_sampled, target_sampled, test_size=0.1, random_state=42
)

# Median imputation: fitted on the training rows, applied unchanged to the test rows
imputer = SimpleImputer(strategy='median')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

# Imputed copy of the whole sample, kept under its previous name for any
# later cell that still reads `features_imputed`.
features_imputed = imputer.transform(features_sampled)

# Standardize with the training mean/std
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Build the ANN model: three ReLU hidden layers and a 6-way softmax,
# one output unit per damage level.
model = keras.Sequential([
    layers.Input(shape=(x_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(6, activation='softmax')  # Predicting hierarchical classes
])

# Integer class labels -> sparse categorical cross-entropy
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model; 20% of the training rows are held out for validation
history = model.fit(x_train, y_train, validation_split=0.2, epochs=20, batch_size=64, verbose=1)

# Evaluate on the test rows: the predicted class is the most probable level
y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)
accuracy = accuracy_score(y_test, y_pred)

print(f"ANN Accuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Identify "bifurcations": test rows where the model is torn between two
# classes, i.e. the two highest class probabilities are less than 0.2 apart.
# Sorting each row ascending puts the two largest values in the last two columns.
top_two_probs = np.sort(y_pred_probs, axis=1)[:, -2:]
top_two_margin = top_two_probs[:, 1] - top_two_probs[:, 0]
uncertain_rows = np.flatnonzero(top_two_margin < 0.2).tolist()  # Adjust threshold as needed
# y_test is a pandas Series with shuffled index labels, hence positional .iloc
bifurcations = [(i, y_test.iloc[i], y_pred[i]) for i in uncertain_rows]

print(f"\nIdentified Bifurcations (based on prediction uncertainty): {len(bifurcations)}")
for idx, true_val, pred_val in bifurcations:
    print(f"Index: {idx}, True: {true_val}, Predicted: {pred_val}, Probabilities: {y_pred_probs[idx]}")

# Plot training history: one figure for accuracy, one for loss
for metric, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.figure(figsize=(12, 6))
    plt.plot(history.history[metric], label=f'Train {metric_label}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric_label}')
    plt.title(f'Model {metric_label}')
    plt.ylabel(metric_label)
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()

# Print hierarchical structure (list position == level number)
print("\nHierarchical Structure of Damage Levels:")
level_names = (
    "No Damage",
    "Minor Damage",
    "Moderate Damage",
    "Significant Damage",
    "Severe Damage",
    "Catastrophic Damage",
)
for level, level_name in enumerate(level_names):
    print(f"Level {level}: {level_name}")


BIFURCATIONS