<a href="https://colab.research.google.com/github/Akash1542/707-lecture-master/blob/main/Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing Libraries.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Preprocessing helper: fill missing values, then standardize the features.
def preprocess(X):
    """Fill missing values with 0 and standardize every feature column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is left unmodified; the fill is applied to a
        new object rather than in place.

    Returns
    -------
    The result of ``StandardScaler().fit_transform`` applied to the
    filled features.
    """
    # Non-in-place fill so the caller's DataFrame is never mutated.
    X_filled = X.fillna(0)

    # NOTE(review): the scaler is fit on every row passed in. If the output
    # is later split into train/validation folds, the validation rows have
    # already contributed to the scaling statistics.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_filled)

    return X_scaled

In [3]:
# Cross-validated scoring helper (mean F1 over 5 folds by default).
def evaluate_model(model, X, y, cv=5, scoring='f1'):
    """Return the mean cross-validated score of ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : estimator
        Estimator forwarded to ``cross_val_score``.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    cv : int or cross-validation splitter, default=5
        Forwarded unchanged to ``cross_val_score``.
    scoring : str or callable, default='f1'
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    Mean of the per-fold scores returned by ``cross_val_score``.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return fold_scores.mean()

In [4]:
# Seed NumPy's global RNG; both generator calls below also carry their own random_state.
np.random.seed(42)

# Dataset A: equal class weights, class_sep=0.1.
X_A, y_A = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    weights=[0.5, 0.5],
    class_sep=0.1,
    random_state=42,
)

# Dataset B: class weights 0.1 / 0.9, class_sep=0.5.
X_B, y_B = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    weights=[0.1, 0.9],
    class_sep=0.5,
    random_state=42,
)

In [5]:
# Wrap each feature matrix in a DataFrame with named feature columns and
# append the labels as a trailing 'target' column.
data_A = pd.DataFrame(
    X_A,
    columns=[f'feature_{idx}' for idx in range(X_A.shape[1])],
).assign(target=y_A)

data_B = pd.DataFrame(
    X_B,
    columns=[f'feature_{idx}' for idx in range(X_B.shape[1])],
).assign(target=y_B)

In [6]:
# Split each frame into labels and features, then preprocess the features.
# Note: y_A / y_B are rebound here to the 'target' columns of the frames.
y_A = data_A['target']
X_A_scaled = preprocess(data_A.drop(columns='target'))

y_B = data_B['target']
X_B_scaled = preprocess(data_B.drop(columns='target'))

In [7]:
# Initializing logistic regression and decision tree classifiers.
lr_model = LogisticRegression()
# Pin the tree's random_state so its scores do not depend on the global NumPy
# RNG state, and therefore not on which cells were run (or re-run) before it.
dt_model = DecisionTreeClassifier(random_state=42)

In [8]:
# Score both classifiers on dataset A (logistic regression first, then the tree).
f1_lr_A, f1_dt_A = [
    evaluate_model(clf, X_A_scaled, y_A) for clf in (lr_model, dt_model)
]

# Score both classifiers on dataset B, in the same order.
f1_lr_B, f1_dt_B = [
    evaluate_model(clf, X_B_scaled, y_B) for clf in (lr_model, dt_model)
]

In [9]:
# Report the mean F1 of each model per dataset.
print("Dataset A \n Logistic Regression:", f1_lr_A)
print("Decision Tree Classifier:", f1_dt_A)
print("Dataset B \n Logistic Regression:", f1_lr_B)
print("Decision Tree Classifier:", f1_dt_B)

# Report the absolute F1 gap between the two models on each dataset.
print(
    "Dataset A Performance Difference between both the models:",
    abs(f1_lr_A - f1_dt_A),
)
print(
    "Dataset B Performance Difference between both the models:",
    abs(f1_lr_B - f1_dt_B),
)

Dataset A 
 Logistic Regression: 0.5311199388846448
Decision Tree Classifier: 0.6554729353772546
Dataset B 
 Logistic Regression: 0.9395444877589844
Decision Tree Classifier: 0.9103548718922598
Dataset A Performance Difference between both the models: 0.12435299649260978
Dataset B Performance Difference between both the models: 0.029189615866724528


In [10]:
# Persist both frames (features plus 'target') as CSV files in the working
# directory, without writing the row index.
data_A.to_csv(path_or_buf='data_A.csv', index=False)
data_B.to_csv(path_or_buf='data_B.csv', index=False)