In [1]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification

In [2]:
X, y = make_classification(n_features = 2, n_samples = 500, n_classes = 2, n_clusters_per_class = 2, n_redundant = 0)

In [3]:
X.shape

(500, 2)

In [4]:
y.shape

(500,)

In [5]:
df = pd.concat([pd.DataFrame(data=X[0], columns=['x']), pd.DataFrame(data=y, columns=['y'])], axis = 1)

In [6]:
# Checking the NULL Values
nullx, nully = df['x'].isnull().sum(), df['y'].isnull().sum()
print('The null values in X and y are:', nullx, 'and',nully)

The null values in X and y are: 498 and 0


In [7]:
# Replacing the null values with the Mean Imputation Technique
df['x'] = df['x'].fillna(df['x'].mean())

In [8]:
# Checking the NULL Values after replacing:
nullx, nully = df['x'].isnull().sum(), df['y'].isnull().sum()
print('The null values in X and y are:', nullx, 'and',nully)

The null values in X and y are: 0 and 0


In [9]:
df.head()

Unnamed: 0,x,y
0,0.773646,1
1,-0.212239,1
2,0.280703,0
3,0.280703,0
4,0.280703,0


In [10]:
# Splitting the data to train and test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

print('The shape of X_train, X_test, y_train and y_test are:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)

The shape of X_train, X_test, y_train and y_test are: (400, 2) (100, 2) (400,) (100,)


In [11]:
# Directly using and training Model with default parameters

from sklearn.linear_model import LogisticRegression

regressor = LogisticRegression()

regressor.fit(X_train, y_train)

In [12]:
# Calculating the accuracy score with default parameters

y_pred = regressor.predict(X_test)

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.6376811594202898

In [13]:
# Training the model with best parameters according to our datset
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV

best_parameters = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': list(np.arange(1, 151)),
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

# Create a GridSearchCV object
best_reg = GridSearchCV(regressor, param_grid=best_parameters, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to your data
best_reg.fit(X_train, y_train)

# Get the best parameters
best_reg.best_params_

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

In [14]:
y_pred2 = best_reg.best_estimator_.predict(X_test)

# Calculating the accuracy with best parameters:

from sklearn.metrics import accuracy_score

# Calculate accuracy for predictions made by the model with default parameters
accuracy_default = accuracy_score(y_test, y_pred)

# Calculate accuracy for predictions made by the model with best parameters
accuracy_best = accuracy_score(y_test, y_pred2)

print('Accuracy with default parameters:', accuracy_default)
print('Accuracy with best parameters:', accuracy_best)

Accuracy with default parameters: 0.91
Accuracy with best parameters: 0.91


# Note:

#### Here we are getting the same accuracy (0.93) for both the model with default parameters and the model with the best parameters obtained from GridSearchCV, it suggests that the default parameters were already optimal for our dataset. In such a scenario, there might be limited room for improvement by tuning hyperparameters using GridSearchCV. It's also possible that the dataset generated by 'make_classification' is relatively straightforward for the logistic regression algorithm, and the default parameters are already providing a good solution.
