<a href="https://colab.research.google.com/github/AravindBiswas/MyStudy/blob/master/Supervised_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow



In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [3]:
# Load the dataset
def load_data(file_name):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_name: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. Failures are
        reported on stdout instead of being raised, so callers must check
        for ``None`` before using the result.
    """
    try:
        frame = pd.read_csv(file_name)
    except Exception as err:
        print(f"Failed to load data: {err}")
        return None
    return frame

In [4]:
# Label-encode the target variable (class names -> integer codes; this is not one-hot encoding)
def one_hot_encoding(data):
    """Integer-encode the ``label`` column with scikit-learn's ``LabelEncoder``.

    Despite the function name, no one-hot (dummy) columns are created: the
    ``label`` column is replaced by the integer codes that
    ``LabelEncoder.fit_transform`` produces for it.

    Args:
        data: DataFrame that contains a ``label`` column.

    Returns:
        A new DataFrame with the same columns as ``data`` where ``label``
        holds the encoded values. The input frame is left untouched, so
        calling this function repeatedly on the same raw frame always gives
        the same result.

    Raises:
        KeyError: If ``data`` has no ``label`` column.
    """
    le = LabelEncoder()
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return data.assign(label=le.fit_transform(data['label']))

In [5]:
# Split the dataset into train-test
def split_data(data, test_size=0.2, random_state=42, target='label'):
    """Split a DataFrame into train/test predictors and targets.

    Args:
        data: DataFrame holding the predictor columns plus the target column.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2, the
            value that was previously hard-coded.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible; defaults to 42.
        target: Name of the target column; defaults to ``'label'``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.

    Raises:
        KeyError: If ``target`` is not a column of ``data``.
    """
    X = data.drop(columns=[target])
    y = data[target]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [6]:
# Fit a logistic regression model
def fit_logistic_regression(X_train, X_test, y_train, y_test):
    """Train a logistic regression classifier and score it on the test split.

    Args:
        X_train: Predictors used to fit the model.
        X_test: Predictors used to generate predictions.
        y_train: Targets used to fit the model.
        y_test: True targets the predictions are compared against.

    Returns:
        The value of ``accuracy_score`` for the test-set predictions.
    """
    # max_iter is raised to 1000 iterations for the solver.
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)


In [14]:
# Compute the correlation matrix
def compute_correlation_matrix(data):
    """Return the pairwise correlation matrix of the predictor columns.

    Args:
        data: DataFrame containing the predictors and a ``label`` column.

    Returns:
        The result of ``DataFrame.corr()`` computed on every column except
        ``label``.

    Raises:
        KeyError: If ``data`` has no ``label`` column.
    """
    predictors = data.drop(columns=['label'])
    return predictors.corr()

# Plot the correlation matrix
def plot_correlation_matrix(corr_matrix):
    """Draw an annotated heatmap of a correlation matrix and display it.

    Args:
        corr_matrix: Matrix of correlations to visualise, passed to
            ``sns.heatmap``.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    heatmap_options = {
        'annot': True,        # write each value inside its cell
        'cmap': 'coolwarm',
        'square': True,       # keep the cells square
    }
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.show()

In [8]:
# Identify highly correlated predictors
def identify_highly_correlated_predictors(corr_matrix, threshold=0.7):
    """List the column pairs whose absolute correlation exceeds a threshold.

    Only the strict lower triangle of ``corr_matrix`` is scanned, so each
    unordered pair is reported at most once and the diagonal is ignored.

    Args:
        corr_matrix: Square DataFrame of pairwise correlations.
        threshold: Cut-off compared against the absolute correlation with a
            strict ``>``; defaults to 0.7.

    Returns:
        A list of ``(later_column, earlier_column)`` tuples, ordered by the
        position of the later column and then of the earlier column.
    """
    names = corr_matrix.columns
    return [
        (names[row], names[col])
        for row in range(len(names))
        for col in range(row)  # columns strictly before ``row``
        if abs(corr_matrix.iloc[row, col]) > threshold
    ]

In [9]:
# Remove highly correlated predictors
def remove_highly_correlated_predictors(data, highly_correlated_predictors):
    """Drop the second member of every highly correlated pair from ``data``.

    Args:
        data: DataFrame to remove columns from.
        highly_correlated_predictors: Iterable of pairs; the element at
            index 1 of each pair names a column to drop.

    Returns:
        A new DataFrame without the selected columns; ``data`` itself is not
        modified.

    Raises:
        KeyError: If a column selected for removal is not present in ``data``.
    """
    # A set removes duplicates when one column appears in several pairs.
    redundant = {pair[1] for pair in highly_correlated_predictors}
    return data.drop(columns=list(redundant))

In [13]:
# Main function
def main():
    """Compare logistic-regression accuracy before and after pruning predictors.

    Steps:
        1. Load ``voice.csv`` and encode its ``label`` column.
        2. Fit a logistic regression on all predictors and print its accuracy.
        3. Compute and plot the predictor correlation matrix, then print the
           pairs flagged as highly correlated.
        4. Drop one predictor from each flagged pair, refit, and print the
           new accuracy.

    Returns:
        None. Results are printed; if the dataset cannot be loaded the
        function stops right after ``load_data`` has reported the failure.
    """
    data = load_data('voice.csv')
    if data is None:
        # load_data prints the error and returns None on failure; stop here
        # instead of failing later with an unrelated TypeError.
        return
    data = one_hot_encoding(data)

    # Baseline: every predictor is used.
    X_train, X_test, y_train, y_test = split_data(data)
    accuracy1 = fit_logistic_regression(X_train, X_test, y_train, y_test)
    print(f"Accuracy before removing highly correlated predictors: {accuracy1}")

    # Inspect the correlations between predictors.
    corr_matrix = compute_correlation_matrix(data)
    plot_correlation_matrix(corr_matrix)
    highly_correlated_predictors = identify_highly_correlated_predictors(corr_matrix)
    print(f"Highly correlated predictors: {highly_correlated_predictors}")

    # Refit after dropping one predictor from each flagged pair.
    data = remove_highly_correlated_predictors(data, highly_correlated_predictors)
    X_train, X_test, y_train, y_test = split_data(data)
    accuracy2 = fit_logistic_regression(X_train, X_test, y_train, y_test)
    print(f"Accuracy after removing highly correlated predictors: {accuracy2}")