In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the generated feature table from disk and preview it:
# a banner line, the first 10 rows, then the (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='generated_features.csv')
print("Initial dataset", df.head(10), df.shape, sep="\n")

In [None]:
# Keep the columns this notebook treats as categorical aside (as a list of
# Series) so they remain available after being removed from the working frame.
categorical = [df[column_name] for column_name in ('SMILES', 'Source')]
categorical

In [None]:
# Remove the categorical columns from the working frame and preview the result.
df = df.drop(['SMILES', 'Source'], axis='columns')
print("Dataset after dropping categorical columns", df.head(10), sep="\n")

In [None]:
# Separate the features from the target; both are kept as DataFrames.
# X holds every column except 'Toxicity'; Y is a one-column frame with 'Toxicity'.
X = df.drop('Toxicity', axis='columns')
Y = df['Toxicity'].to_frame()

In [None]:
# Preview the first 10 rows of the feature frame.
X.head(10)

In [None]:
# Preview the first 10 rows of the target frame.
Y.head(10)

In [None]:
# Hold out 5% of the rows for testing and keep the remaining 95% for training.
# The fixed random_state makes the split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.05,
    random_state=42,
)
X_train.head(10)

In [None]:
# Report the (rows, columns) shape of the training and test feature frames.
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

1. First, remove the duplicate columns.

In [None]:
# Find feature columns whose values exactly repeat an earlier column.
# Detection uses the training split only; the first occurrence is kept and
# every later repeat is listed for removal from X_train and X_test.
is_repeated_column = X_train.T.duplicated(keep='first')
duplicate_columns = X_train.columns[is_repeated_column]
duplicate_columns

In [None]:
# Remove the same duplicate columns from both splits, then report the new shapes.
X_train = X_train.drop(duplicate_columns, axis='columns')
X_test = X_test.drop(duplicate_columns, axis='columns')
print("X_train shape after dropping duplicate columns: ", X_train.shape)
print("X_test shape after dropping duplicate columns: ", X_test.shape)

In [None]:
from sklearn.impute import SimpleImputer

# Treat +/-infinity as missing so the imputer fills those cells too.
# replace() returns new frames, so nothing produced by an earlier cell is
# mutated in place.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Learn the per-column means from the training split only, then apply the
# same fill values to the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# A column with no observed training values has a NaN statistic and is dropped
# from the imputer's output. Label the output with the surviving columns only,
# otherwise the number of column names would not match the array width.
kept_columns = X_train.columns[~np.isnan(imputer.statistics_)]

# Convert back to DataFrames, keeping the original row index so the feature
# rows stay aligned with the index of Y_train / Y_test.
X_train = pd.DataFrame(X_train_imputed, columns=kept_columns, index=X_train.index)
X_test = pd.DataFrame(X_test_imputed, columns=kept_columns, index=X_test.index)

print("Values which are nan in X_train: ", np.isnan(X_train).sum())
X_train.head(10)


Standardising the data

In [None]:
# Scale the features with StandardScaler (imported in the notebook's first cell).
# fit_transform both fits the scaler on the training split and transforms it,
# so no separate fit call is needed; the test split is transformed with the
# statistics learned from the training split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Inspect the per-feature means the scaler learned from the training split.
scaler.mean_

In [None]:
# Preview the unscaled training features (first 10 rows only, matching the
# other previews in this notebook) for comparison with the scaled values.
X_train.head(10)

In [None]:
# Preview the scaled training features (first 10 rows only). Row slicing works
# whether X_train_scaled is still the scaler's array output or already a DataFrame.
X_train_scaled[:10]

In [None]:
# Wrap the scaled arrays back into DataFrames. Reuse both the column names and
# the row index of the frames they were computed from, so the scaled rows keep
# the same labels as their unscaled counterparts.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)


In [None]:
# Summary statistics of the unscaled training features, rounded to 1 decimal.
X_train.describe().round(1)

In [None]:
# Per-column mean of the scaled training features, rounded to 1 decimal.
X_train_scaled.mean().round(1)

Using variance threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Feature selector that keeps only the columns whose variance exceeds 0.2.
sel = VarianceThreshold(threshold=0.2)

In [None]:
# Learn the per-column variances from the scaled training features.
# NOTE(review): X_train_scaled comes from StandardScaler, which gives every
# non-constant column unit variance, so the selector's 0.2 threshold presumably
# only removes constant columns here -- confirm whether it was meant to be
# fitted on the unscaled features instead.
sel.fit(X_train_scaled)

In [None]:
# Number of features the selector keeps (count of True entries in its mask).
sel.get_support().sum()

In [None]:
# Names of the columns the selector keeps, looked up by their integer positions.
columns = X_train.columns[sel.get_support(indices=True)]

In [None]:
# Apply the fitted selector to both scaled splits and wrap the results back
# into DataFrames. The kept column names are restored, and the row index of
# the scaled frames is reused so row labels are not silently reset.
X_train = pd.DataFrame(
    sel.transform(X_train_scaled),
    columns=columns,
    index=X_train_scaled.index,
)
X_test = pd.DataFrame(
    sel.transform(X_test_scaled),
    columns=columns,
    index=X_test_scaled.index,
)

In [None]:
# # show the variance of each column, rounded to 1 decimal
# np.round(X_train.var(), 1)

In [None]:
# Only the last expression of a cell is displayed, so print the shape first and
# leave the preview as the final expression; both results are then visible.
print("X_train shape: ", X_train.shape)
X_train.head(10)

In [None]:
#remove the columns with zero variance
# X_train = X_train.loc[:, X_train.var() > 0]
# X_test = X_test.loc[:, X_test.var() > 0]

# X_train.shape

In [None]:
# Absolute correlation above which a feature is treated as redundant.
threshold = 0.9


def correlation(dataset, threshold, verbose=False):
    """Return the names of columns that are highly correlated with an earlier column.

    A column is flagged when the absolute value of its correlation (as computed
    by ``dataset.corr()``) with at least one column positioned before it is
    strictly greater than ``threshold``. The first column of each correlated
    group is therefore never flagged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute correlation above which a column is flagged.
    verbose : bool, default False
        When True, print the correlation matrix and its shape.

    Returns
    -------
    set
        Names of the flagged columns (empty if none exceed the threshold).
    """
    corr_matrix = dataset.corr()
    if verbose:
        print(corr_matrix)
        print("The shape of the correlation matrix is: ", corr_matrix.shape)

    # Comparisons involving NaN (e.g. from a constant column) are False, so
    # such pairs are never flagged.
    exceeds = corr_matrix.abs().to_numpy() > threshold

    # Keep only the strict lower triangle: row i is compared with columns
    # j < i, i.e. each column against the columns that come before it.
    flagged_rows = np.tril(exceeds, k=-1).any(axis=1)
    return set(corr_matrix.columns[flagged_rows])

In [None]:
# Decide which columns are redundant from the training split only, then drop
# exactly the same columns from both splits. Computing the set separately on
# the test split could remove different columns and leave train and test with
# mismatched feature sets.
correlated_columns = list(correlation(X_train, threshold))
X_train = X_train.drop(columns=correlated_columns)
X_test = X_test.drop(columns=correlated_columns)
print("X_train shape after removing correlated columns: ", X_train.shape)
print("X_test shape after removing correlated columns: ", X_test.shape)