<a href="https://colab.research.google.com/github/Ash100/Minor/blob/main/2n1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **2n1: Unknown EDA and De-NACL(ation)**
Prepared for testing purposes by **Dr. Ashfaq Ahmad**.
Use it at your own risk, if you want to.

In [None]:
#@title Install and Import necessary libraries
!pip install seaborn

# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#@title Data Loading
input_file_path = ''  # Update this to your actual input file path
# Stop early with an actionable message when the placeholder path was never
# filled in, rather than letting the CSV reader fail on an empty path.
if not input_file_path:
    raise FileNotFoundError(
        "input_file_path is empty - set it to the path of your CSV file before running this cell."
    )
data = pd.read_csv(input_file_path)

In [None]:
#@title Display the first few rows of the dataset
# Print a heading followed by the preview returned by DataFrame.head().
print("First few rows of the dataset:", data.head(), sep="\n")

In [None]:
#@title Summary Statistics (excluding non-numeric columns like SMILES)
print("\nSummary Statistics:")
# Keep only the numeric columns; `numeric_data` is reused by later cells.
numeric_data = data.select_dtypes(include='number')
# describe() reports count, mean, std, min, quartiles and max per column.
print(numeric_data.describe())

In [None]:
#@title Check for missing values
# Print a heading followed by the number of missing entries in each column.
print("\nMissing Values Analysis:", data.isna().sum(), sep="\n")

In [None]:
#@title Correlation Matrix (excluding non-numeric columns like SMILES)
print("\nCorrelation Matrix:")
# Pairwise Pearson correlation (the pandas default) between the numeric
# columns; `corr_matrix` is reused by later cells.
corr_matrix = numeric_data.corr(method='pearson')
print(corr_matrix)

In [None]:
#@title Plot Correlation Matrix
# Annotated heatmap of the correlation matrix; each cell shows the
# coefficient rounded to two decimals.
plt.figure(figsize=(12, 8))
sns.heatmap(data=corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title(label='Correlation Matrix')
plt.show()

In [None]:
#@title Distribution Plots for all numerical columns
# Names of the numeric columns as a plain list; reused by the plotting cells below.
numerical_columns = list(data.select_dtypes(include='number').columns)
for column_name in numerical_columns:
    # One figure per column: histogram with a kernel density estimate overlaid.
    plt.figure(figsize=(10, 6))
    sns.histplot(data=data[column_name], kde=True)
    plt.title(f'Distribution of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
#@title Box Plots for all numerical columns
for column_name in numerical_columns:
    # One vertical box plot per numeric column, drawn in its own figure.
    column_values = data[column_name]
    plt.figure(figsize=(10, 6))
    sns.boxplot(y=column_values)
    plt.title(f'Box Plot of {column_name}')
    plt.ylabel(column_name)
    plt.show()

In [None]:
#@title Pair Plots for numerical columns to visualize relationships
# Scatter-plot matrix of every numeric column against every other one.
sns.pairplot(data[numerical_columns])
plt.show()

# Activity vs Descriptors (Scatter Plots)
# Only draw these when the dataset actually has an 'Activity' column; indexing
# data['Activity'] on a dataset without it would raise a KeyError.
if 'Activity' in data.columns:
    for col in numerical_columns:
        if col == 'Activity':  # Exclude Activity column from descriptors
            continue
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=data[col], y=data['Activity'])
        plt.title(f'Activity vs {col}')
        plt.xlabel(col)
        plt.ylabel('Activity')
        plt.show()
else:
    print("Activity column not found; skipping Activity vs descriptor scatter plots.")

### Some Statistics (Optional)

In [None]:
#@title Outlier Detection
print("\nOutlier Detection:")
for col in numeric_data.columns:
    plt.figure(figsize=(10, 6))
    # Pass the column explicitly as `y`. A bare positional Series is bound to
    # seaborn's first parameter, whose meaning differs between seaborn releases,
    # so the box orientation would not reliably match the y-axis label below.
    sns.boxplot(y=data[col])
    plt.title(f'Outliers in {col}')
    plt.ylabel(col)
    plt.show()

In [None]:
!pip install seaborn scipy scikit-learn

In [17]:
#@title Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
#@title Normality Tests
print("\nNormality Tests:")
# Significance level, defined once outside the loop so it exists even when
# there are no numeric columns.
alpha = 0.05
for col in numeric_data.columns:
    # Missing values would propagate to a NaN p-value, which the comparison
    # below would silently report as "normal"; test the observed values only.
    values = data[col].dropna()
    # scipy's normaltest is built on skewtest, which needs at least 8 observations.
    if len(values) < 8:
        print(f"{col}: fewer than 8 non-missing values, normality test skipped")
        continue
    k2, p = stats.normaltest(values)
    print(f'{col} normality test p-value = {p}')
    if np.isnan(p):
        # e.g. a constant column: the statistic is undefined, so draw no conclusion.
        print(f"{col}: normality test is inconclusive (p-value is NaN)")
    elif p < alpha:
        print(f"{col} does not follow a normal distribution")
    else:
        print(f"{col} follows a normal distribution")

In [None]:
#@title T-Test / ANOVA
print("\nT-Test / ANOVA:")
# Significance level, defined here so this cell does not depend on another
# cell having been run first.
alpha = 0.05
if 'Activity' in numeric_data.columns:
    # Split the rows into two groups around the median activity (computed once).
    activity_median = data['Activity'].median()
    active = data[data['Activity'] > activity_median]
    inactive = data[data['Activity'] <= activity_median]
    for col in numeric_data.columns:
        if col != 'Activity':
            # Independent two-sample t-test; nan_policy='omit' ignores missing
            # values instead of letting them turn the p-value into NaN.
            t_stat, p_val = stats.ttest_ind(active[col], inactive[col], nan_policy='omit')
            print(f'T-Test between active and inactive for {col}: p-value = {p_val}')
            if np.isnan(p_val):
                # Undefined statistic (e.g. an empty or constant group): no conclusion.
                print(f"T-Test is inconclusive for {col} (p-value is NaN)")
            elif p_val < alpha:
                print(f"Statistically significant difference for {col}")
            else:
                print(f"No statistically significant difference for {col}")
else:
    print("Activity column not found among the numeric columns; skipping T-Test.")

In [None]:
#@title Feature Importance using Correlation with Activity
print("\nFeature Importance:")
# The ranking is only defined when 'Activity' is one of the correlated (numeric)
# columns; indexing the matrix without it would raise a KeyError.
if 'Activity' in corr_matrix.columns:
    # Sorted by signed correlation, so strongly negative correlations appear last.
    corr_with_activity = corr_matrix['Activity'].sort_values(ascending=False)
    print(corr_with_activity)
else:
    print("Activity column not found in the correlation matrix; skipping feature importance.")

In [None]:
#@title PCA (Principal Component Analysis)
print("\nPCA (Principal Component Analysis):")
# Descriptor matrix: every numeric column except the target, if present.
pca_features = numeric_data.drop(columns=['Activity'], errors='ignore')
# scikit-learn's PCA rejects NaN input, so rows with any missing descriptor
# are dropped (and reported) before scaling.
complete_rows = pca_features.dropna()
dropped_rows = len(pca_features) - len(complete_rows)
if dropped_rows:
    print(f"Dropped {dropped_rows} row(s) with missing values before PCA")
# Standardize each descriptor so no single scale dominates the components.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(complete_rows)
pca = PCA(n_components=2)
pca_results = pca.fit_transform(scaled_data)
# Keep the original row labels so the components can be joined back to `data`.
pca_df = pd.DataFrame(
    data=pca_results,
    columns=['Principal Component 1', 'Principal Component 2'],
    index=complete_rows.index,
)
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', data=pca_df)
plt.title('PCA Results')
plt.show()

In [None]:
#@title Distribution of Activity by Categories (if applicable)
# The plot is optional: it is drawn only for datasets that carry a 'Category' column.
if 'Category' not in data.columns:
    print("Category column not found.")
else:
    print("Category column found. Generating plot.")
    # Show how many rows fall into each category.
    print(data['Category'].value_counts())
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=data, x='Category', y='Activity')
    plt.title('Distribution of Activity by Category')
    plt.show()

# **Perform De-Saltation**

In [None]:
!pip install rdkit-pypi

In [2]:
#@title Import necessary libraries
import pandas as pd
import networkx as nx
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from rdkit.Chem import Descriptors, rdmolops

In [3]:
#@title Load data from a CSV file (make sure to upload your file to Colab)
# The CSV file should have at least two columns: "SMILES" and "Activity"
file_path = ''  # Change this to the path of your CSV file
# Stop early with an actionable message when the placeholder path was never
# filled in, rather than letting the CSV reader fail on an empty path.
if not file_path:
    raise FileNotFoundError(
        "file_path is empty - set it to the path of your CSV file before running this cell."
    )
data = pd.read_csv(file_path)

In [None]:
#@title Desalt SMILES (keep the largest fragment) and save the results
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, rdMolDescriptors
from rdkit.Chem.rdchem import Mol
from typing import List

# Function to remove salts by keeping the largest fragment
def remove_salts(smiles: str) -> "str | None":
    """Return the SMILES of the largest fragment of *smiles*.

    The input is parsed with RDKit, split into its disconnected fragments,
    and the fragment with the most atoms (as counted by ``GetNumAtoms()``)
    is written back out as SMILES. When several fragments tie, the first
    one is kept.

    Args:
        smiles: SMILES string, possibly containing several '.'-separated
            components (e.g. a compound plus its counter-ion).

    Returns:
        The SMILES of the largest fragment, or ``None`` when *smiles* is not
        a string (e.g. NaN from an empty CSV cell) or cannot be parsed.
    """
    # Empty CSV cells are typically loaded as float NaN, and RDKit's parser
    # only accepts strings, so treat any non-string input as "no molecule".
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Split into one Mol per disconnected fragment.
    frags = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)

    # Keep the largest fragment; fall back to the parsed molecule itself when
    # there are no fragments at all.
    largest_frag = max(frags, default=mol, key=lambda m: m.GetNumAtoms())

    return Chem.MolToSmiles(largest_frag)

# Load your dataset
input_file_path = '/content/Unknown60K.csv'  # Update this to your actual input file path
data = pd.read_csv(input_file_path)

# Fail with a descriptive message when the expected column is absent.
if 'SMILES' not in data.columns:
    raise KeyError(
        f"'SMILES' column not found in {input_file_path}; available columns: {list(data.columns)}"
    )

# Apply desalting to each SMILES string
data['Desalted_SMILES'] = data['SMILES'].apply(remove_salts)

# remove_salts returns None for entries it cannot parse; report how many there
# are so they do not pass unnoticed as empty cells in the output file.
failed_count = int(data['Desalted_SMILES'].isna().sum())
if failed_count:
    print(f"Warning: {failed_count} of {len(data)} SMILES could not be desalted and were left empty")

# Save the results to a new CSV file
output_file_path = '/content/desalted_compounds.csv'  # Update this to your desired output path
data.to_csv(output_file_path, index=False)

print(f"Desalted compounds saved to {output_file_path}")