In [None]:
# @markdown ## 5. Molecular Docking Analysis Tool 📊
# @markdown ### Instructions:
# @markdown a. Click the **Run** ▶️ button to start

# @markdown b. Upload your docking results (ZIP file) 📤 when prompted

# @markdown c. Wait for the analysis to complete ⏱️

# @markdown d. Results will be automatically downloaded 📥

import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from google.colab import files
from datetime import datetime
from scipy import stats
from sklearn.cluster import KMeans
from matplotlib.gridspec import GridSpec
import shutil
import matplotlib.font_manager as fm

# Install the DejaVu and Noto CJK font packages on the notebook VM
# (lines starting with "!" are IPython shell escapes, not Python).
!apt-get update -qq
!apt-get install -y fonts-dejavu fonts-noto-cjk
# Delete matplotlib's on-disk cache directory, presumably so the font list is
# rebuilt and the freshly installed fonts are discovered.
# NOTE(review): matplotlib.font_manager is already imported above, before the
# fonts are installed - confirm whether a runtime restart is needed for the
# new fonts to be picked up.
!rm ~/.cache/matplotlib -rf

# Sans-serif fallback chain: DejaVu Sans first, then Noto Sans CJK JP, then Arial.
plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Noto Sans CJK JP', 'Arial']
plt.rcParams['font.family'] = 'sans-serif'
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False

# Named hex colours shared by the plotting methods below.
NATURE_COLORS = dict(
    main_blue='#2B6A99',     # Primary blue
    accent_red='#CC3311',    # Accent red
    accent_green='#009988',  # Accent green
    neutral_grey='#666666',  # Neutral grey
    light_blue='#5A9FD4',    # Light blue
    pale_blue='#A9D1EA',     # Pale blue
    dark_red='#992211',      # Dark red
    light_grey='#CCCCCC',    # Light grey
)

class DockingAnalyzer:

    def __init__(self, output_dir='analysis_results'):
        """Initialize analyzer with Nature publication settings"""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.results = []
        self.scores = []
        self.setup_nature_style()

    def setup_nature_style(self):
        """Reset matplotlib to its default style and apply compact figure settings.

        The values are written to the global ``plt.rcParams``, so they affect
        every figure created afterwards in this process, not only this
        analyzer's figures.
        """
        plt.style.use('default')

        plt.rcParams.update({
            # Fonts
            'font.sans-serif': ['DejaVu Sans', 'Noto Sans CJK JP', 'Arial'],
            'font.family': 'sans-serif',
            'font.size': 8,
            'figure.dpi': 300,

            'figure.figsize': [3.5, 2.625],  # Nature single column width
            # Layout: only 'figure.autolayout' (tight layout) is enabled.
            # matplotlib documents 'figure.constrained_layout.use' as not
            # compatible with autolayout, so that key is intentionally not
            # set here; the plotting methods also call plt.tight_layout().
            'figure.autolayout': True,

            # Axes
            'axes.titlesize': 9,
            'axes.labelsize': 8,
            'axes.linewidth': 0.5,
            'axes.grid': False,
            'axes.unicode_minus': False,

            # Ticks
            'xtick.labelsize': 7,
            'ytick.labelsize': 7,
            'xtick.major.width': 0.5,
            'ytick.major.width': 0.5,
            'xtick.minor.width': 0.3,
            'ytick.minor.width': 0.3,

            # Legend
            'legend.fontsize': 7,
            'legend.frameon': False,
            'legend.markerscale': 0.8,

            # Lines and markers
            'lines.linewidth': 1.0,
            'lines.markersize': 4,

            # Saved files
            'savefig.bbox': 'tight',
            'savefig.pad_inches': 0.02,
            'savefig.dpi': 300,
        })

    def upload_results(self):
        """Ask the user to upload a ZIP archive and hand it to ``process_zip_file``.

        Returns:
            bool: ``False`` when nothing was uploaded or no uploaded name ends
            in ``.zip``; otherwise whatever ``process_zip_file`` returns.
        """
        print("\n📂 Please upload your docking results (ZIP file)...")
        uploaded = files.upload()

        if not uploaded:
            print("❌ No file uploaded")
            return False

        # Match the extension case-insensitively ("RESULTS.ZIP" is fine) and
        # consider every uploaded name instead of only the first one.
        zip_file = next(
            (name for name in uploaded if name.lower().endswith('.zip')),
            None,
        )
        if zip_file is None:
            print("❌ Please upload a ZIP file")
            return False

        return self.process_zip_file(zip_file)

    def process_zip_file(self, zip_file):
        print(f"\n📦 Processing: {zip_file}")
        temp_dir = Path('temp_analysis')

        try:
            if temp_dir.exists():
                shutil.rmtree(temp_dir)
            temp_dir.mkdir()

            with zipfile.ZipFile(zip_file, 'r') as zipf:
                zipf.extractall(temp_dir)
            print("✓ Files extracted")

            results = []
            for pdbqt_file in temp_dir.glob('**/*_docked.pdbqt'):
                result = self.parse_docking_file(pdbqt_file)
                if result:
                    results.extend(result)

            if not results:
                print("❌ No valid docking results found")
                return False

            self.results = results
            self.scores = [r['score'] for r in results]
            print(f"✓ Successfully parsed {len(self.results)} docking poses")
            return True

        except Exception as e:
            print(f"❌ Error processing file: {str(e)}")
            return False
        finally:
            if temp_dir.exists():
                shutil.rmtree(temp_dir)

    def create_publication_figures(self):
        try:
            print("\n📊 Generating publication-quality figures...")

            self.create_energy_distribution_plot()

            self.create_energy_landscape_plot()

            self.create_sar_plot()

            self.create_clustering_analysis()

            self.create_statistical_summary()

            print("✓ All figures generated successfully")

        except Exception as e:
            print(f"⚠️ Error creating figures: {str(e)}")

    def create_energy_distribution_plot(self):
        """Save a three-panel overview of the binding-energy distribution.

        Panel a is a histogram with a KDE curve plus dashed mean and median
        lines, panel b a normal Q-Q plot, and panel c a box plot overlaid with
        the individual scores. The figure is written to
        ``energy_distribution_analysis.png`` inside ``self.output_dir``.
        """
        scores = self.scores
        mean_score = np.mean(scores)
        median_score = np.median(scores)

        fig = plt.figure(figsize=(7.2, 5))  # Nature double column width
        try:
            gs = GridSpec(2, 2, figure=fig, hspace=0.3, wspace=0.3)

            # a: histogram + KDE spanning the whole top row
            ax1 = fig.add_subplot(gs[0, :])
            sns.histplot(data=scores, bins=30, color=NATURE_COLORS['main_blue'],
                         alpha=0.6, ax=ax1, kde=True,
                         line_kws={'color': NATURE_COLORS['neutral_grey']})
            ax1.axvline(mean_score, color=NATURE_COLORS['accent_red'],
                        linestyle='--', label=f'Mean: {mean_score:.2f} kcal/mol')
            ax1.axvline(median_score, color=NATURE_COLORS['accent_green'],
                        linestyle='--', label=f'Median: {median_score:.2f} kcal/mol')
            ax1.set_xlabel('Binding Energy (kcal/mol)')
            ax1.set_ylabel('Frequency')
            ax1.set_title('a', loc='left', weight='bold')
            ax1.legend(frameon=True, framealpha=0.8, edgecolor='none')

            # b: Q-Q plot against a normal distribution
            ax2 = fig.add_subplot(gs[1, 0])
            stats.probplot(scores, dist="norm", plot=ax2)
            # probplot draws the sample points first and the fitted line second.
            points, fit_line = ax2.get_lines()[0], ax2.get_lines()[1]
            points.set_markerfacecolor(NATURE_COLORS['main_blue'])
            points.set_markeredgecolor('none')
            fit_line.set_color(NATURE_COLORS['accent_red'])
            ax2.set_title('b', loc='left', weight='bold')

            # c: box plot (outlier markers hidden) with every score on top
            ax3 = fig.add_subplot(gs[1, 1])
            sns.boxplot(y=scores, color=NATURE_COLORS['pale_blue'],
                        width=0.5, ax=ax3, fliersize=0)
            sns.stripplot(y=scores, color=NATURE_COLORS['main_blue'],
                          alpha=0.4, size=3, ax=ax3)
            ax3.set_ylabel('Binding Energy (kcal/mol)')
            ax3.set_title('c', loc='left', weight='bold')

            fig.savefig(self.output_dir / 'energy_distribution_analysis.png')
        finally:
            # Close even when plotting or saving raised, so a failed run does
            # not leave the figure registered with pyplot.
            plt.close(fig)

    def create_energy_landscape_plot(self):
        """Save a 2-D histogram of ``rmsd_lb`` versus binding energy.

        The (up to) five poses with the lowest score are highlighted with red
        stars. The figure is written to ``energy_landscape.png`` inside
        ``self.output_dir``.
        """
        fig = plt.figure(figsize=(3.5, 3.5))  # Nature single column width
        try:
            scores_array = np.array(self.scores)
            rmsd_array = np.array([r['rmsd_lb'] for r in self.results])

            plt.hist2d(rmsd_array, scores_array, bins=30,
                       cmap='viridis', alpha=0.8)
            cbar = plt.colorbar(label='Frequency')
            cbar.ax.tick_params(size=0)  # hide the colour bar tick marks

            # argsort is ascending, so the first entries are the lowest energies.
            best_indices = np.argsort(scores_array)[:5]
            plt.scatter(rmsd_array[best_indices], scores_array[best_indices],
                        color=NATURE_COLORS['accent_red'], marker='*',
                        s=50, label='Top 5 poses', zorder=5)

            # Same wording as create_sar_plot, which plots the same rmsd_lb
            # column (Vina reports it relative to the best mode).
            plt.xlabel('RMSD from Best Mode (Å)')
            plt.ylabel('Binding Energy (kcal/mol)')
            plt.title('Energy Landscape', loc='left', weight='bold')
            plt.legend(frameon=True, framealpha=0.8, edgecolor='none')

            fig.savefig(self.output_dir / 'energy_landscape.png')
        finally:
            # Close even when plotting or saving raised, so a failed run does
            # not leave the figure registered with pyplot.
            plt.close(fig)

    def create_sar_plot(self):
        """Save a two-panel figure relating RMSD, model number and energy.

        Panel a scatters ``rmsd_lb`` against ``score`` coloured by ``model``;
        panel b shows the score distribution of each model as violins. The
        figure is written to ``structure_activity.png`` in ``self.output_dir``.
        """
        poses = pd.DataFrame(self.results)

        fig, axes = plt.subplots(1, 2, figsize=(7.2, 3))
        scatter_ax, violin_ax = axes

        # a: energy vs. RMSD, one dot per pose
        points = scatter_ax.scatter(
            poses['rmsd_lb'],
            poses['score'],
            c=poses['model'],
            cmap='viridis',
            alpha=0.6,
            s=20,
        )
        scatter_ax.set_xlabel('RMSD from Best Mode (Å)')
        scatter_ax.set_ylabel('Binding Energy (kcal/mol)')
        scatter_ax.set_title('a', loc='left', weight='bold')
        fig.colorbar(points, ax=scatter_ax, label='Model')

        # b: per-model energy distribution
        sns.violinplot(
            data=poses,
            x='model',
            y='score',
            ax=violin_ax,
            color=NATURE_COLORS['pale_blue'],
            inner='box',
        )
        violin_ax.set_xlabel('Model')
        violin_ax.set_ylabel('Binding Energy (kcal/mol)')
        violin_ax.set_title('b', loc='left', weight='bold')

        fig.tight_layout()
        fig.savefig(self.output_dir / 'structure_activity.png')
        plt.close(fig)

    def create_clustering_analysis(self):
        """Cluster poses with k-means on (rmsd_lb, score) and save a scatter plot.

        Does nothing when fewer than three scores are available. Otherwise the
        figure is written to ``clustering_analysis.png`` in ``self.output_dir``.
        """
        n_poses = len(self.scores)
        if n_poses < 3:
            return

        rmsd_values = [pose['rmsd_lb'] for pose in self.results]
        features = np.column_stack([rmsd_values, self.scores])

        n_clusters = min(3, n_poses)
        model = KMeans(n_clusters=n_clusters, random_state=42)
        labels = model.fit_predict(features)

        fig = plt.figure(figsize=(3.5, 3.5))
        palette = [
            NATURE_COLORS['main_blue'],
            NATURE_COLORS['accent_green'],
            NATURE_COLORS['accent_red'],
        ]

        # One scatter call per cluster so each gets its own legend entry.
        for idx, colour in zip(range(n_clusters), palette):
            members = features[labels == idx]
            plt.scatter(members[:, 0], members[:, 1], c=[colour], alpha=0.6,
                        s=30, label=f'Cluster {idx + 1}')

        centres = model.cluster_centers_
        plt.scatter(centres[:, 0], centres[:, 1],
                    c='black', marker='x', s=100, linewidths=2, label='Centroids')

        plt.xlabel('RMSD (Å)')
        plt.ylabel('Binding Energy (kcal/mol)')
        plt.title('Pose Clustering Analysis', loc='left', weight='bold')
        plt.legend(frameon=True, framealpha=0.8, edgecolor='none')

        fig.savefig(self.output_dir / 'clustering_analysis.png')
        plt.close(fig)

    def create_statistical_summary(self):
        """Save a ranked-score curve (a) and an empirical cumulative curve (b).

        The figure is written to ``statistical_summary.png`` in
        ``self.output_dir``.
        """
        # Both panels share the same line/marker appearance.
        line_style = dict(
            color=NATURE_COLORS['main_blue'],
            marker='o',
            markersize=3,
            alpha=0.6,
            linewidth=1,
        )

        fig, (rank_ax, cdf_ax) = plt.subplots(1, 2, figsize=(7.2, 3))

        # a: scores in ascending order against their rank
        rank_ax.plot(range(len(self.scores)), sorted(self.scores), **line_style)
        rank_ax.set_xlabel('Pose Rank')
        rank_ax.set_ylabel('Binding Energy (kcal/mol)')

        # b: fraction of poses at or below each score
        ascending = np.sort(self.scores)
        fractions = np.arange(1, len(ascending) + 1) / len(ascending)
        cdf_ax.plot(ascending, fractions, **line_style)
        cdf_ax.set_xlabel('Binding Energy (kcal/mol)')
        cdf_ax.set_ylabel('Cumulative Probability')

        for letter, axis in zip('ab', (rank_ax, cdf_ax)):
            axis.set_title(letter, loc='left', weight='bold')
            axis.grid(True, alpha=0.3, linestyle='--')

        fig.tight_layout()
        fig.savefig(self.output_dir / 'statistical_summary.png')
        plt.close(fig)

    def analyze_results(self):
        if not self.results:
            print("❌ No data to analyze")
            return False

        try:
            print("\n📊 Performing Analysis:")
            print("-" * 40)

            print("\nStatistical Summary:")
            print(f"Total poses: {len(self.results)}")
            print(f"Best score: {min(self.scores):.2f} kcal/mol")
            print(f"Mean score: {np.mean(self.scores):.2f} kcal/mol")
            print(f"Std. deviation: {np.std(self.scores):.2f} kcal/mol")

            self.create_publication_figures()

            self.save_analysis_report()

            self.package_analysis_results()

            return True

        except Exception as e:
            print(f"❌ Analysis error: {str(e)}")
            return False

    def save_analysis_report(self):
        try:
            df = pd.DataFrame(self.results)
            df.to_csv(self.output_dir / 'docking_analysis.csv', index=False)

            with open(self.output_dir / 'analysis_report.txt', 'w', encoding='utf-8') as f:
                f.write("Molecular Docking Analysis Report\n")
                f.write("================================\n\n")
                f.write(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

                f.write("Statistical Summary\n")
                f.write("-----------------\n")
                f.write(f"Total poses analyzed: {len(self.results)}\n")
                f.write(f"Best binding energy: {min(self.scores):.2f} kcal/mol\n")
                f.write(f"Mean binding energy: {np.mean(self.scores):.2f} kcal/mol\n")
                f.write(f"Median binding energy: {np.median(self.scores):.2f} kcal/mol\n")
                f.write(f"Standard deviation: {np.std(self.scores):.2f} kcal/mol\n")

                ci = stats.t.interval(0.95, len(self.scores)-1,
                                    loc=np.mean(self.scores),
                                    scale=stats.sem(self.scores))
                f.write(f"95% Confidence Interval: [{ci[0]:.2f}, {ci[1]:.2f}] kcal/mol\n\n")

                f.write("Distribution Analysis\n")
                f.write("--------------------\n")
                f.write(f"Skewness: {stats.skew(self.scores):.3f}\n")
                f.write(f"Kurtosis: {stats.kurtosis(self.scores):.3f}\n")
                shapiro_test = stats.shapiro(self.scores)
                f.write(f"Shapiro-Wilk normality test: W={shapiro_test[0]:.3f}, p={shapiro_test[1]:.3f}\n\n")

                f.write("Top 5 Binding Poses\n")
                f.write("-----------------\n")
                top_poses = sorted(self.results, key=lambda x: x['score'])[:5]
                for i, pose in enumerate(top_poses, 1):
                    f.write(f"\nRank {i}:\n")
                    f.write(f"  File: {pose['file']}\n")
                    f.write(f"  Model: {pose['model']}\n")
                    f.write(f"  Binding Energy: {pose['score']:.2f} kcal/mol\n")
                    f.write(f"  RMSD: {pose['rmsd_lb']:.2f}-{pose['rmsd_ub']:.2f} Å\n")

                print("✓ Saved detailed analysis report")

        except Exception as e:
            print(f"⚠️ Error saving report: {str(e)}")

    def package_analysis_results(self):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_name = f'docking_analysis_{timestamp}.zip'

        try:
            with zipfile.ZipFile(zip_name, 'w') as zipf:
                for file in self.output_dir.glob('*'):
                    if file.is_file():
                        zipf.write(file, arcname=file.name)
                        print(f"✓ Added {file.name}")

            print(f"\n📦 Analysis results packaged: {zip_name}")
            files.download(zip_name)

        except Exception as e:
            print(f"⚠️ Error packaging results: {str(e)}")

    def parse_docking_file(self, file_path):
        results = []
        try:
            current_model = None
            with open(file_path, 'r') as f:
                for line in f:
                    if line.startswith('MODEL'):
                        try:
                            current_model = int(line.split()[1])
                        except ValueError:
                            current_model = None
                    elif line.startswith('REMARK VINA RESULT:'):
                        try:
                            parts = line.strip().split()
                            score = float(parts[3])
                            rmsd_lb = float(parts[4])
                            rmsd_ub = float(parts[5])
                            results.append({
                                'file': file_path.name,
                                'model': current_model,
                                'score': score,
                                'rmsd_lb': rmsd_lb,
                                'rmsd_ub': rmsd_ub
                            })
                        except (IndexError, ValueError) as e:
                            print(f"⚠️ Error parsing line in {file_path.name}: {str(e)}")
                            continue
            return results

        except Exception as e:
            print(f"⚠️ Error reading file {file_path.name}: {str(e)}")
            return None

def main():
    """Run the interactive workflow: banner, upload, then analysis."""
    print("\n🔬 Publication-Quality Docking Analysis Tool")
    print("========================================")

    analyzer = DockingAnalyzer()

    # Stop early when the upload or parsing step did not succeed.
    if not analyzer.upload_results():
        print("\n❌ Analysis terminated")
        return

    analyzer.analyze_results()


if __name__ == "__main__":
    main()