<a href="https://colab.research.google.com/github/Dur-e-yashfeen/-Thermophysical-Property-Melting-Point/blob/main/melting_point_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Logs this session in to Kaggle through kagglehub before any data is requested.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the 'melting-point' competition data and keep the value kagglehub
# returns (presumably the local directory holding the files — confirm against
# the kagglehub documentation).
melting_point_path = kagglehub.competition_download('melting-point')

print('Data source import complete.')


<!-- Main Title - Gold Melting Background -->
# <p style="background: linear-gradient(135deg, #FFD700, #FFA500); font-family:Pacifico,cursive;font-size:150%; color:#8B4513; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #8B4513; box-shadow: 0px 10px 25px rgba(255, 215, 0, 0.3);text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.5);">Melting Point Prediction</p>

<!-- Section 1 - Setup Background -->
## <p style="background: linear-gradient(135deg, #E6F3FF, #B0E0E6); font-family:Pacifico,cursive;font-size:150%; color:#2F4F4F; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #4682B4; box-shadow: 0px 10px 25px rgba(176, 224, 230, 0.3);text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.5);">1. Environment Setup & Imports</p>

In [None]:
# Silence the warning categories (and two specific numpy binary-compatibility
# messages) that would otherwise clutter the notebook output.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', message='numpy.dtype size changed')
warnings.filterwarnings('ignore', message='numpy.ufunc size changed')

# Import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
import os

# Set environment variables to suppress warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KMP_WARNINGS'] = '0'

# Import Plotly for interactive visualizations
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

# Read the version from the top-level plotly package (reported further down)
import plotly
plotly_version = plotly.__version__

# Configure Plotly theme
pio.templates.default = "plotly_white"

# Import sklearn
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Import models
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Configure libraries to suppress warnings:
# ignore numpy floating-point errors and disable pandas' chained-assignment warning
np.seterr(all='ignore')
pd.options.mode.chained_assignment = None

# Ask LightGBM to be quiet through an environment variable.
# NOTE(review): LightGBM's documented verbosity control is the `verbose` /
# `verbosity` parameter; confirm this variable is actually read by the library.
os.environ['LIGHTGBM_VERBOSE'] = '0'

# No-op logger that can be handed to LightGBM to discard its log output
class SilentLogger:
    """Logger stand-in whose logging methods accept anything and do nothing."""

    def _discard(self, *args, **kwargs):
        """Ignore the message and return None."""
        return None

    # Every log level shares the same no-op implementation.
    info = _discard
    warning = _discard
    debug = _discard
    error = _discard

# Best-effort: route LightGBM's logging into the no-op logger (optional)
try:
    silent_logger = SilentLogger()
    lgb.register_logger(silent_logger)
except Exception:
    # Registration is optional, so report the fallback instead of failing.
    print("Note: Using environment variable for LightGBM quiet mode")
else:
    print("‚úì Silent logger registered for LightGBM")

# Report the versions of the main libraries so runs can be reproduced.
print("‚úÖ Environment setup complete with all warnings suppressed!")
print(f"üì¶ NumPy version: {np.__version__}")
print(f"üêº Pandas version: {pd.__version__}")
print(f"üìä Plotly version: {plotly_version}")  # value read from plotly.__version__ above
print(f"üåü LightGBM version: {lgb.__version__}")
print(f"üöÄ XGBoost version: {xgb.__version__}")
print(f"üò∫ CatBoost version: {cb.__version__}")

# Quick test to ensure models can be created without errors.
# Each estimator is only constructed (never fitted); a failure is printed
# rather than raised so the remaining checks still run.
print("\nüß™ Testing model imports...")

# Test LightGBM
try:
    test_lgb = lgb.LGBMRegressor(n_estimators=2, verbose=-1, random_state=42)
    print("  ‚úì LightGBM: Ready (verbose=-1)")
except Exception as e:
    print(f"  ‚úó LightGBM Error: {e}")

# Test XGBoost
try:
    test_xgb = xgb.XGBRegressor(n_estimators=2, verbosity=0, random_state=42)
    print("  ‚úì XGBoost: Ready (verbosity=0)")
except Exception as e:
    print(f"  ‚úó XGBoost Error: {e}")

# Test CatBoost
try:
    test_cb = cb.CatBoostRegressor(iterations=2, verbose=False, random_seed=42)
    print("  ‚úì CatBoost: Ready (verbose=False)")
except Exception as e:
    print(f"  ‚úó CatBoost Error: {e}")

# NOTE: this message is printed unconditionally, even if a check above failed.
print("\nüéØ All libraries imported successfully! Ready for data analysis.")

# <p style="background: linear-gradient(135deg, #32CD32, #90EE90); font-family:Pacifico,cursive;font-size:150%; color:#FFFFFF; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #FFFFFF; box-shadow: 0px 10px 25px rgba(50, 205, 50, 0.3);text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.4);"> Exploratory Data Analysis</p>

## <p style="background: linear-gradient(135deg, #4682B4, #5F9EA0, #6495ED); font-family:Pacifico,cursive;font-size:150%; color:#FFFFFF; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #FFFFFF; box-shadow: 0px 10px 25px rgba(70, 130, 180, 0.3);text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.4);">2. Data Loading & Initial Inspection</p>

In [None]:
# Load all data files.
# On Kaggle the competition data is mounted under /kaggle/input; anywhere else
# (Colab, local) that directory does not exist, so fall back to the location
# returned by kagglehub in the download cell instead of failing on a
# hard-coded path.
KAGGLE_INPUT_DIR = '/kaggle/input/melting-point'
data_dir = KAGGLE_INPUT_DIR if os.path.isdir(KAGGLE_INPUT_DIR) else melting_point_path

train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

print("üìÅ DATA SHAPES:")
print(f"Train data: {train_df.shape}")
print(f"Test data: {test_df.shape}")
print(f"Sample submission: {sample_submission.shape}")

# Display first few rows
print("\nüìã TRAIN DATA HEAD:")
display(train_df.head(3))

print("\nüìã TEST DATA HEAD:")
display(test_df.head(3))

# Check for missing values (total count over all columns)
print("\nüîç MISSING VALUES CHECK:")
print(f"Train missing values: {train_df.isnull().sum().sum()}")
print(f"Test missing values: {test_df.isnull().sum().sum()}")

# Identify feature columns: every column whose name starts with 'Group'
group_cols = [col for col in train_df.columns if col.startswith('Group')]
print(f"\nüéØ Number of Group descriptor columns: {len(group_cols)}")
print(f"First 5 Group columns: {group_cols[:5]}")

# Basic statistics
print("\nüìä TARGET VARIABLE (Tm) STATISTICS:")
print(train_df['Tm'].describe())

## <p style="background: linear-gradient(135deg, #5F9EA0, #20B2AA, #00CED1); font-family:Pacifico,cursive;font-size:150%; color:#FFFFFF; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #FFFFFF; box-shadow: 0px 10px 25px rgba(95, 158, 160, 0.3);text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.4);">3. Data Visualization</p>

In [None]:
# Create interactive visualization dashboard: a 3x3 grid whose per-cell subplot
# types are declared up front in `specs` (the bottom-left cell is a 3D scene).
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=('Distribution of Melting Points',
                    'Box Plot of Melting Points',
                    'Feature Sparsity Distribution',
                    'Top Correlated Features',
                    'SMILES Length Distribution',
                    'Target vs Top Feature',
                    '3D Feature Correlation',
                    'Feature Value Ranges',
                    'Interactive Correlation Matrix'),
    specs=[[{'type': 'histogram'}, {'type': 'box'}, {'type': 'histogram'}],
           [{'type': 'bar'}, {'type': 'histogram'}, {'type': 'scatter'}],
           [{'type': 'scatter3d'}, {'type': 'bar'}, {'type': 'heatmap'}]],
    vertical_spacing=0.08,
    horizontal_spacing=0.08
)

# 1. Target distribution (Histogram)
fig.add_trace(
    go.Histogram(
        x=train_df['Tm'],
        nbinsx=50,
        name='Melting Points',
        marker_color='skyblue',
        opacity=0.7,
        hoverinfo='x+y',
        hovertemplate='<b>Melting Point</b>: %{x:.1f}K<br>Count: %{y}<extra></extra>'
    ),
    row=1, col=1
)

# Add mean and median lines (dashed vertical markers on the histogram)
mean_val = train_df['Tm'].mean()
median_val = train_df['Tm'].median()

fig.add_vline(x=mean_val, line_dash="dash", line_color="red",
              annotation_text=f"Mean: {mean_val:.1f}K",
              annotation_position="top right",
              row=1, col=1)

fig.add_vline(x=median_val, line_dash="dash", line_color="green",
              annotation_text=f"Median: {median_val:.1f}K",
              annotation_position="top left",
              row=1, col=1)

# 2. Box plot of the target; only outlier points are drawn individually
fig.add_trace(
    go.Box(
        y=train_df['Tm'],
        name='Tm',
        boxpoints='outliers',
        marker_color='lightcoral',
        line_color='black',
        hoverinfo='y'
    ),
    row=1, col=2
)
# 3. Feature sparsity distribution (first 50 features)
# Sparsity = percentage of training rows in which the column equals zero.
n_rows = len(train_df)
feature_sparsity = [
    (train_df[name] == 0).sum() / n_rows * 100
    for name in group_cols[:50]
]

sparsity_trace = go.Histogram(
    x=feature_sparsity,
    nbinsx=30,
    name='Feature Sparsity',
    marker_color='lightgreen',
    opacity=0.7,
    hoverinfo='x+y',
    hovertemplate='<b>Sparsity</b>: %{x:.1f}%<br>Count: %{y}<extra></extra>'
)
fig.add_trace(sparsity_trace, row=1, col=3)

# 4. Top correlated features
# Absolute correlation with Tm for the first 30 Group columns only; an
# undefined correlation (NaN) is scored as 0.
target_series = train_df['Tm']
raw_correlations = (train_df[name].corr(target_series) for name in group_cols[:30])
correlations = [0 if pd.isna(value) else abs(value) for value in raw_correlations]

# argsort is ascending, so the last ten positions are the ten strongest and
# the single strongest feature ends up LAST in top_features.
top_indices = np.argsort(correlations)[-10:]  # Top 10
top_features = [group_cols[idx] for idx in top_indices]
top_correlations = [correlations[idx] for idx in top_indices]

correlation_bars = go.Bar(
    x=top_correlations,
    y=[f'G{idx+1}' for idx in top_indices],
    orientation='h',
    name='Correlation',
    marker_color=px.colors.sequential.Viridis,
    hoverinfo='x+y',
    hovertemplate='<b>Feature</b>: %{y}<br>Correlation: %{x:.3f}<extra></extra>'
)
fig.add_trace(correlation_bars, row=2, col=1)

# 5. SMILES length distribution
# Use the vectorised .str accessor (the same call extract_smiles_features uses)
# instead of .apply(len): a missing SMILES value then yields NaN rather than
# raising TypeError on len(float).
train_df['SMILES_length'] = train_df['SMILES'].str.len()
test_df['SMILES_length'] = test_df['SMILES'].str.len()

# Overlay train and test on the same subplot so the two splits can be compared.
for split_name, split_df, split_color in (('Train', train_df, 'blue'),
                                          ('Test', test_df, 'red')):
    fig.add_trace(
        go.Histogram(
            x=split_df['SMILES_length'],
            name=split_name,
            nbinsx=30,
            opacity=0.6,
            marker_color=split_color,
            hoverinfo='x+y',
            hovertemplate=f'<b>{split_name}</b><br>Length: %{{x}}<br>Count: %{{y}}<extra></extra>'
        ),
        row=2, col=2
    )

# 6. Target vs top feature scatter
if len(top_features) > 0:
    # top_features is in ascending order of |correlation|, so the last entry
    # is the strongest one.
    top_feature = top_features[-1]  # Most correlated
    fig.add_trace(
        go.Scatter(
            x=train_df[top_feature],
            y=train_df['Tm'],
            mode='markers',
            name=f'Tm vs {top_feature}',
            marker=dict(
                color=train_df['Tm'],
                colorscale='Viridis',
                size=6,
                opacity=0.6,
                showscale=True,
                colorbar=dict(title="Tm (K)")
            ),
            hoverinfo='x+y',
            hovertemplate=f'<b>{top_feature}</b>: %{{x}}<br>Tm: %{{y:.1f}}K<extra></extra>'
        ),
        row=2, col=3
    )

# 7. 3D Scatter plot of top 3 features
if len(top_features) >= 3:
    fig.add_trace(
        go.Scatter3d(
            # Only the first 200 rows are plotted (a head slice, not a random sample)
            x=train_df[top_features[-1]][:200],  # Sample for performance
            y=train_df[top_features[-2]][:200],
            z=train_df[top_features[-3]][:200],
            mode='markers',
            marker=dict(
                size=4,
                color=train_df['Tm'][:200],
                colorscale='Plasma',
                opacity=0.7,
                colorbar=dict(title="Tm (K)", x=1.0)
            ),
            name='3D Feature Space',
            hoverinfo='x+y+z+text',
            hovertext=train_df['Tm'][:200].apply(lambda x: f'Tm: {x:.1f}K')
        ),
        row=3, col=1
    )

# 8. Feature value ranges (first 15 features)
range_cols = group_cols[:15]
feature_means = [train_df[name].mean() for name in range_cols]
feature_stds = [train_df[name].std() for name in range_cols]

# Build the labels from the columns actually summarised so the x and y arrays
# always have the same length, even when fewer than 15 Group columns exist.
feature_labels = [f'G{position+1}' for position in range(len(range_cols))]

fig.add_trace(
    go.Bar(
        x=feature_labels,
        y=feature_means,
        name='Mean',
        marker_color='orange',
        opacity=0.7,
        hoverinfo='x+y',
        hovertemplate='<b>Feature</b>: %{x}<br>Mean: %{y:.3f}<extra></extra>'
    ),
    row=3, col=2
)

fig.add_trace(
    go.Bar(
        x=feature_labels,
        y=feature_stds,
        name='Std Dev',
        marker_color='purple',
        opacity=0.7,
        hoverinfo='x+y',
        hovertemplate='<b>Feature</b>: %{x}<br>Std Dev: %{y:.3f}<extra></extra>'
    ),
    row=3, col=2
)

# 9. Interactive correlation matrix (five strongest features plus the target)
if len(top_features) >= 5:
    # top_features is ordered by ASCENDING |correlation| (it comes from
    # np.argsort), so the strongest features sit at the end of the list.
    # Take the last five and list them strongest-first; slicing from the front
    # would select the five weakest of the top ten.
    strongest_features = top_features[-5:][::-1]
    corr_data = train_df[strongest_features + ['Tm']].corr()

    fig.add_trace(
        go.Heatmap(
            z=corr_data.values,
            x=corr_data.columns,
            y=corr_data.index,
            colorscale='RdBu',
            zmid=0,  # centre the diverging colour scale on zero correlation
            hoverinfo='z',
            hovertemplate='<b>X</b>: %{x}<br><b>Y</b>: %{y}<br>Correlation: %{z:.3f}<extra></extra>',
            colorbar=dict(title="Correlation", len=0.3)
        ),
        row=3, col=3
    )

# Update layout
fig.update_layout(
    title_text="Interactive Data Visualization Dashboard",
    title_font=dict(size=24, color='darkblue'),
    height=1200,
    showlegend=True,
    hovermode='closest',
    template='plotly_white'
)

# Update axis labels (addressed per subplot by grid position)
fig.update_xaxes(title_text="Melting Point (K)", row=1, col=1)
fig.update_xaxes(title_text="Sparsity (%)", row=1, col=3)
fig.update_xaxes(title_text="Correlation", row=2, col=1)
fig.update_xaxes(title_text="SMILES Length", row=2, col=2)
# Falls back to a generic label when no top feature was found
fig.update_xaxes(title_text=top_features[-1] if top_features else "Feature", row=2, col=3)
fig.update_xaxes(title_text="Features", row=3, col=2)
fig.update_xaxes(title_text="Features", row=3, col=3)

fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Melting Point (K)", row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=3)
fig.update_yaxes(title_text="Features", row=2, col=1)
fig.update_yaxes(title_text="Count", row=2, col=2)
fig.update_yaxes(title_text="Tm (K)", row=2, col=3)
# NOTE(review): row 3 / col 1 is declared as a scatter3d subplot in make_subplots;
# confirm this y-axis title has any effect on a 3D scene.
fig.update_yaxes(title_text="Feature 2", row=3, col=1)
fig.update_yaxes(title_text="Value", row=3, col=2)
fig.update_yaxes(title_text="Features", row=3, col=3)

fig.show()

# Print summary statistics
print("\nüìà DATA SUMMARY:")
print(f"‚Ä¢ Number of training samples: {len(train_df)}")
print(f"‚Ä¢ Number of test samples: {len(test_df)}")
print(f"‚Ä¢ Number of features: {len(group_cols)}")
print(f"‚Ä¢ Target range: {train_df['Tm'].min():.1f}K to {train_df['Tm'].max():.1f}K")
print(f"‚Ä¢ Target mean ¬± std: {train_df['Tm'].mean():.1f}K ¬± {train_df['Tm'].std():.1f}K")
print(f"‚Ä¢ Average SMILES length (train): {train_df['SMILES_length'].mean():.1f}")
print(f"‚Ä¢ Average SMILES length (test): {test_df['SMILES_length'].mean():.1f}")

##  <p style="background: linear-gradient(135deg, #5F9EA0, #20B2AA, #00CED1); font-family:Pacifico,cursive;font-size:150%; color:#FFFFFF; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #FFFFFF; box-shadow: 0px 10px 25px rgba(95, 158, 160, 0.3);text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.4);">4. Feature Engineering & Preprocessing</p>

In [None]:
# Advanced SMILES feature extraction with RDKit (optional)
def extract_advanced_smiles_features(df):
    """Extract RDKit molecular descriptors for every SMILES string in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'SMILES'`` column. Values are converted with ``str``
        before being parsed.

    Returns
    -------
    pandas.DataFrame
        Indexed exactly like ``df``. When RDKit is importable it holds one
        column per descriptor (``mol_weight``, ``heavy_atoms``,
        ``num_rotatable_bonds``, ``num_h_donors``, ``num_h_acceptors``,
        ``tpsa``, ``logp``); rows whose SMILES cannot be parsed, or whose
        descriptor computation fails, are filled with 0. When RDKit is not
        installed the frame has no columns.
    """
    features = pd.DataFrame(index=df.index)

    # Keep the try body limited to the optional import it guards.
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors
    except ImportError:
        print("RDKit not available. Using basic features only.")
        return features

    print("Using RDKit for advanced feature extraction...")

    # Single source of truth for the feature names and how to compute them.
    descriptor_funcs = {
        'mol_weight': Descriptors.MolWt,
        'heavy_atoms': Descriptors.HeavyAtomCount,
        'num_rotatable_bonds': Descriptors.NumRotatableBonds,
        'num_h_donors': Descriptors.NumHDonors,
        'num_h_acceptors': Descriptors.NumHAcceptors,
        'tpsa': Descriptors.TPSA,
        'logp': Descriptors.MolLogP,
    }
    zero_row = dict.fromkeys(descriptor_funcs, 0)

    mol_features = []
    for smiles in df['SMILES']:
        try:
            mol = Chem.MolFromSmiles(str(smiles))
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                feat = dict(zero_row)
            else:
                feat = {name: func(mol) for name, func in descriptor_funcs.items()}
        except Exception:
            # Best-effort: one bad molecule must not abort the whole batch, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            feat = dict(zero_row)
        mol_features.append(feat)

    # Build the descriptor frame on df's own index. With a default RangeIndex
    # here, the concat below would align on labels and misplace (or duplicate)
    # rows whenever df.index is not 0..n-1.
    mol_df = pd.DataFrame(mol_features, index=df.index, columns=list(descriptor_funcs))
    features = pd.concat([features, mol_df], axis=1)
    print(f"Added {mol_df.shape[1]} RDKit features")

    return features

## <p style="background: linear-gradient(135deg, #FF8C00, #FF6347); font-family:Pacifico,cursive;font-size:150%; color:#FFFFFF; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #FFFFFF; box-shadow: 0px 10px 25px rgba(255, 140, 0, 0.3);text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.4);">5. Model Training & Evaluation</p>

In [None]:
print("üîß Preparing features and target...")
# Identify feature columns: every column whose name starts with 'Group'
group_cols = [col for col in train_df.columns if col.startswith('Group')]

# Separate features and target; the target is taken as a plain array so it can
# be indexed positionally by the cross-validation splits later on.
X_train = train_df[group_cols]
X_test = test_df[group_cols]
y_train = train_df['Tm'].values

print(f"Original shapes - X_train: {X_train.shape}, X_test: {X_test.shape}")

print("üõ†Ô∏è Adding SMILES-based features...")

def extract_smiles_features(df):
    """Derive simple string-based counts from the ``'SMILES'`` column of ``df``.

    Returns a DataFrame indexed like ``df`` with the SMILES length, the number
    of 'C', 'O' and 'N' characters, the number of '=' and '#' bond symbols and
    the number of parentheses. Counts are plain character matches (so 'C' is
    also matched inside 'Cl') and are case-sensitive.
    """
    smiles = df['SMILES']

    # Column name -> regular expression handed to Series.str.count.
    count_patterns = {
        'num_c': 'C',
        'num_o': 'O',
        'num_n': 'N',
        'num_double_bonds': '=',
        'num_triple_bonds': '#',
    }

    features = pd.DataFrame(index=df.index)
    features['smiles_length'] = smiles.str.len()
    for column, pattern in count_patterns.items():
        features[column] = smiles.str.count(pattern)

    # One character class counts opening and closing parentheses together.
    features['num_branch'] = smiles.str.count(r'[()]')

    return features

# Add SMILES features
train_smiles_features = extract_smiles_features(train_df)
test_smiles_features = extract_smiles_features(test_df)

# Combine with original features (column-wise, aligned on the row index)
X_train = pd.concat([X_train, train_smiles_features], axis=1)
X_test = pd.concat([X_test, test_smiles_features], axis=1)

print(f"After feature engineering - X_train: {X_train.shape}, X_test: {X_test.shape}")

print("üßπ Handling missing values...")
# Impute BOTH splits with medians learned from the training data. Filling the
# test set with its own medians would apply a different transformation to the
# test rows than the one the models were trained on.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print("‚öñÔ∏è Scaling features...")
# The scaler is likewise fitted on the training data only and reused for test.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Data preprocessing complete!")
print(f"Final shapes - X_train_scaled: {X_train_scaled.shape}, X_test_scaled: {X_test_scaled.shape}")

# ====================================================================
# MODEL TRAINING
# ====================================================================

print("ü§ñ Training Machine Learning Models...")

# Define models: three tree ensembles and two regularised linear models,
# keyed by the display name used in the result tables and plots.
models = {
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ),
    'XGBoost': xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        random_state=42,
        n_jobs=-1
    ),
    'RandomForest': RandomForestRegressor(
        n_estimators=500,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    ),
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Lasso': Lasso(alpha=0.001, max_iter=10000, random_state=42)
}

# Cross-validation setup: shuffled K-fold with a fixed seed, so every model is
# evaluated on the same splits.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Store results, each keyed by model name
cv_results = {}        # summary metrics per model
oof_predictions = {}   # out-of-fold predictions on the training rows
test_predictions = {}  # fold-averaged predictions on the test rows
print(f"\nüîç Performing {n_folds}-fold cross-validation...")

# For every model: fit on each training fold, collect out-of-fold predictions
# for the held-out rows and average the test predictions over the folds.
for model_name, model in models.items():
    print(f"\nüìä Training {model_name}...")

    fold_scores = []
    oof_pred = np.zeros(len(X_train_scaled))
    test_pred = np.zeros(len(X_test_scaled))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled, y_train)):
        # Split data
        X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        # Train (the same estimator object is re-fitted on every fold)
        model.fit(X_tr, y_tr)

        # Predict the held-out rows and store them at their original positions
        val_pred = model.predict(X_val)
        oof_pred[val_idx] = val_pred

        # Test prediction (average across folds)
        test_pred += model.predict(X_test_scaled) / n_folds

        # Calculate MAE for this fold
        fold_mae = mean_absolute_error(y_val, val_pred)
        fold_scores.append(fold_mae)

        print(f"  Fold {fold+1}: MAE = {fold_mae:.3f}")

    # Store results: MAE is summarised per fold, R-squared is computed once on
    # the complete out-of-fold prediction vector.
    mean_mae = np.mean(fold_scores)
    std_mae = np.std(fold_scores)

    cv_results[model_name] = {
        'mean_mae': mean_mae,
        'std_mae': std_mae,
        'scores': fold_scores,
        'r2': r2_score(y_train, oof_pred)
    }

    oof_predictions[model_name] = oof_pred
    test_predictions[model_name] = test_pred

    print(f"  ‚úÖ {model_name} CV MAE: {mean_mae:.3f} ¬± {std_mae:.3f}")
    print(f"  üìà {model_name} R¬≤ Score: {cv_results[model_name]['r2']:.4f}")

# Summarise the cross-validation metrics in one table, best (lowest MAE) first
model_names = list(cv_results)
summaries = [cv_results[name] for name in model_names]

results_df = pd.DataFrame({
    'Model': model_names,
    'Mean MAE': [summary['mean_mae'] for summary in summaries],
    'Std MAE': [summary['std_mae'] for summary in summaries],
    'R¬≤ Score': [summary['r2'] for summary in summaries],
    'Min MAE': [min(summary['scores']) for summary in summaries],
    'Max MAE': [max(summary['scores']) for summary in summaries]
})
results_df = results_df.sort_values('Mean MAE')

separator = "=" * 60
print("\n" + separator)
print("üìã CROSS-VALIDATION RESULTS")
print(separator)
display(results_df)

# Interactive visualization: 2x2 grid comparing the cross-validated models
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Model Performance Comparison (MAE)',
                    'Model R¬≤ Scores',
                    'Cross-Validation Fold Performance',
                    'Actual vs Predicted - Best Model'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'box'}, {'type': 'scatter'}]],
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

# 1. MAE comparison with error bars (bar = mean fold MAE, whisker = its std)
fig.add_trace(
    go.Bar(
        x=results_df['Model'],
        y=results_df['Mean MAE'],
        error_y=dict(type='data', array=results_df['Std MAE']),
        name='MAE ¬± Std',
        marker_color=px.colors.qualitative.Set3,
        text=results_df['Mean MAE'].round(3),
        textposition='auto',
        hoverinfo='x+y+text',
        hovertemplate='<b>%{x}</b><br>MAE: %{y:.3f}<br>Std: %{error_y.array:.3f}<extra></extra>'
    ),
    row=1, col=1
)

# 2. R-squared scores (computed on the out-of-fold predictions)
fig.add_trace(
    go.Bar(
        x=results_df['Model'],
        y=results_df['R¬≤ Score'],
        name='R¬≤ Score',
        marker_color=px.colors.sequential.Viridis,
        text=results_df['R¬≤ Score'].round(3),
        textposition='auto',
        hoverinfo='x+y',
        hovertemplate='<b>%{x}</b><br>R¬≤: %{y:.3f}<extra></extra>'
    ),
    row=1, col=2
)

# 3. Box plot of fold scores: one box per model, coloured by the model's
# position in cv_results, with every fold shown as an individual point
fold_data = [
    go.Box(
        y=result['scores'],
        name=name,
        marker_color=px.colors.qualitative.Pastel[position],
        boxpoints='all',
        jitter=0.3,
        pointpos=-1.8
    )
    for position, (name, result) in enumerate(cv_results.items())
]

for box_trace in fold_data:
    fig.add_trace(box_trace, row=2, col=1)

# 4. Actual vs Predicted for best model
# results_df is sorted by ascending mean MAE, so its first row is the best model.
best_model = results_df.iloc[0]['Model']
y_pred_best = oof_predictions[best_model]

fig.add_trace(
    go.Scatter(
        x=y_train,
        y=y_pred_best,
        mode='markers',
        name='Predictions',
        marker=dict(
            color=y_train,
            colorscale='Viridis',
            size=8,
            opacity=0.6,
            showscale=True,
            colorbar=dict(title="Actual Tm", x=1.02)
        ),
        hoverinfo='x+y',
        hovertemplate='<b>Actual</b>: %{x:.1f}K<br><b>Predicted</b>: %{y:.1f}K<extra></extra>'
    ),
    row=2, col=2
)

# Add perfect prediction line (y = x) spanning the combined range of the
# actual values and the best model's out-of-fold predictions
min_val = min(y_train.min(), y_pred_best.min())
max_val = max(y_train.max(), y_pred_best.max())
fig.add_trace(
    go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Perfect Prediction',
        line=dict(color='red', dash='dash', width=2),
        hoverinfo='skip'
    ),
    row=2, col=2
)

# Update layout
fig.update_layout(
    title_text=f"Model Performance Dashboard | Best Model: {best_model}",
    title_font=dict(size=22, color='darkblue'),
    height=900,
    showlegend=True,
    template='plotly_white',
    hovermode='closest'
)

# Update axis labels
fig.update_xaxes(title_text="Model", row=1, col=1)
fig.update_xaxes(title_text="Model", row=1, col=2)
fig.update_xaxes(title_text="Model", row=2, col=1)
fig.update_xaxes(title_text="Actual Tm (K)", row=2, col=2)

fig.update_yaxes(title_text="Mean Absolute Error (MAE)", row=1, col=1)
fig.update_yaxes(title_text="R¬≤ Score", row=1, col=2)
fig.update_yaxes(title_text="Fold MAE", row=2, col=1)
fig.update_yaxes(title_text="Predicted Tm (K)", row=2, col=2)

fig.show()

## <p style="background: linear-gradient(135deg, #FFD700, #FFA500, #FF8C00); font-family:Pacifico,cursive;font-size:150%; color:#8B4513; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #8B4513; box-shadow: 0px 10px 25px rgba(255, 215, 0, 0.3);text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.5);">6. Ensemble Modeling & Final Predictions</p>

In [None]:
print("üéØ Creating Ensemble Model...")

# Inverse-MAE weighting: the lower a model's CV error, the larger its share.
inverse_mae = {
    name: 1 / result['mean_mae']
    for name, result in cv_results.items()
}

# Normalize so the weights sum to one
total_weight = sum(inverse_mae.values())
weights = {name: raw / total_weight for name, raw in inverse_mae.items()}

print("\n‚öñÔ∏è Ensemble Weights:")
for name, share in weights.items():
    print(f"  {name}: {share:.3f}")

# Blend the out-of-fold (train) and fold-averaged (test) predictions
ensemble_train = np.zeros(len(X_train_scaled))
ensemble_test = np.zeros(len(X_test_scaled))

for name in test_predictions:
    share = weights[name]
    ensemble_train += oof_predictions[name] * share
    ensemble_test += test_predictions[name] * share

# Score the blended out-of-fold predictions against the training target
ensemble_mae = mean_absolute_error(y_train, ensemble_train)
ensemble_r2 = r2_score(y_train, ensemble_train)

print("\n‚úÖ Ensemble Performance:")
print(f"  MAE: {ensemble_mae:.3f}")
print(f"  R¬≤: {ensemble_r2:.3f}")

# Create interactive ensemble visualization (2x2 grid)
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Model Weights in Ensemble',
                    'Performance Improvement (%)',
                    'Residual Distribution',
                    'Ensemble Predictions vs Actual'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'histogram'}, {'type': 'scatter'}]]
)

# 1. Model weights (the normalised share each model contributes to the blend)
fig.add_trace(
    go.Bar(
        x=list(weights.keys()),
        y=list(weights.values()),
        name='Weight',
        marker_color=px.colors.sequential.Plasma,
        text=[f'{w:.3f}' for w in weights.values()],
        textposition='auto',
        hoverinfo='x+y',
        hovertemplate='<b>%{x}</b><br>Weight: %{y:.3f}<extra></extra>'
    ),
    row=1, col=1
)

# 2. Performance improvement
# Relative MAE reduction of the ensemble versus each single model, in percent
# (positive = the ensemble has the lower MAE).
improvements = [
    ((result['mean_mae'] - ensemble_mae) / result['mean_mae']) * 100
    for result in cv_results.values()
]

improvement_bars = go.Bar(
    x=list(cv_results.keys()),
    y=improvements,
    name='Improvement',
    marker_color=['green' if gain > 0 else 'red' for gain in improvements],
    text=[f'{gain:+.1f}%' for gain in improvements],
    textposition='auto',
    hoverinfo='x+y',
    hovertemplate='<b>%{x}</b><br>Improvement: %{y:.1f}%<extra></extra>'
)
fig.add_trace(improvement_bars, row=1, col=2)

# 3. Residual distribution (residual = actual - ensemble prediction)
residuals = y_train - ensemble_train

fig.add_trace(
    go.Histogram(
        x=residuals,
        nbinsx=50,
        name='Residuals',
        marker_color='lightblue',
        opacity=0.7,
        hoverinfo='x+y',
        hovertemplate='<b>Residual</b>: %{x:.2f}<br>Count: %{y}<extra></extra>'
    ),
    row=2, col=1
)

# Add mean and std lines: the mean as a dashed line, plus/minus one standard
# deviation as dotted lines
residual_mean = residuals.mean()
residual_std = residuals.std()

fig.add_vline(x=residual_mean, line_dash="dash", line_color="red",
              annotation_text=f"Mean: {residual_mean:.2f}",
              annotation_position="top right",
              row=2, col=1)

fig.add_vline(x=residual_mean + residual_std, line_dash="dot", line_color="orange",
              annotation_text=f"+1œÉ",
              row=2, col=1)

fig.add_vline(x=residual_mean - residual_std, line_dash="dot", line_color="orange",
              annotation_text=f"-1œÉ",
              row=2, col=1)

# 4. Ensemble predictions vs actual
fig.add_trace(
    go.Scatter(
        x=y_train,
        y=ensemble_train,
        mode='markers',
        name='Predictions',
        marker=dict(
            color=residuals,
            colorscale='RdBu',
            size=8,
            opacity=0.6,
            showscale=True,
            colorbar=dict(title="Residuals", x=1.02),
            # Clamp the colour range to three standard deviations either side
            cmin=-3*residual_std,
            cmax=3*residual_std
        ),
        hoverinfo='x+y+text',
        hovertemplate='<b>Actual</b>: %{x:.1f}K<br><b>Predicted</b>: %{y:.1f}K<br><b>Error</b>: %{text:.1f}K<extra></extra>',
        # The hover "Error" is the absolute residual; the sign is only in the colour
        text=abs(residuals)
    ),
    row=2, col=2
)

# Add perfect prediction line (y = x).
# Derive the endpoints from the data plotted in THIS panel (actual values and
# ensemble predictions) instead of reusing min_val/max_val left over from the
# best-model plot in an earlier cell, so the line always spans the points and
# this cell does not depend on that cell's leftovers.
ensemble_line_min = min(y_train.min(), ensemble_train.min())
ensemble_line_max = max(y_train.max(), ensemble_train.max())

fig.add_trace(
    go.Scatter(
        x=[ensemble_line_min, ensemble_line_max],
        y=[ensemble_line_min, ensemble_line_max],
        mode='lines',
        name='Perfect',
        line=dict(color='red', dash='dash', width=2),
        hoverinfo='skip'
    ),
    row=2, col=2
)

# Update layout; the title carries the ensemble's headline metrics
fig.update_layout(
    title_text=f"Ensemble Model Analysis | MAE: {ensemble_mae:.3f} | R¬≤: {ensemble_r2:.3f}",
    title_font=dict(size=22, color='darkblue'),
    height=800,
    showlegend=True,
    template='plotly_white'
)

# Axis titles per subplot, addressed by grid position
fig.update_xaxes(title_text="Model", row=1, col=1)
fig.update_xaxes(title_text="Model", row=1, col=2)
fig.update_xaxes(title_text="Residual (Actual - Predicted)", row=2, col=1)
fig.update_xaxes(title_text="Actual Tm (K)", row=2, col=2)

fig.update_yaxes(title_text="Weight", row=1, col=1)
fig.update_yaxes(title_text="Improvement (%)", row=1, col=2)
fig.update_yaxes(title_text="Count", row=2, col=1)
fig.update_yaxes(title_text="Predicted Tm (K)", row=2, col=2)

fig.show()

## <p style="background: linear-gradient(135deg, #DC143C, #FF4500); font-family:Pacifico,cursive;font-size:150%; color:#FFFFFF; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #FFD700; box-shadow: 0px 10px 25px rgba(220, 20, 60, 0.3);text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.4);">7. Results & Submission</p>


In [None]:
# %% [code]
# <p style="background-image: url(https://th.bing.com/th/id/OIP.QDu8NVyGf3k-w9iVMvmKJAHaEO?rs=1&pid=ImgDetMain);font-family:Pacifico,cursive;font-size:150%; color:#E6E6FA; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid rgb(118, 20, 198); box-shadow: 0px 10px 25px rgba(0, 0, 0, 0.3);text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.4); background-size: cover; background-repeat: no-repeat; background-position: center;">7. Final Submission</p>

# Status messages use the real emoji characters; they were previously stored
# as mis-decoded bytes and printed as garbage.
print("📤 Preparing Final Submission...")

# Create submission DataFrame: copy the sample file so its layout is kept,
# then overwrite the target column with the ensemble's test predictions.
submission = sample_submission.copy()
submission['Tm'] = ensemble_test

# Clip predictions to reasonable range: the span of Tm seen in training,
# widened by 10% of that span on each side.
train_min, train_max = train_df['Tm'].min(), train_df['Tm'].max()
margin = 0.1 * (train_max - train_min)
submission['Tm'] = submission['Tm'].clip(lower=train_min - margin, upper=train_max + margin)

# Save submission file (index=False keeps the row index out of the CSV)
submission_file = 'submission.csv'
submission.to_csv(submission_file, index=False)
print(f"\n✅ Submission file saved as: {submission_file}")

# Create simple histogram comparison of training targets vs. clipped test
# predictions, drawn as two semi-transparent overlaid histograms.
fig = go.Figure()

# Add training data histogram
fig.add_trace(go.Histogram(
    x=train_df['Tm'],
    nbinsx=40,
    name='Training Data',
    marker_color='blue',
    opacity=0.6,
    hovertemplate='<b>Training</b><br>Tm: %{x:.1f}K<br>Count: %{y}<extra></extra>'
))

# Add test predictions histogram
fig.add_trace(go.Histogram(
    x=submission['Tm'],
    nbinsx=40,
    name='Test Predictions',
    marker_color='red',
    opacity=0.6,
    hovertemplate='<b>Prediction</b><br>Tm: %{x:.1f}K<br>Count: %{y}<extra></extra>'
))

# Update layout. barmode='overlay' draws both histograms on the same bars
# area; each trace already carries opacity=0.6, so no further opacity
# adjustment is needed to see both.
# NOTE(review): the "Best Model" annotation reads results_df.iloc[0], which
# presumes results_df is ordered best-first — confirm where it is built.
fig.update_layout(
    title=f"Final Submission: Distribution Comparison<br><sup>Ensemble MAE: {ensemble_mae:.3f} | R²: {ensemble_r2:.3f}</sup>",
    title_font=dict(size=20, color='darkblue'),
    xaxis_title="Melting Point (K)",
    yaxis_title="Count",
    barmode='overlay',
    template='plotly_white',
    height=500,
    showlegend=True,
    legend=dict(x=0.02, y=0.98, bgcolor='rgba(255, 255, 255, 0.8)'),
    annotations=[
        dict(
            text=f"Best Model: {results_df.iloc[0]['Model']}",
            x=0.98, y=0.98,
            xref="paper", yref="paper",
            showarrow=False,
            font=dict(size=12),
            align="right",
            bgcolor="lightyellow"
        ),
        dict(
            text=f"Test Samples: {len(submission)}",
            x=0.98, y=0.92,
            xref="paper", yref="paper",
            showarrow=False,
            font=dict(size=12),
            align="right",
            bgcolor="lightyellow"
        )
    ]
)

fig.show()

# Display sample of submission
# (emoji and bullet characters below are the real Unicode characters; they
# were previously stored as mis-decoded bytes and printed as garbage)
print("\n📋 SUBMISSION FILE SAMPLE:")
display(submission.head())

# Print final statistics: descriptive statistics of the clipped predictions
print("\n" + "="*60)
print("📊 PREDICTION STATISTICS:")
print("="*60)
print(f"Minimum Prediction: {submission['Tm'].min():.2f}K")
print(f"Maximum Prediction: {submission['Tm'].max():.2f}K")
print(f"Mean Prediction:    {submission['Tm'].mean():.2f}K")
print(f"Std Deviation:      {submission['Tm'].std():.2f}K")
print(f"25th Percentile:    {submission['Tm'].quantile(0.25):.2f}K")
print(f"75th Percentile:    {submission['Tm'].quantile(0.75):.2f}K")

# Closing banner and recap of the headline numbers.
# NOTE(review): results_df.iloc[0] is reported as the best individual model,
# which presumes results_df is ordered best-first — confirm where it is built.
print("\n" + "="*60)
print("🎉 NOTEBOOK COMPLETED SUCCESSFULLY! 🎉")
print("="*60)
print("\n📋 FINAL RESULTS SUMMARY:")
print(f"  • Best Individual Model: {results_df.iloc[0]['Model']} (MAE: {results_df.iloc[0]['Mean MAE']:.3f})")
print(f"  • Ensemble MAE: {ensemble_mae:.3f}")
print(f"  • Ensemble R²: {ensemble_r2:.3f}")
print(f"  • Total Features Used: {X_train_scaled.shape[1]}")
print(f"  • Submission File: {submission_file}")

### <p style="background: linear-gradient(135deg, #FF69B4, #FF1493, #DB7093); font-family:Pacifico,cursive;font-size:150%; color:#FFFFFF; text-align:center; border-radius: 20%; padding:25px; font-weight: normal; border: 4px solid #FFFFFF; box-shadow: 0px 10px 25px rgba(255, 105, 180, 0.3);text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.4);">Thanks for Reading! If you like it, please Share and Upvote this notebook ❤️</p>
