In [8]:

# !pip install numpy=1.26.4 pandas=2.2.2 -y


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler ,  StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pickle

AttributeError: module 'pyarrow' has no attribute '__version__'

In [9]:
!pip install tensorflow

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/3c/e3/e868f1d5951047f950d2ba1e04a765a3328a51f06996b67976d6102f8227/tensorflow-2.19.0-cp311-cp311-win_amd64.whl.metadata
  Using cached tensorflow-2.19.0-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Collecting wrapt>=1.11.0 (from tensorflow)
  Obtaining dependency information for wrapt>=1.11.0 from https://files.pythonhosted.org/packages/47/f8/fb1773491a253cbc123c5d5dc15c86041f746ed30416535f2a8df1f4a392/wrapt-1.17.2-cp311-cp311-win_amd64.whl.metadata
  Downloading wrapt-1.17.2-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Collecting markdown>=2.6.8 (from tensorboard~=2.19.0->tensorflow)
  Obtaining dependency information for markdown>=2.6.8 from https://files.pythonhosted.org/packages/3f/08/83871f3c50fc983b88547c196d11cf8c3340e37c32d2e9d6152abe2c61f7/Markdown-3.7-py3-none-any.whl.metadata
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting werkzeug>=1.0.

In [None]:
# Both CSV files live in the same folder; keep that location in a single
# constant so the notebook can be pointed at another copy by editing one line.
DATA_DIR = r'C:\Users\NAGUI\Downloads\Projet PIE - Encrypted\Projet PIE - Encrypted\encoded data'

# Feature table and cross-reference table (file names as shipped).
df_feat = pd.read_csv(DATA_DIR + r'\opamps-features.csv')
df_ref = pd.read_csv(DATA_DIR + r'\opamps-xref.csv')

# Prétraitement

In [None]:
# Column groups used by the pre-processing steps below.
categorical_columns = [
    "Supplier_Package",
    "MANUFACTURER",
]
numerical_columns = [
    "Maximum Input Offset Voltage",
    "Maximum Single Supply Voltage",
    "Minimum Single Supply Voltage",
    "Number of Channels per Chip",
    "Typical Gain Bandwidth Product",
]

## Normalisation des données numériques

In [None]:
import helper_functions.preprocess as pp

# Scale the numeric columns of the feature table with the project helper.
# The second return value is kept as `scaler` (presumably the fitted scaler
# object - confirm in helper_functions.preprocess).
df_feat_scaled , scaler = pp.scale(df_feat,numerical_columns)
# Combine the scaled features with the cross-reference table; the join logic
# lives in pp.merge_datasets and is not visible in this notebook.
df = pp.merge_datasets(df_feat_scaled,df_ref)

## Conversion de la Cross Reference Type en valeur numérique

In [None]:
# Mean closeness assigned to each letter A-D (the keys are passed to
# pp.generate_closeness together with the per-letter spreads below).
base_means = dict(A=0.95, B=0.8, C=0.65, D=0.5)

# Per-letter standard deviation: a common 0.025 divided by a letter-specific
# factor, so 'A' gets the narrowest spread and 'D' the widest.
base_std = {
    letter: 0.025 / divisor
    for letter, divisor in (('A', 3), ('B', 2.5), ('C', 2), ('D', 1.5))
}

In [None]:
# Add closeness values via the project helper, using the per-letter means and
# standard deviations defined above (presumably it writes the 'Closeness'
# column that the plot below reads - confirm in pp.generate_closeness).
# NOTE(review): `df` is overwritten, so re-running this cell feeds the helper
# its own previous output.
df = pp.generate_closeness(df,base_means,base_std,n_std=3)
# One density curve per Cross Reference Type; common_norm=False normalises
# each curve independently instead of scaling by group size.
sns.kdeplot(data=df, x='Closeness', hue='Cross Reference Type', fill=True, common_norm=False)

In [None]:
# Show the frame after closeness generation (rendered as the cell's output).
df

In [None]:
# List the column names of the current frame.
df.columns

In [None]:
# Keep only the part number, the manufacturer and the feature columns; every
# other column of the merged frame is dropped from here on.
selected_columns = [
    'MPN',
    'MANUFACTURER',
    'Maximum Input Offset Voltage',
    'Maximum Single Supply Voltage',
    'Minimum Single Supply Voltage',
    'Number of Channels per Chip',
    'Supplier_Package',
    'Typical Gain Bandwidth Product',
]
df = df[selected_columns]

In [None]:
# Distinct package values present in the data.
df['Supplier_Package'].unique()

In [None]:
# One row per (part number, manufacturer) pair; the first occurrence is kept.
df = df.drop_duplicates(subset=['MPN', 'MANUFACTURER'], keep='first')

In [None]:
# Show the frame after column selection and de-duplication.
df

In [None]:
# Row count of the de-duplicated frame; used as the denominator below.
nrow = len(df)
print(nrow)

# Rows per package, then each package's share of all rows.
package_counts = df.groupby('Supplier_Package').size()
package_frequencies = package_counts / nrow
print(package_frequencies)

In [None]:
# Numeric inputs are taken unchanged from the frame; the label is the manufacturer.
original_numerical = [
    'Maximum Input Offset Voltage',
    'Maximum Single Supply Voltage',
    'Minimum Single Supply Voltage',
    'Number of Channels per Chip',
    'Typical Gain Bandwidth Product',
]
X_numerical = df[original_numerical].values
y = df['MANUFACTURER'].values

# One-hot encode the package, then multiply every indicator column by that
# package's relative frequency, so a 1 becomes the package's share of rows.
package_dummies = pd.get_dummies(df['Supplier_Package']).astype('float')
package_freq_df = package_dummies.apply(
    lambda indicator: indicator * package_frequencies[indicator.name]
)
X_package_freq = package_freq_df.values

# Final matrix: numeric columns first, frequency-weighted package columns after.
X = np.hstack([X_numerical, X_package_freq])
# NOTE(review): this rebinds `numerical_columns`, which from here on also
# contains the package indicator names.
numerical_columns = [*original_numerical, *package_freq_df.columns.tolist()]


In [None]:
# Sanity check on the frequency-weighted package matrix: the first five rows,
# followed by the overall value range.
print("X_package_freq sample:", X_package_freq[:5], sep="\n")
print(f"X_package_freq min: {X_package_freq.min()}, max: {X_package_freq.max()}")

In [None]:
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import itertools
from sklearn.decomposition import NMF

# Assuming your preprocessed df is already loaded from the notebook
# If not, you would need to run the preprocessing steps first

# t-SNE search space: 3 x 3 x 2 x 1 = 18 combinations per dimensionality.
param_grid = dict(
    perplexity=[30, 50, 70],
    learning_rate=[100, 200, 'auto'],
    n_iter=[1000, 2000],
    early_exaggeration=[12.0],
)

## 2D t-SNE Optimization Function

This function performs hyperparameter optimization for 2D t-SNE projection:
- Takes numerical features (X) and manufacturer labels (y) as input
- Performs grid search over the reduced parameter space (~18 combinations)
- Evaluates each combination using silhouette score
- Returns the best model, parameters, and score

In [None]:
# Show the combined feature-name list built above (numeric columns followed by
# the package indicator columns).
numerical_columns

In [None]:
def optimize_tsne_2d(X, y, param_grid, random_state=42):
    """
    Grid-search t-SNE hyperparameters for a 2D projection.

    Every combination of ``param_grid['perplexity']``,
    ``param_grid['learning_rate']``, ``param_grid['n_iter']`` and
    ``param_grid['early_exaggeration']`` is fitted on ``X`` and ranked by the
    silhouette score of the resulting embedding against ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``TSNE.fit_transform``.
    y : array-like
        Labels passed to ``silhouette_score`` (manufacturer labels in this
        notebook).
    param_grid : dict
        Must contain the four keys listed above, each mapping to an iterable
        of candidate values.
    random_state : int, default 42
        Seed forwarded to every TSNE instance.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_score)``: the fitted TSNE with the
        highest score, its parameters (keyed like ``param_grid``) and that
        score. ``(None, None, -1)`` when no combination could be fitted and
        scored.
    """
    import inspect

    # scikit-learn 1.5 renamed TSNE's ``n_iter`` argument to ``max_iter`` and
    # a later release removed the old name. Pick whichever keyword the
    # installed version accepts; otherwise every fit raises a TypeError that
    # the best-effort ``except`` below would silently turn into "no result".
    tsne_init_params = inspect.signature(TSNE.__init__).parameters
    iter_kwarg = 'max_iter' if 'max_iter' in tsne_init_params else 'n_iter'

    best_score = -1
    best_params = None
    best_model = None

    # Generate all parameter combinations
    param_combinations = list(itertools.product(
        param_grid['perplexity'],
        param_grid['learning_rate'],
        param_grid['n_iter'],
        param_grid['early_exaggeration']
    ))

    for params in param_combinations:
        perplexity, learning_rate, n_iter, early_exaggeration = params

        try:
            # Create and fit TSNE model
            tsne = TSNE(
                n_components=2,
                perplexity=perplexity,
                learning_rate=learning_rate,
                early_exaggeration=early_exaggeration,
                random_state=random_state,
                n_jobs=-1,
                **{iter_kwarg: n_iter}
            )

            # Transform the data
            X_transformed = tsne.fit_transform(X)

            # Calculate silhouette score
            score = silhouette_score(X_transformed, y)

            # The first successful fit always becomes the incumbent, so a
            # legitimate score of exactly -1 can still be selected.
            if best_model is None or score > best_score:
                best_score = score
                best_params = {
                    'perplexity': perplexity,
                    'learning_rate': learning_rate,
                    'n_iter': n_iter,
                    'early_exaggeration': early_exaggeration
                }
                best_model = tsne

            print(f"2D - Params: {params}, Silhouette Score: {score:.4f}")

        except Exception as e:
            # Best effort: report the failing combination and keep searching.
            print(f"2D - Error with params {params}: {str(e)}")
            continue

    return best_model, best_params, best_score

## 3D t-SNE Optimization Function

This function performs hyperparameter optimization for 3D t-SNE projection:
- Similar to the 2D version but projects to 3 dimensions
- Uses the same reduced parameter space (~18 combinations)
- Evaluates using silhouette score with MANUFACTURER as reference
- Returns the best model, parameters, and score

In [None]:
def optimize_tsne_3d(X, y, param_grid, random_state=42):
    """
    Grid-search t-SNE hyperparameters for a 3D projection.

    Every combination of ``param_grid['perplexity']``,
    ``param_grid['learning_rate']``, ``param_grid['n_iter']`` and
    ``param_grid['early_exaggeration']`` is fitted on ``X`` and ranked by the
    silhouette score of the resulting embedding against ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``TSNE.fit_transform``.
    y : array-like
        Labels passed to ``silhouette_score`` (manufacturer labels in this
        notebook).
    param_grid : dict
        Must contain the four keys listed above, each mapping to an iterable
        of candidate values.
    random_state : int, default 42
        Seed forwarded to every TSNE instance.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_score)``: the fitted TSNE with the
        highest score, its parameters (keyed like ``param_grid``) and that
        score. ``(None, None, -1)`` when no combination could be fitted and
        scored.
    """
    import inspect

    # scikit-learn 1.5 renamed TSNE's ``n_iter`` argument to ``max_iter`` and
    # a later release removed the old name. Pick whichever keyword the
    # installed version accepts; otherwise every fit raises a TypeError that
    # the best-effort ``except`` below would silently turn into "no result".
    tsne_init_params = inspect.signature(TSNE.__init__).parameters
    iter_kwarg = 'max_iter' if 'max_iter' in tsne_init_params else 'n_iter'

    best_score = -1
    best_params = None
    best_model = None

    # Generate all parameter combinations
    param_combinations = list(itertools.product(
        param_grid['perplexity'],
        param_grid['learning_rate'],
        param_grid['n_iter'],
        param_grid['early_exaggeration']
    ))

    for params in param_combinations:
        perplexity, learning_rate, n_iter, early_exaggeration = params

        try:
            # Create and fit TSNE model
            tsne = TSNE(
                n_components=3,
                perplexity=perplexity,
                learning_rate=learning_rate,
                early_exaggeration=early_exaggeration,
                random_state=random_state,
                n_jobs=-1,
                **{iter_kwarg: n_iter}
            )

            # Transform the data
            X_transformed = tsne.fit_transform(X)

            # Calculate silhouette score
            score = silhouette_score(X_transformed, y)

            # The first successful fit always becomes the incumbent, so a
            # legitimate score of exactly -1 can still be selected.
            if best_model is None or score > best_score:
                best_score = score
                best_params = {
                    'perplexity': perplexity,
                    'learning_rate': learning_rate,
                    'n_iter': n_iter,
                    'early_exaggeration': early_exaggeration
                }
                best_model = tsne

            print(f"3D - Params: {params}, Silhouette Score: {score:.4f}")

        except Exception as e:
            # Best effort: report the failing combination and keep searching.
            print(f"3D - Error with params {params}: {str(e)}")
            continue

    return best_model, best_params, best_score

## Data Preparation and Optimization Execution

This section:
- Prepares the numerical features and labels from the preprocessed DataFrame, then reduces the features to 10 non-negative components with NMF
- Executes the optimization for both 2D and 3D projections with reduced grid
- Displays the best results for each dimensionality
- Expected to run much faster with only 18 combinations per dimension

In [None]:
# Feature matrix: numeric columns side by side with the frequency-weighted
# package columns; the label is the manufacturer.
X = np.hstack([X_numerical, X_package_freq])
y = df['MANUFACTURER'].values

# NMF requires non-negative input: shift everything up by the global minimum
# when that minimum is negative, otherwise use the matrix unchanged.
global_min = X.min()
X_shifted = X - global_min if global_min < 0 else X

# Reduce to 10 NMF components (max_iter=10000 is the iteration cap, not the
# number of components).
nmf = NMF(n_components=10, init='nndsvd', max_iter=10000, random_state=42)
X_nmf = nmf.fit_transform(X_shifted)

# Grid search for the 2D projection.
print("Optimizing 2D t-SNE with NMF-reduced scaled features...")
best_model_2d, best_params_2d, best_score_2d = optimize_tsne_2d(X_nmf, y, param_grid)
print("\nBest 2D Results:")
print(f"Best Parameters: {best_params_2d}")
print(f"Best Silhouette Score: {best_score_2d:.4f}")

# Grid search for the 3D projection.
print("\nOptimizing 3D t-SNE with NMF-reduced scaled features...")
best_model_3d, best_params_3d, best_score_3d = optimize_tsne_3d(X_nmf, y, param_grid)
print("\nBest 3D Results:")
print(f"Best Parameters: {best_params_3d}")
print(f"Best Silhouette Score: {best_score_3d:.4f}")

## Visualization of Results

This section creates visualizations of the best t-SNE projections:
- 2D scatter plot showing manufacturer clusters
- 3D scatter plot showing manufacturer clusters
- Both colored by MANUFACTURER with silhouette scores in titles
- Uses the best models found from the grid search

In [None]:
# 2D visualization of the best projection found by the grid search.
# Reuse the embedding stored on the winning (already fitted) model: it is the
# one the silhouette score in the title was computed on, and it avoids running
# t-SNE a second time just to draw the figure.
X_2d = best_model_2d.embedding_
plt.figure(figsize=(10, 8))
# NOTE(review): colours are the integer codes from pd.factorize, so the
# colorbar is a numeric scale rather than a per-manufacturer legend.
scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=pd.factorize(y)[0], cmap='viridis')
plt.colorbar(scatter, label='MANUFACTURER')
plt.title(f'2D t-SNE (Silhouette Score: {best_score_2d:.4f})')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

In [None]:
# 3D visualization of the best projection found by the grid search.
# Reuse the embedding stored on the winning (already fitted) model: it is the
# one the silhouette score in the title was computed on, and it avoids running
# t-SNE a second time just to draw the figure.
X_3d = best_model_3d.embedding_
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# NOTE(review): colours are the integer codes from pd.factorize, so the
# colorbar is a numeric scale rather than a per-manufacturer legend.
scatter = ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], c=pd.factorize(y)[0], cmap='viridis')
plt.colorbar(scatter, label='MANUFACTURER')
ax.set_title(f'3D t-SNE (Silhouette Score: {best_score_3d:.4f})')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
plt.show()

In [7]:
# Print the version of the `python` executable found by the shell
# (NOTE(review): this may not be the interpreter the kernel is running on).
!python --version

In [8]:

import os
import time

# Path to your Anaconda site-packages
# NOTE(review): hard-coded to one user's Anaconda install; on a machine
# without this exact path the check below always takes the "not found" branch.
site_packages = r"C:\Users\NAGUI\anaconda3\Lib\site-packages"
tf_dir = os.path.join(site_packages, "tensorflow")

# Heuristic install check: read the package directory's modification time
# twice, five seconds apart, and treat a newer timestamp as "still installing".
if os.path.exists(tf_dir):
    print(f"TensorFlow directory exists at {tf_dir}")
    print(f"Last modified: {time.ctime(os.path.getmtime(tf_dir))}")
    print("Checking if it's still being modified...")
    initial_time = os.path.getmtime(tf_dir)
    time.sleep(5)  # Wait 5 seconds
    new_time = os.path.getmtime(tf_dir)
    if new_time > initial_time:
        print("Directory is being modified—installation likely in progress.")
    else:
        print("No recent changes—installation might be done or stalled.")
else:
    print("TensorFlow directory not found—installation hasn’t started or failed.")

# NOTE(review): redundant - `os` is already imported at the top of this cell.
import os

# Check pip cache for TensorFlow downloads
# Walk the whole cache tree and report every file whose name contains
# "tensorflow" (case-insensitive), with its size in MB and modification time.
cache_dir = os.path.expanduser("~\\AppData\\Local\\pip\\cache")
if os.path.exists(cache_dir):
    for root, dirs, files in os.walk(cache_dir):
        for file in files:
            if "tensorflow" in file.lower():
                file_path = os.path.join(root, file)
                print(f"Found TensorFlow file: {file_path}")
                print(f"Size: {os.path.getsize(file_path) / (1024 * 1024):.2f} MB")
                print(f"Last modified: {time.ctime(os.path.getmtime(file_path))}")
else:
    # Reached only when the cache directory does not exist; an existing cache
    # without matching files prints nothing.
    print("Pip cache not found or empty.")

# Check if pip is still downloading
# Windows-only shell commands: list running processes and keep the lines
# containing "pip".
print("\nRunning processes:")
!tasklist | findstr "pip"