In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

class BreastCancerAnalysis:
    """Benchmark SVM, Random Forest and KNN classifiers on a breast-cancer CSV file.

    ``load_data`` reads, cleans and standardises the features;
    ``train_and_evaluate`` performs one train/test split and scores the three
    classifiers on it.
    """

    def __init__(self, random_state=42):
        """Store the base seed and create the scaler used by ``load_data``.

        Args:
            random_state: Base seed; each trial adds its trial number to it for
                the train/test split, and the seeded models use it directly.
        """
        self.random_state = random_state
        self.scaler = StandardScaler()

    def load_data(self, filepath):
        """Read the headerless data file and return scaled features and binary labels.

        Rows containing the ``'?'`` missing-value marker are dropped. The
        remaining rows are renumbered 0..n-1 so that the returned feature
        frame and label series share the same index (previously the labels
        kept the gapped pre-drop index while the features were renumbered).

        Args:
            filepath: Path to the comma-separated file with 11 columns
                (id, nine features, class).

        Returns:
            tuple: ``(X_scaled, y)`` where ``X_scaled`` is a DataFrame of the
            nine scaled feature columns and ``y`` is a Series with 1 for class
            value 4 and 0 otherwise.
        """
        # Define column names
        columns = ['id', 'clump_thickness', 'cell_size_uniformity',
                  'cell_shape_uniformity', 'marginal_adhesion',
                  'epithelial_cell_size', 'bare_nuclei', 'bland_chromatin',
                  'normal_nucleoli', 'mitoses', 'class']

        # Read the data
        df = pd.read_csv(filepath, header=None, names=columns)

        # Handle missing values: '?' -> NaN, drop incomplete rows, then
        # renumber so features and labels stay index-aligned after the drop.
        df = df.replace('?', np.nan).dropna().reset_index(drop=True)

        # Convert to numeric (a column that contained '?' is read as text)
        for col in df.columns:
            if col != 'id':
                df[col] = pd.to_numeric(df[col])

        # Convert class to binary (2: benign -> 0, 4: malignant -> 1)
        df['class'] = (df['class'] == 4).astype(int)

        # Separate features and target
        X = df.drop(['id', 'class'], axis=1)
        y = df['class']

        # Scale features.
        # NOTE(review): the scaler is fit on every row before any train/test
        # split happens in train_and_evaluate, so held-out rows influence the
        # scaling statistics.
        X_scaled = pd.DataFrame(
            self.scaler.fit_transform(X), columns=X.columns, index=X.index
        )

        return X_scaled, y

    def train_and_evaluate(self, X, y, train_ratio, trial):
        """Split once, fit the three classifiers and report their scores.

        Prints a classification report per model (plus feature importances for
        the Random Forest) as a side effect.

        Args:
            X: Feature DataFrame (its columns label the feature importances).
            y: Target labels aligned with ``X``.
            train_ratio: Fraction of rows used for training; the test fraction
                is ``1 - train_ratio``.
            trial: Trial number, added to ``random_state`` to vary the split.

        Returns:
            pandas.DataFrame: One row per model with columns ``Model``,
            ``Train Ratio``, ``Train Score``, ``Test Score``, ``CV Score``
            and ``Trial``.
        """
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=(1-train_ratio), random_state=self.random_state+trial
        )

        models = {
            'SVM': SVC(random_state=self.random_state),
            'Random Forest': RandomForestClassifier(random_state=self.random_state),
            'KNN': KNeighborsClassifier()
        }

        results = []
        for name, model in models.items():
            # Train model
            model.fit(X_train, y_train)

            # Calculate scores; the CV score is the mean over 5 folds of the
            # training split only.
            train_score = model.score(X_train, y_train)
            test_score = model.score(X_test, y_test)
            cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))

            # Store results
            results.append({
                'Model': name,
                'Train Ratio': train_ratio,
                'Train Score': train_score,
                'Test Score': test_score,
                'CV Score': cv_score,
                'Trial': trial
            })

            # Print detailed results for this model
            print(f"\nResults for {name} (Trial {trial}, Train Ratio {train_ratio}):")
            y_pred = model.predict(X_test)
            print(classification_report(y_test, y_pred))

            # Print feature importance for Random Forest
            if name == 'Random Forest':
                importance = pd.DataFrame({
                    'Feature': X.columns,
                    'Importance': model.feature_importances_
                }).sort_values('Importance', ascending=False)
                print("\nFeature Importance:")
                print(importance)

        return pd.DataFrame(results)

def main():
    """Run the breast-cancer benchmark and print the mean scores.

    Evaluates three train ratios with three trials each, then averages the
    train, test and CV scores per (model, train ratio) pair.
    """
    analysis = BreastCancerAnalysis()
    X, y = analysis.load_data('breast-cancer-wisconsin.data')

    # One result frame per (train ratio, trial) pair, ratios in the outer loop.
    per_run_frames = [
        analysis.train_and_evaluate(X, y, ratio, trial_idx)
        for ratio in [0.2, 0.5, 0.8]
        for trial_idx in range(3)
    ]

    final_results = pd.concat(per_run_frames)
    score_columns = ['Train Score', 'Test Score', 'CV Score']
    print("\nAverage Results:")
    grouped = final_results.groupby(['Model', 'Train Ratio'])
    print(grouped[score_columns].mean())

if __name__ == "__main__":
    main()


Results for SVM (Trial 0, Train Ratio 0.2):
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       359
           1       0.93      0.98      0.96       188

    accuracy                           0.97       547
   macro avg       0.96      0.97      0.97       547
weighted avg       0.97      0.97      0.97       547


Results for Random Forest (Trial 0, Train Ratio 0.2):
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       359
           1       0.95      0.97      0.96       188

    accuracy                           0.97       547
   macro avg       0.97      0.97      0.97       547
weighted avg       0.97      0.97      0.97       547


Feature Importance:
                 Feature  Importance
1   cell_size_uniformity    0.289088
2  cell_shape_uniformity    0.265420
4   epithelial_cell_size    0.118238
5            bare_nuclei    0.117994
3      marginal_adhesion    0.082844
6  

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

class AbaloneAnalysis:
    """Benchmark SVM, Random Forest and KNN on a binarised abalone ring count."""

    def __init__(self, random_state=42):
        """Keep the base seed and create the scaler and the ``Sex`` encoder."""
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.le = LabelEncoder()

    def load_data(self, filepath):
        """Read the headerless abalone file and return scaled features and labels.

        Args:
            filepath: Path to the comma-separated file with nine columns.

        Returns:
            tuple: ``(features, target)`` where ``features`` is a DataFrame of
            the eight scaled predictor columns (``Sex`` integer-encoded first)
            and ``target`` is 1 when ``Rings > 10`` and 0 otherwise.
        """
        header = ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight',
                  'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings']
        frame = pd.read_csv(filepath, header=None, names=header)

        # Categorical Sex becomes integer codes before scaling.
        frame['Sex'] = self.le.fit_transform(frame['Sex'])

        # Binary target: more than 10 rings -> 1, otherwise 0.
        frame['Class'] = (frame['Rings'] > 10).astype(int)

        predictors = frame.drop(['Rings', 'Class'], axis=1)
        target = frame['Class']

        scaled_values = self.scaler.fit_transform(predictors)
        return pd.DataFrame(scaled_values, columns=predictors.columns), target

    def train_and_evaluate(self, X, y, train_ratio, trial):
        """Split once, fit the three classifiers and collect their scores.

        Prints a classification report for each model and, for the Random
        Forest, its feature importances.

        Args:
            X: Feature DataFrame.
            y: Target labels aligned with ``X``.
            train_ratio: Fraction of rows used for training.
            trial: Trial number, added to ``random_state`` for the split seed.

        Returns:
            pandas.DataFrame: One row per model with the train, test and mean
            5-fold CV scores.
        """
        seed = self.random_state
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=(1-train_ratio), random_state=seed+trial
        )

        candidates = [
            ('SVM', SVC(random_state=seed)),
            ('Random Forest', RandomForestClassifier(random_state=seed)),
            ('KNN', KNeighborsClassifier()),
        ]

        rows = []
        for label, clf in candidates:
            clf.fit(X_train, y_train)

            # Scores are evaluated in the order train, test, cross-validation.
            rows.append({
                'Model': label,
                'Train Ratio': train_ratio,
                'Train Score': clf.score(X_train, y_train),
                'Test Score': clf.score(X_test, y_test),
                'CV Score': np.mean(cross_val_score(clf, X_train, y_train, cv=5)),
                'Trial': trial,
            })

            print(f"\nResults for {label} (Trial {trial}, Train Ratio {train_ratio}):")
            print(classification_report(y_test, clf.predict(X_test)))

            # Only the Random Forest gets a feature-importance table.
            if label != 'Random Forest':
                continue
            ranking = pd.DataFrame({
                'Feature': X.columns,
                'Importance': clf.feature_importances_,
            })
            ranking = ranking.sort_values('Importance', ascending=False)
            print("\nFeature Importance:")
            print(ranking)

        return pd.DataFrame(rows)

def main():
    """Run the abalone benchmark and print the mean scores.

    Evaluates three train ratios with three trials each, then averages the
    train, test and CV scores per (model, train ratio) pair.
    """
    analysis = AbaloneAnalysis()
    X, y = analysis.load_data('abalone.data')

    # Ratios form the outer loop, trials the inner loop.
    collected = []
    for ratio in (0.2, 0.5, 0.8):
        collected.extend(
            analysis.train_and_evaluate(X, y, ratio, trial_idx)
            for trial_idx in range(3)
        )

    final_results = pd.concat(collected)
    print("\nAverage Results:")
    by_model_and_ratio = final_results.groupby(['Model', 'Train Ratio'])
    print(by_model_and_ratio[['Train Score', 'Test Score', 'CV Score']].mean())

if __name__ == "__main__":
    main()


Results for SVM (Trial 0, Train Ratio 0.2):
              precision    recall  f1-score   support

           0       0.77      0.91      0.84      2182
           1       0.75      0.49      0.59      1160

    accuracy                           0.77      3342
   macro avg       0.76      0.70      0.71      3342
weighted avg       0.76      0.77      0.75      3342


Results for Random Forest (Trial 0, Train Ratio 0.2):
              precision    recall  f1-score   support

           0       0.80      0.87      0.83      2182
           1       0.70      0.58      0.63      1160

    accuracy                           0.77      3342
   macro avg       0.75      0.72      0.73      3342
weighted avg       0.76      0.77      0.76      3342


Feature Importance:
          Feature  Importance
7    Shell_weight    0.197486
5  Shucked_weight    0.168504
4    Whole_weight    0.138465
3          Height    0.121815
2        Diameter    0.117302
6  Viscera_weight    0.115785
1          Leng

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

class MushroomAnalysis:
    """Benchmark SVM, Random Forest and KNN on the label-encoded mushroom data."""

    def __init__(self, random_state=42):
        """Store the base seed and prepare the per-column encoder registry.

        Args:
            random_state: Base seed; each trial adds its trial number to it for
                the train/test split, and the seeded models use it directly.
        """
        self.random_state = random_state
        # Maps feature column name -> the LabelEncoder fitted on that column.
        self.label_encoders = {}

    def load_data(self, filepath):
        """Read the headerless mushroom file and return encoded features and labels.

        The ``'?'`` placeholder in ``stalk-root`` is replaced by the most
        frequent *known* value. The mode is computed over the non-``'?'``
        entries only, so the imputation still works when ``'?'`` itself is
        the most common value. If the column holds nothing but ``'?'`` it is
        left unchanged.

        Args:
            filepath: Path to the comma-separated file with 23 columns
                (class followed by 22 categorical features).

        Returns:
            tuple: ``(X_encoded, y)`` where ``X_encoded`` holds one
            integer-encoded column per feature and ``y`` is 1 for class
            ``'p'`` and 0 otherwise.
        """
        # Define column names
        columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
                  'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
                  'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
                  'stalk-surface-below-ring', 'stalk-color-above-ring',
                  'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
                  'ring-type', 'spore-print-color', 'population', 'habitat']

        # Read the data
        df = pd.read_csv(filepath, header=None, names=columns)

        # Handle missing values in stalk-root: exclude the '?' marker when
        # picking the fill value, otherwise a '?'-dominated column would be
        # "imputed" with '?' and nothing would change.
        stalk_root = df['stalk-root']
        known_modes = stalk_root[stalk_root != '?'].mode()
        if not known_modes.empty:
            df['stalk-root'] = stalk_root.replace('?', known_modes.iloc[0])

        # Convert target (e: edible -> 0, p: poisonous -> 1)
        df['class'] = (df['class'] == 'p').astype(int)

        # Encode all categorical variables
        X = df.drop('class', axis=1)
        y = df['class']

        # Apply label encoding to all columns, keeping each fitted encoder
        X_encoded = X.copy()
        for column in X.columns:
            le = LabelEncoder()
            X_encoded[column] = le.fit_transform(X[column])
            self.label_encoders[column] = le

        return X_encoded, y

    def train_and_evaluate(self, X, y, train_ratio, trial):
        """Split once, fit the three classifiers and report their scores.

        Prints a classification report per model (plus feature importances for
        the Random Forest) as a side effect.

        Args:
            X: Feature DataFrame (its columns label the feature importances).
            y: Target labels aligned with ``X``.
            train_ratio: Fraction of rows used for training; the test fraction
                is ``1 - train_ratio``.
            trial: Trial number, added to ``random_state`` to vary the split.

        Returns:
            pandas.DataFrame: One row per model with columns ``Model``,
            ``Train Ratio``, ``Train Score``, ``Test Score``, ``CV Score``
            and ``Trial``.
        """
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=(1-train_ratio), random_state=self.random_state+trial
        )

        models = {
            'SVM': SVC(random_state=self.random_state),
            'Random Forest': RandomForestClassifier(random_state=self.random_state),
            'KNN': KNeighborsClassifier()
        }

        results = []
        for name, model in models.items():
            # Train model
            model.fit(X_train, y_train)

            # Calculate scores; the CV score is the mean over 5 folds of the
            # training split only.
            train_score = model.score(X_train, y_train)
            test_score = model.score(X_test, y_test)
            cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))

            # Store results
            results.append({
                'Model': name,
                'Train Ratio': train_ratio,
                'Train Score': train_score,
                'Test Score': test_score,
                'CV Score': cv_score,
                'Trial': trial
            })

            # Print detailed results for this model
            print(f"\nResults for {name} (Trial {trial}, Train Ratio {train_ratio}):")
            y_pred = model.predict(X_test)
            print(classification_report(y_test, y_pred))

            # Print feature importance for Random Forest
            if name == 'Random Forest':
                importance = pd.DataFrame({
                    'Feature': X.columns,
                    'Importance': model.feature_importances_
                }).sort_values('Importance', ascending=False)
                print("\nFeature Importance:")
                print(importance)

        return pd.DataFrame(results)

def main():
    """Run the mushroom benchmark and print the mean scores.

    Evaluates three train ratios with three trials each, then averages the
    train, test and CV scores per (model, train ratio) pair.
    """
    analysis = MushroomAnalysis()
    X, y = analysis.load_data('agaricus-lepiota.data')

    # Enumerate every (ratio, trial) pair up front, ratios varying slowest.
    schedule = [(ratio, trial_idx) for ratio in [0.2, 0.5, 0.8] for trial_idx in range(3)]

    all_results = []
    for ratio, trial_idx in schedule:
        all_results.append(analysis.train_and_evaluate(X, y, ratio, trial_idx))

    final_results = pd.concat(all_results)
    metrics = ['Train Score', 'Test Score', 'CV Score']
    print("\nAverage Results:")
    print(final_results.groupby(['Model', 'Train Ratio'])[metrics].mean())

if __name__ == "__main__":
    main()


Results for SVM (Trial 0, Train Ratio 0.2):
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3362
           1       0.97      0.94      0.96      3138

    accuracy                           0.96      6500
   macro avg       0.96      0.96      0.96      6500
weighted avg       0.96      0.96      0.96      6500


Results for Random Forest (Trial 0, Train Ratio 0.2):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3362
           1       1.00      1.00      1.00      3138

    accuracy                           1.00      6500
   macro avg       1.00      1.00      1.00      6500
weighted avg       1.00      1.00      1.00      6500


Feature Importance:
                     Feature  Importance
4                       odor    0.158609
19         spore-print-color    0.126338
7                  gill-size    0.119140
8                 gill-color    0.119067
20                po

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

class ModelAnalysis:
    """Grid-search SVM, Random Forest and KNN hyperparameters on three datasets.

    The ``process_*`` methods each load one dataset and return ``(X, y)``;
    ``perform_grid_search`` tunes one model family on such a pair.
    """

    def __init__(self, random_state=42):
        """Store the seed and create the scaler and encoder the loaders share.

        Args:
            random_state: Seed passed to the SVM and Random Forest estimators.
        """
        self.random_state = random_state
        # Both objects are refit by each loader that uses them.
        self.scaler = StandardScaler()
        self.le = LabelEncoder()

    def perform_grid_search(self, X, y, model_type):
        """Perform grid search for hyperparameter tuning.

        Prints the best parameters, the best cross-validation score and a
        classification report computed on the same data the search was
        fitted on.

        Args:
            X: Feature matrix.
            y: Target labels.
            model_type: ``'SVM'`` or ``'RF'``; any other value (including
                ``'KNN'``) selects the KNN grid.

        Returns:
            tuple: ``(best_estimator, best_params, best_score)`` taken from
            the fitted ``GridSearchCV``.
        """
        if model_type == 'SVM':
            param_grid = {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'linear'],
                'gamma': ['scale', 'auto']
            }
            model = SVC(random_state=self.random_state)

        elif model_type == 'RF':
            param_grid = {
                'n_estimators': [100, 200],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5]
            }
            model = RandomForestClassifier(random_state=self.random_state)

        else:  # KNN (also the fallback for unrecognised model_type values)
            param_grid = {
                'n_neighbors': [3, 5, 7, 9],
                'weights': ['uniform', 'distance']
            }
            model = KNeighborsClassifier()

        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X, y)

        print(f"\nBest parameters for {model_type}:")
        print(grid_search.best_params_)
        print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

        # Test the best model on the data it was tuned on (training performance,
        # not a held-out estimate).
        y_pred = grid_search.predict(X)
        print("\nTraining Performance:")
        print(classification_report(y, y_pred))

        return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

    def process_breast_cancer_data(self, filepath):
        """Load the breast-cancer file; return scaled features and binary labels.

        Rows containing the ``'?'`` marker are dropped. ``X`` is returned as
        the array produced by the scaler, ``y`` as a Series with 1 for class
        value 4 and 0 otherwise.
        """
        columns = ['id', 'clump_thickness', 'cell_size_uniformity',
                  'cell_shape_uniformity', 'marginal_adhesion',
                  'epithelial_cell_size', 'bare_nuclei', 'bland_chromatin',
                  'normal_nucleoli', 'mitoses', 'class']

        df = pd.read_csv(filepath, header=None, names=columns)
        df = df.replace('?', np.nan).dropna()

        # A column that contained '?' is read as text; make everything numeric.
        for col in df.columns:
            if col != 'id':
                df[col] = pd.to_numeric(df[col])

        df['class'] = (df['class'] == 4).astype(int)
        X = df.drop(['id', 'class'], axis=1)
        y = df['class']

        X_scaled = self.scaler.fit_transform(X)
        return X_scaled, y

    def process_abalone_data(self, filepath):
        """Load the abalone file; return scaled features and binary labels.

        ``Sex`` is integer-encoded before scaling. The label is 1 when
        ``Rings > 10`` and 0 otherwise.
        """
        columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight',
                  'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings']

        df = pd.read_csv(filepath, header=None, names=columns)
        df['Sex'] = self.le.fit_transform(df['Sex'])
        df['Class'] = (df['Rings'] > 10).astype(int)

        X = df.drop(['Rings', 'Class'], axis=1)
        y = df['Class']

        X_scaled = self.scaler.fit_transform(X)
        return X_scaled, y

    def process_mushroom_data(self, filepath):
        """Load the mushroom file; return label-encoded features and binary labels.

        The ``'?'`` placeholder in ``stalk-root`` is replaced by the most
        frequent *known* value; the mode is taken over non-``'?'`` entries so
        the imputation still works when ``'?'`` is the most common value. If
        the column holds nothing but ``'?'`` it is left unchanged. The label
        is 1 for class ``'p'`` and 0 otherwise. Features are not scaled.
        """
        columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
                  'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
                  'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
                  'stalk-surface-below-ring', 'stalk-color-above-ring',
                  'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
                  'ring-type', 'spore-print-color', 'population', 'habitat']

        df = pd.read_csv(filepath, header=None, names=columns)

        # Exclude the '?' marker when choosing the fill value; otherwise a
        # '?'-dominated column would be replaced with '?' and stay unchanged.
        stalk_root = df['stalk-root']
        known_modes = stalk_root[stalk_root != '?'].mode()
        if not known_modes.empty:
            df['stalk-root'] = stalk_root.replace('?', known_modes.iloc[0])

        df['class'] = (df['class'] == 'p').astype(int)

        # Encode all categorical variables
        X = df.drop('class', axis=1)
        y = df['class']

        # One fresh encoder per column; the fitted encoders are not kept.
        X_encoded = pd.DataFrame(
            {column: LabelEncoder().fit_transform(X[column]) for column in X.columns},
            index=X.index,
        )

        return X_encoded, y

def main():
    """Grid-search all three model families on each dataset and print a summary.

    For every dataset: print a banner, load the data with its loader, tune
    SVM, RF and KNN in that order, then print a per-dataset table of the best
    parameters and scores.
    """
    analysis = ModelAnalysis()

    # (display name, data file, loader) in processing order.
    dataset_specs = (
        ('Breast Cancer', 'breast-cancer-wisconsin.data', analysis.process_breast_cancer_data),
        ('Abalone', 'abalone.data', analysis.process_abalone_data),
        ('Mushroom', 'agaricus-lepiota.data', analysis.process_mushroom_data),
    )

    for dataset_name, filepath, loader in dataset_specs:
        print(f"\n{'='*50}")
        print(f"Processing {dataset_name} Dataset")
        print('='*50)

        X, y = loader(filepath)

        summary_rows = []
        for model_type in ('SVM', 'RF', 'KNN'):
            print(f"\nPerforming grid search for {model_type}")
            # The fitted estimator itself is not needed for the summary.
            _, best_params, best_score = analysis.perform_grid_search(X, y, model_type)
            summary_rows.append({
                'Dataset': dataset_name,
                'Model': model_type,
                'Best Parameters': best_params,
                'Best Score': best_score,
            })

        print("\nSummary of results:")
        print(pd.DataFrame(summary_rows))

if __name__ == "__main__":
    main()


Processing Breast Cancer Dataset

Performing grid search for SVM
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best parameters for SVM:
{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation score: 0.9693

Training Performance:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       444
           1       0.95      0.97      0.96       239

    accuracy                           0.97       683
   macro avg       0.97      0.97      0.97       683
weighted avg       0.97      0.97      0.97       683


Performing grid search for RF
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best parameters for RF:
{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation score: 0.9664

Training Performance:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       444
           1       0.98      1.00      0.99       239

    accur