In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_curve,
    auc
)
from sklearn.model_selection import learning_curve

class ComprehensiveStudentPredictor:
    def __init__(self, train_path='train.csv', test_path='test.csv'):
        """
        Initialize the predictor with comprehensive analysis capabilities.

        Args:
            train_path (str): Path to training dataset
            test_path (str): Path to testing dataset
        """

        # Load and prepare data
        self.train_df = pd.read_csv(train_path)
        self.test_df = pd.read_csv(test_path)

        # Tracking variables for model iterations
        self.iteration_results = []

    def prepare_data(self):
        """
        Comprehensive data preparation with detailed preprocessing.

        Returns:
            dict: Prepared datasets and preprocessing details
        """
        # Separate features and target
        X_train = self.train_df.drop(columns=['Target'])
        y_train = self.train_df['Target']
        X_test = self.test_df.copy()

        # Encode target variable if categorical
        self.label_encoder = LabelEncoder()
        y_train = self.label_encoder.fit_transform(y_train)

        # Identify numeric columns
        numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

        # Impute missing values
        imputer = SimpleImputer(strategy='median')
        X_train_imputed = pd.DataFrame(
            imputer.fit_transform(X_train[numeric_cols]),
            columns=numeric_cols,
            index=X_train.index
        )
        X_test_imputed = pd.DataFrame(
            imputer.transform(X_test[numeric_cols]),
            columns=numeric_cols,
            index=X_test.index
        )

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_imputed)
        X_test_scaled = scaler.transform(X_test_imputed)

        return {
            'X_train_scaled': X_train_scaled,
            'y_train': y_train,
            'X_test_scaled': X_test_scaled,
            'feature_names': numeric_cols.tolist()
        }

    def perform_clustering(self, X_train_scaled, n_iterations=4):
        """
        Perform iterative K-means clustering with visualization.

        Args:
            X_train_scaled (np.ndarray): Scaled training features
            n_iterations (int): Number of iterations to explore

        Returns:
            list: Clustering results for each iteration
        """
        clustering_results = []

        plt.figure(figsize=(15, 10))
        plt.suptitle('K-means Clustering Exploration', fontsize=16)

        for i in range(1, n_iterations + 1):
            # Perform K-means clustering
            n_clusters = 3 + i  # Varying number of clusters
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(X_train_scaled)

            # Dimensionality reduction for visualization
            from sklearn.decomposition import PCA
            pca = PCA(n_components=2)
            X_pca = pca.fit_transform(X_train_scaled)

            # Visualization subplot
            plt.subplot(2, 2, i)
            scatter = plt.scatter(
                X_pca[:, 0],
                X_pca[:, 1],
                c=cluster_labels,
                cmap='viridis'
            )
            plt.title(f'Iteration {i}: {n_clusters} Clusters')
            plt.colorbar(scatter)

            # Store clustering details
            clustering_results.append({
                'n_clusters': n_clusters,
                'inertia': kmeans.inertia_,
                'cluster_centers': kmeans.cluster_centers_
            })

        plt.tight_layout()
        plt.savefig('clustering_iterations.png')
        plt.close()

        return clustering_results

    def train_model_iteratively(self, prepared_data, n_iterations=4):
        """
        Train logistic regression iteratively with comprehensive tracking.

        Args:
            prepared_data (dict): Prepared training data
            n_iterations (int): Number of training iterations

        Returns:
            dict: Comprehensive model training results
        """
        X_train_scaled = prepared_data['X_train_scaled']
        y_train = prepared_data['y_train']

        # Perform initial clustering
        clustering_results = self.perform_clustering(X_train_scaled)

        # Learning curves
        plt.figure(figsize=(15, 10))
        plt.suptitle('Learning Curves', fontsize=16)

        for i in range(1, n_iterations + 1):
            # Combine original features with cluster labels
            kmeans = KMeans(n_clusters=3 + i, random_state=42)
            cluster_labels = kmeans.fit_predict(X_train_scaled)
            X_train_clustered = np.column_stack([X_train_scaled, cluster_labels])

            # Train logistic regression
            logreg = LogisticRegression(
                multi_class='ovr',
                max_iter=1000,
                random_state=42
            )
            logreg.fit(X_train_clustered, y_train)

            # Compute learning curve
            train_sizes, train_scores, test_scores = learning_curve(
                logreg,
                X_train_clustered,
                y_train,
                cv=5,
                train_sizes=np.linspace(0.1, 1.0, 5)
            )

            # Visualize learning curve
            plt.subplot(2, 2, i)
            train_mean = np.mean(train_scores, axis=1)
            test_mean = np.mean(test_scores, axis=1)

            plt.plot(train_sizes, train_mean, label='Training Score')
            plt.plot(train_sizes, test_mean, label='Cross-validation Score')
            plt.title(f'Iteration {i}: Learning Curve')
            plt.xlabel('Training Examples')
            plt.ylabel('Score')
            plt.legend()

            # Store iteration results
            self.iteration_results.append({
                'n_clusters': 3 + i,
                'train_accuracy': logreg.score(X_train_clustered, y_train),
                'model': logreg
            })

        plt.tight_layout()
        plt.savefig('learning_curves.png')
        plt.close()

        return self.iteration_results

    def generate_comprehensive_report(self, prepared_data):
        """
        Generate a comprehensive model performance report.

        Args:
            prepared_data (dict): Prepared training data

        Returns:
            dict: Comprehensive model evaluation results
        """
        # Final model selection (typically the last iteration)
        best_model = self.iteration_results[-1]['model']
        X_train_scaled = prepared_data['X_train_scaled']
        y_train = prepared_data['y_train']

        # Compute detailed metrics
        y_pred = best_model.predict(X_train_scaled)

        # Confusion Matrix
        plt.figure(figsize=(10, 8))
        cm = confusion_matrix(y_train, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png')
        plt.close()

        # ROC Curve
        plt.figure(figsize=(10, 8))
        y_pred_proba = best_model.predict_proba(X_train_scaled)
        n_classes = len(np.unique(y_train))

        for i in range(n_classes):
            fpr, tpr, _ = roc_curve(
                (y_train == i).astype(int),
                y_pred_proba[:, i]
            )
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f'ROC curve (class {i}, AUC = {roc_auc:.2f})')

        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")
        plt.tight_layout()
        plt.savefig('roc_curve.png')
        plt.close()

        # Detailed Classification Report
        print("\nDetailed Classification Report:")
        print(classification_report(y_train, y_pred,
            target_names=self.label_encoder.classes_))

        return {
            'confusion_matrix': cm,
            'classification_report': classification_report(y_train, y_pred,
                target_names=self.label_encoder.classes_)
        }

    def run_comprehensive_analysis(self):
        """
        Execute the entire machine learning pipeline with comprehensive analysis.

        Returns:
            dict: Full analysis results
        """
        # Prepare data
        prepared_data = self.prepare_data()

        # Train model iteratively
        self.train_model_iteratively(prepared_data)

        # Generate comprehensive report
        report = self.generate_comprehensive_report(prepared_data)

        print("\nAnalysis Complete. Visualizations saved:")
        print("1. clustering_iterations.png")
        print("2. learning_curves.png")
        print("3. confusion_matrix.png")
        print("4. roc_curve.png")

        return report

# Run the comprehensive analysis
predictor = ComprehensiveStudentPredictor()
results = predictor.run_comprehensive_analysis()



ValueError: X has 37 features, but LogisticRegression is expecting 38 features as input.

In [24]:
import warnings

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

