In [5]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

In [6]:
class PMHighClassifier:
    """
    Cluster-then-label classifier for the PM_HIGH target.

    Features are standardized, grouped with k-means, and every cluster is
    tagged with the rounded mean of the training labels that fall into it.
    A new sample receives the tag of the cluster it is assigned to.

    Note:
        The rounded-mean rule is only a majority vote when the labels are
        binary 0/1. ``np.round`` rounds halves to the nearest even value, so
        an exact 50/50 cluster is tagged 0.
    """

    def __init__(self, n_clusters=5, random_state=42):
        """
        Initialize the classifier with the number of clusters and random state.
        
        Args:
            n_clusters (int): Number of clusters for k-means
            random_state (int): Random seed for reproducibility
        """
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        self.scaler = StandardScaler()
        # Per-cluster label table; stays None until fit() has been called.
        self.cluster_labels = None
        
    def fit(self, X, y):
        """
        Fit the scaler and k-means on X, then label each cluster from y.
        
        Args:
            X (array-like): Training features
            y (array-like): Target labels (PM_HIGH), one per row of X. Lists,
                pandas Series and column vectors are all accepted.

        Returns:
            PMHighClassifier: self, to allow call chaining
        """
        # Scale the features
        X_scaled = self.scaler.fit_transform(X)
        
        # Fit k-means
        self.kmeans.fit(X_scaled)
        
        # Get cluster assignments for training data
        cluster_assignments = self.kmeans.predict(X_scaled)

        # Flatten the labels into a plain 1-D array so that boolean masking
        # works the same for lists, Series and (n, 1) column vectors.
        y_arr = np.asarray(y).ravel()
        
        # Label each cluster with the rounded mean of its members' labels.
        # Clusters that received no training sample keep the default 0.
        self.cluster_labels = np.zeros(self.n_clusters)
        for cluster in range(self.n_clusters):
            mask = cluster_assignments == cluster
            if np.any(mask):
                self.cluster_labels[cluster] = np.round(np.mean(y_arr[mask]))
        
        return self
    
    def predict(self, X):
        """
        Predict PM_HIGH values for new data.
        
        Args:
            X (array-like): Features to predict
            
        Returns:
            array: Predicted labels (float array, one entry per row of X)
        """
        # Scale the features with the statistics learned in fit()
        X_scaled = self.scaler.transform(X)
        
        # Get cluster assignments
        cluster_assignments = self.kmeans.predict(X_scaled)
        
        # Look up each sample's cluster in the per-cluster label table
        predictions = self.cluster_labels[cluster_assignments]
        
        return predictions
    
    def score(self, X, y):
        """
        Calculate accuracy score for the classifier.
        
        Args:
            X (array-like): Features
            y (array-like): True labels, one per row of X
            
        Returns:
            float: Fraction of samples whose prediction equals the true label
        """
        predictions = self.predict(X)
        # Flatten so a column-vector y cannot broadcast against the 1-D
        # predictions into an (n, n) comparison.
        y_arr = np.asarray(y).ravel()
        return np.mean(predictions == y_arr)

In [7]:
def prepare_data(data_path, target_col='PM_HIGH'):
    """
    Load a labeled CSV file and split it into features and target.
    
    Args:
        data_path (str): Path to the CSV file
        target_col (str): Name of the label column. Defaults to 'PM_HIGH'.
        
    Returns:
        tuple: (X, y) where X is a DataFrame holding every column except
        ``target_col`` and y is the ``target_col`` Series

    Raises:
        KeyError: If ``target_col`` is not a column of the file
    """
    # Read data
    df = pd.read_csv(data_path)

    # Fail with a message that names both the column and the file, rather
    # than the bare column-name KeyError pandas would raise.
    if target_col not in df.columns:
        raise KeyError(f"Column {target_col!r} not found in {data_path}")
    
    # Separate features and target
    y = df[target_col]
    X = df.drop(columns=[target_col])
    
    return X, y

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """
    Score a fitted model on the training split and on the test split.
    
    Args:
        model: Trained classifier exposing ``score(X, y)``
        X_train, X_test: Feature sets for the two splits
        y_train, y_test: Matching labels for the two splits
        
    Returns:
        tuple: (training score, test score), in that order
    """
    # The training split is scored first, then the test split.
    splits = ((X_train, y_train), (X_test, y_test))
    return tuple(model.score(features, labels) for features, labels in splits)

In [8]:
# Data files used in this run; edit here to change the cities.
TRAIN_CSV = 'Cities/Beijing_labeled.csv'
TEST_CSV = 'Cities/Guangzhou_labeled.csv'

# Initialize classifier
classifier = PMHighClassifier(n_clusters=5)

# Load training data (Beijing only).
# NOTE(review): only the Beijing file is read; if Shenyang was meant to be
# part of the training set it still needs to be loaded — confirm.
X_train, y_train = prepare_data(TRAIN_CSV)

# Load test data (Guangzhou only).
# NOTE(review): only the Guangzhou file is read; if Shanghai was meant to be
# part of the test set it still needs to be loaded — confirm.
X_test, y_test = prepare_data(TEST_CSV)

# Train the model
classifier.fit(X_train, y_train)

# Evaluate the model on the data it was fitted on and on the held-out city
train_acc, test_acc = evaluate_model(
    classifier, X_train, X_test, y_train, y_test
)

print(f"Training accuracy: {train_acc:.3f}")
print(f"Test accuracy: {test_acc:.3f}")

Training accuracy: 0.687
Test accuracy: 0.936
