## 随机森林

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib
import glob

def extract_features_from_window(window):
    """Compute per-column pulse statistics for one window of sensor readings.

    A sample belongs to a "pulse" when it is strictly greater than the mean
    of its column within the window.  For each column five values are
    appended, in this order: max, min, mean and population standard deviation
    of the pulse samples, then ``number_of_pulse_samples / len(column)``.

    The last above-mean sample of every column is left out of all five
    values.  Earlier revisions dropped it to align the heights with
    ``np.diff`` of their index; the resulting duration/area arrays were never
    used (and made the function fail on non-numeric indexes), so they are no
    longer computed.  The truncation itself is kept so that feature values
    stay identical to what previously trained models were fitted on.

    Args:
        window: DataFrame with numeric columns and at least one row.

    Returns:
        Flat list with five entries per column.  A column with fewer than two
        above-mean samples contributes ``[0, 0, 0, 0, 0.0]``.

    Raises:
        ZeroDivisionError: if the window has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        above_mean = sensor_data[sensor_data > sensor_data.mean()]
        # Keep the historical behaviour: the final pulse sample is excluded.
        heights = above_mean.values[:-1]

        if len(heights) > 0:
            pulse_stats = [np.max(heights), np.min(heights), np.mean(heights), np.std(heights)]
        else:
            pulse_stats = [0, 0, 0, 0]
        pulse_frequency = len(heights) / len(sensor_data)

        features.extend([*pulse_stats, pulse_frequency])

    return features

def prepare_dataset_from_windows(df, state, window_size=89):
    """Cut ``df`` into consecutive, non-overlapping windows and featurize each.

    Windows start at rows 0, ``window_size``, ``2 * window_size``, ...; rows
    left over after the last complete window are ignored.

    Args:
        df: DataFrame to slice by row position.
        state: Label attached to every window taken from ``df``.
        window_size: Number of rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one feature row per window and a
        label vector of the same length.  Both are empty when ``df`` has
        fewer than ``window_size`` rows.
    """
    last_start = len(df) - window_size
    rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in range(0, last_start + 1, window_size)
    ]
    labels = [state] * len(rows)
    return np.array(rows), np.array(labels)

def load_and_process_files(file_paths, window_size=89):
    """Build one feature matrix and label vector from a set of CSV files.

    The label ("state") of a file is the part of its base name before the
    first underscore.  Each file is read with ``pd.read_csv``, its columns
    are renamed ``s0, s1, ...`` and it is windowed via
    ``prepare_dataset_from_windows``.

    Files that cannot be read are reported and skipped.  Files too short to
    yield a single complete window are skipped as well; previously their
    empty result was passed on to ``np.vstack`` together with the real
    feature matrices, which fails on the shape mismatch.

    Args:
        file_paths: Iterable of CSV paths.
        window_size: Rows per window, forwarded to the windowing helper.

    Returns:
        ``(X, y)`` with all windows stacked row-wise and their labels, or
        ``(None, None)`` when no file contributed any window.
    """
    import os

    all_X = []
    all_y = []

    for file_path in file_paths:
        # basename() understands the platform's separators, unlike split('/').
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Best effort: one unreadable file must not abort the whole run.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        df.columns = [f's{i}' for i in range(len(df.columns))]

        X, y = prepare_dataset_from_windows(df, state, window_size)

        if len(X) == 0:
            print(f"Skipping {file_path}: no complete window of {window_size} rows.")
            continue

        all_X.append(X)
        all_y.append(y)

    if not all_X:
        print("No data to process.")
        return None, None

    return np.vstack(all_X), np.concatenate(all_y)

def main():
    """Train, tune and save a random-forest classifier on windowed CSV data.

    Collects every ``*.csv`` in the working directory, converts them to
    windowed feature vectors, holds out 20% of the samples, standardizes the
    features, grid-searches a ``RandomForestClassifier`` with 5-fold
    cross-validation, prints a classification report for the held-out part
    and writes the best estimator to ``best_random_forest_model.pkl``.

    The scaler is fitted on the training part only.  Fitting it on the full
    data set before splitting (as was done before) lets test-set statistics
    influence the training features.

    Returns:
        None.  Results are printed and the model is written to disk.
    """
    file_paths = glob.glob('*.csv')  # Adjust the path as necessary
    window_size = 89  # Set the window size to 89 or 90 based on your preference

    # Load and process files
    all_X, all_y = load_and_process_files(file_paths, window_size)

    if all_X is None or all_y is None:
        print("No data to process.")
        return

    print("Feature extraction and dataset preparation completed.")
    print(f"Total samples: {len(all_y)}")

    # Split first so that nothing fitted below has seen the test samples.
    X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=0.2, random_state=42)

    # Standardize with training statistics, then apply them to the test part.
    # NOTE(review): the fitted scaler is not saved with the model; any code
    # that loads the pickle must standardize its inputs consistently.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Hyper-parameter search space
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [10, 20, 30, None],
        'criterion': ['gini', 'entropy']
    }

    clf = RandomForestClassifier()
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Estimator refitted on the whole training part with the best parameters
    best_clf = grid_search.best_estimator_

    # Predict and evaluate on the held-out part
    y_pred = best_clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(f"Best Parameters: {grid_search.best_params_}")

    # Save the model
    joblib.dump(best_clf, 'best_random_forest_model.pkl')
    print("Model saved as 'best_random_forest_model.pkl'")

# Run the pipeline right away: main() is invoked unconditionally (no __name__ guard).
main()


## 交叉验证和模型保存

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib
import glob

def extract_features_from_window(window):
    """Compute per-column pulse statistics for one window of sensor readings.

    A sample belongs to a "pulse" when it is strictly greater than the mean
    of its column within the window.  For each column five values are
    appended, in this order: max, min, mean and population standard deviation
    of the pulse samples, then ``number_of_pulse_samples / len(column)``.

    The last above-mean sample of every column is left out of all five
    values.  Earlier revisions dropped it to align the heights with
    ``np.diff`` of their index; the resulting duration/area arrays were never
    used (and made the function fail on non-numeric indexes), so they are no
    longer computed.  The truncation itself is kept so that feature values
    stay identical to what previously trained models were fitted on.

    Args:
        window: DataFrame with numeric columns and at least one row.

    Returns:
        Flat list with five entries per column.  A column with fewer than two
        above-mean samples contributes ``[0, 0, 0, 0, 0.0]``.

    Raises:
        ZeroDivisionError: if the window has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        above_mean = sensor_data[sensor_data > sensor_data.mean()]
        # Keep the historical behaviour: the final pulse sample is excluded.
        heights = above_mean.values[:-1]

        if len(heights) > 0:
            pulse_stats = [np.max(heights), np.min(heights), np.mean(heights), np.std(heights)]
        else:
            pulse_stats = [0, 0, 0, 0]
        pulse_frequency = len(heights) / len(sensor_data)

        features.extend([*pulse_stats, pulse_frequency])

    return features

def prepare_dataset_from_windows(df, state, window_size=89):
    """Cut ``df`` into consecutive, non-overlapping windows and featurize each.

    Windows start at rows 0, ``window_size``, ``2 * window_size``, ...; rows
    left over after the last complete window are ignored.

    Args:
        df: DataFrame to slice by row position.
        state: Label attached to every window taken from ``df``.
        window_size: Number of rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one feature row per window and a
        label vector of the same length.  Both are empty when ``df`` has
        fewer than ``window_size`` rows.
    """
    last_start = len(df) - window_size
    rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in range(0, last_start + 1, window_size)
    ]
    labels = [state] * len(rows)
    return np.array(rows), np.array(labels)

def load_and_process_files(file_paths, window_size=89):
    """Build one feature matrix and label vector from a set of CSV files.

    The label ("state") of a file is the part of its base name before the
    first underscore.  Each file is read with ``pd.read_csv``, its columns
    are renamed ``s0, s1, ...`` and it is windowed via
    ``prepare_dataset_from_windows``.

    Files that cannot be read are reported and skipped.  Files too short to
    yield a single complete window are skipped as well; previously their
    empty result was passed on to ``np.vstack`` together with the real
    feature matrices, which fails on the shape mismatch.

    Args:
        file_paths: Iterable of CSV paths.
        window_size: Rows per window, forwarded to the windowing helper.

    Returns:
        ``(X, y)`` with all windows stacked row-wise and their labels, or
        ``(None, None)`` when no file contributed any window.
    """
    import os

    all_X = []
    all_y = []

    for file_path in file_paths:
        # basename() understands the platform's separators, unlike split('/').
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Best effort: one unreadable file must not abort the whole run.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        df.columns = [f's{i}' for i in range(len(df.columns))]

        X, y = prepare_dataset_from_windows(df, state, window_size)

        if len(X) == 0:
            print(f"Skipping {file_path}: no complete window of {window_size} rows.")
            continue

        all_X.append(X)
        all_y.append(y)

    if not all_X:
        print("No data to process.")
        return None, None

    return np.vstack(all_X), np.concatenate(all_y)

def main():
    """Tune, cross-validate, evaluate and save a random-forest classifier.

    Collects every ``*.csv`` in the working directory and converts them to
    windowed feature vectors, then:

    1. holds out 20% of the samples as a test set,
    2. standardizes features with statistics of the training part,
    3. grid-searches a ``RandomForestClassifier`` (5-fold CV) on the
       training part,
    4. reports 5-fold cross-validation scores of the best configuration on
       the training part,
    5. prints a classification report for the held-out test set,
    6. writes the best estimator to ``best_random_forest_model.pkl``.

    The split is done before scaling and before the grid search.  In the
    earlier ordering both steps were run on the full data set, so the test
    samples had already influenced the scaler and the chosen
    hyper-parameters by the time they were used for evaluation.

    Returns:
        None.  Results are printed and the model is written to disk.
    """
    file_paths = glob.glob('*.csv')  # Adjust the path as necessary
    window_size = 89  # Set the window size to 89 or 90 based on your preference

    # Load and process files
    all_X, all_y = load_and_process_files(file_paths, window_size)

    if all_X is None or all_y is None:
        print("No data to process.")
        return

    print("Feature extraction and dataset preparation completed.")
    print(f"Total samples: {len(all_y)}")

    # Hold out the test set before anything is fitted.
    X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=0.2, random_state=42)

    # Standardize with training statistics only.
    # NOTE(review): the fitted scaler is not saved with the model; any code
    # that loads the pickle must standardize its inputs consistently.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Hyper-parameter search space
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [10, 20, 30, None],
        'criterion': ['gini', 'entropy']
    }

    clf = RandomForestClassifier()
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # With the default refit, this estimator is already trained on all of
    # X_train using the best parameters, so no extra fit() call is needed.
    best_clf = grid_search.best_estimator_

    # Cross-validation of the selected configuration (training part only)
    cv_scores = cross_val_score(best_clf, X_train, y_train, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean cross-validation score: {np.mean(cv_scores)}")

    # Predict and evaluate on the untouched test set
    y_pred = best_clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(f"Best Parameters: {grid_search.best_params_}")

    # Save the model
    joblib.dump(best_clf, 'best_random_forest_model.pkl')
    print("Model saved as 'best_random_forest_model.pkl'")

# Run the pipeline right away: main() is invoked unconditionally (no __name__ guard).
main()


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib
import glob

def extract_features_from_window(window):
    """Compute ten features per column for one window of sensor readings.

    Per column, in this order:

    * max, min, mean and population standard deviation of the "pulse"
      samples (samples strictly above the column mean within the window),
    * ``number_of_pulse_samples / len(column)``,
    * signal energy (sum of squares), mean, standard deviation, variance and
      root mean square over all samples of the column.

    The last above-mean sample of every column is left out of the five pulse
    values.  Earlier revisions dropped it to align the heights with
    ``np.diff`` of their index; the resulting duration/area arrays were never
    used (and made the function fail on non-numeric indexes), so they are no
    longer computed.  The truncation itself is kept so that feature values
    stay identical to earlier runs.

    Args:
        window: DataFrame with numeric columns and at least one row.

    Returns:
        Flat list with ten entries per column.

    Raises:
        ZeroDivisionError: if the window has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        above_mean = sensor_data[sensor_data > sensor_data.mean()]
        # Keep the historical behaviour: the final pulse sample is excluded.
        heights = above_mean.values[:-1]

        if len(heights) > 0:
            pulse_stats = [np.max(heights), np.min(heights), np.mean(heights), np.std(heights)]
        else:
            pulse_stats = [0, 0, 0, 0]
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics over every sample of the column
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)
        std = np.std(sensor_data)
        var = np.var(sensor_data)
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square

        features.extend([
            *pulse_stats, pulse_frequency,
            energy, mean, std, var, rms
        ])

    return features

def prepare_dataset_from_windows(df, state, window_size=89):
    """Cut ``df`` into consecutive, non-overlapping windows and featurize each.

    Windows start at rows 0, ``window_size``, ``2 * window_size``, ...; rows
    left over after the last complete window are ignored.

    Args:
        df: DataFrame to slice by row position.
        state: Label attached to every window taken from ``df``.
        window_size: Number of rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one feature row per window and a
        label vector of the same length.  Both are empty when ``df`` has
        fewer than ``window_size`` rows.
    """
    last_start = len(df) - window_size
    rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in range(0, last_start + 1, window_size)
    ]
    labels = [state] * len(rows)
    return np.array(rows), np.array(labels)

def load_and_process_files(file_paths, window_size=89):
    """Load CSV files and group their windowed features by state.

    The state of a file is the part of its base name before the first
    underscore.  Each file is read with ``pd.read_csv``, its columns are
    renamed ``s0, s1, ...`` and it is windowed via
    ``prepare_dataset_from_windows``.

    Several files may share a state; their windows are stacked together.
    (Assigning ``data[state]`` directly, as before, kept only the last file
    of each state.)  Unreadable files and files too short for one complete
    window are reported and skipped, so every returned entry is non-empty.

    Args:
        file_paths: Iterable of CSV paths.
        window_size: Rows per window, forwarded to the windowing helper.

    Returns:
        Dict mapping state -> ``(X, y)`` where ``X`` is the stacked feature
        matrix and ``y`` the matching label array.  Empty when nothing could
        be loaded.
    """
    import os

    per_state = {}
    for file_path in file_paths:
        # basename() understands the platform's separators, unlike split('/').
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Best effort: one unreadable file must not abort the whole run.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        df.columns = [f's{i}' for i in range(len(df.columns))]

        X, y = prepare_dataset_from_windows(df, state, window_size)

        if len(X) == 0:
            print(f"Skipping {file_path}: no complete window of {window_size} rows.")
            continue

        feature_parts, label_parts = per_state.setdefault(state, ([], []))
        feature_parts.append(X)
        label_parts.append(y)

    return {
        state: (np.vstack(feature_parts), np.concatenate(label_parts))
        for state, (feature_parts, label_parts) in per_state.items()
    }

def leave_one_out_cross_validation(data):
    """Hold out each state in turn and evaluate a tuned random forest on it.

    For every key of ``data`` the corresponding ``(X, y)`` pair is used as the
    validation set and all other entries are stacked into the training set.
    Features are standardized with a scaler fitted on the training set, a
    ``RandomForestClassifier`` is grid-searched with 5-fold CV, and the best
    estimator is scored on the held-out entry.

    NOTE(review): if the labels of each entry are all equal to its key (which
    is how load_and_process_files builds them), the held-out label never
    occurs in the training data, so it cannot be predicted - confirm that
    this is the intended experiment.

    Args:
        data: Dict mapping state -> ``(X, y)``; needs at least two entries,
            otherwise there is nothing to stack for training.

    Returns:
        Dict mapping state -> classification report (``output_dict=True``).
    """
    # The search space does not depend on the fold, so build it once.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [10, 20, 30, None],
        'criterion': ['gini', 'entropy']
    }

    all_states = list(data.keys())
    results = {}

    for state in all_states:
        # Held-out fold
        X_val, y_val = data[state]

        # Everything else becomes training data
        training_states = [other for other in all_states if other != state]
        X_train = np.vstack([data[other][0] for other in training_states])
        y_train = np.concatenate([data[other][1] for other in training_states])

        # Scale with training statistics only
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # Tune a fresh classifier for this fold
        grid_search = GridSearchCV(
            estimator=RandomForestClassifier(),
            param_grid=param_grid,
            cv=5,
            n_jobs=-1,
            verbose=2,
        )
        grid_search.fit(X_train, y_train)
        best_clf = grid_search.best_estimator_

        # Score the held-out fold
        y_pred = best_clf.predict(X_val)
        results[state] = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
        print(f"Validation on state {state}:")
        print(classification_report(y_val, y_pred, zero_division=0))

        # How often each label was predicted
        unique, counts = np.unique(y_pred, return_counts=True)
        prediction_distribution = dict(zip(unique, counts))
        print(f"Prediction distribution for state {state}: {prediction_distribution}")

    return results

def main():
    """Run leave-one-state-out validation over the CSV files in the cwd.

    Loads every ``*.csv`` with a window size of 89 rows, runs
    ``leave_one_out_cross_validation`` on the result and prints the report
    of each held-out state.  Prints a notice and stops when nothing was
    loaded.
    """
    data = load_and_process_files(glob.glob('*.csv'), 89)

    if data:
        print("Feature extraction and dataset preparation completed.")

        # One report per held-out state
        results = leave_one_out_cross_validation(data)
        for state, report in results.items():
            print(f"Results for state {state}:")
            print(report)
    else:
        print("No data to process.")

# Run the pipeline right away: main() is invoked unconditionally (no __name__ guard).
main()


## 支持向量机

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib
import glob

def extract_features_from_window(window):
    """Compute twelve features per column for one window of sensor readings.

    Per column, in this order:

    * max, min, mean and population standard deviation of the "pulse"
      samples (samples strictly above the column mean within the window),
    * ``number_of_pulse_samples / len(column)``,
    * signal energy (sum of squares), mean, standard deviation, variance and
      root mean square over all samples of the column,
    * skewness and kurtosis as computed by pandas.

    The last above-mean sample of every column is left out of the five pulse
    values.  Earlier revisions dropped it to align the heights with
    ``np.diff`` of their index; the resulting duration/area arrays were never
    used (and made the function fail on non-numeric indexes), so they are no
    longer computed.  The truncation itself is kept so that feature values
    stay identical to what previously trained models were fitted on.

    Args:
        window: DataFrame with numeric columns and at least one row.

    Returns:
        Flat list with twelve entries per column.

    Raises:
        ZeroDivisionError: if the window has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        above_mean = sensor_data[sensor_data > sensor_data.mean()]
        # Keep the historical behaviour: the final pulse sample is excluded.
        heights = above_mean.values[:-1]

        if len(heights) > 0:
            pulse_stats = [np.max(heights), np.min(heights), np.mean(heights), np.std(heights)]
        else:
            pulse_stats = [0, 0, 0, 0]
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics over every sample of the column
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)
        std = np.std(sensor_data)
        var = np.var(sensor_data)
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()
        kurtosis = sensor_data.kurt()

        features.extend([
            *pulse_stats, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def prepare_dataset_from_windows(df, state, window_size=89):
    """Cut ``df`` into consecutive, non-overlapping windows and featurize each.

    Windows start at rows 0, ``window_size``, ``2 * window_size``, ...; rows
    left over after the last complete window are ignored.

    Args:
        df: DataFrame to slice by row position.
        state: Label attached to every window taken from ``df``.
        window_size: Number of rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one feature row per window and a
        label vector of the same length.  Both are empty when ``df`` has
        fewer than ``window_size`` rows.
    """
    last_start = len(df) - window_size
    rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in range(0, last_start + 1, window_size)
    ]
    labels = [state] * len(rows)
    return np.array(rows), np.array(labels)

def load_and_process_files(file_paths, window_size=89):
    """Build one feature matrix and label vector from a set of CSV files.

    The label ("state") of a file is the part of its base name before the
    first underscore.  Each file is read with ``pd.read_csv``, its columns
    are renamed ``s0, s1, ...`` and it is windowed via
    ``prepare_dataset_from_windows``.

    Files that cannot be read are reported and skipped.  Files too short to
    yield a single complete window are skipped as well; previously their
    empty result was passed on to ``np.vstack`` together with the real
    feature matrices, which fails on the shape mismatch.

    Args:
        file_paths: Iterable of CSV paths.
        window_size: Rows per window, forwarded to the windowing helper.

    Returns:
        ``(X, y)`` with all windows stacked row-wise and their labels, or
        ``(None, None)`` when no file contributed any window.
    """
    import os

    all_X = []
    all_y = []

    for file_path in file_paths:
        # basename() understands the platform's separators, unlike split('/').
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Best effort: one unreadable file must not abort the whole run.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        df.columns = [f's{i}' for i in range(len(df.columns))]

        X, y = prepare_dataset_from_windows(df, state, window_size)

        if len(X) == 0:
            print(f"Skipping {file_path}: no complete window of {window_size} rows.")
            continue

        all_X.append(X)
        all_y.append(y)

    if not all_X:
        print("No data to process.")
        return None, None

    return np.vstack(all_X), np.concatenate(all_y)

def train_and_evaluate_model(X, y):
    """Tune a random forest and an SVM on the same split, report and save both.

    The features are standardized, 20% of the samples are held out
    (``random_state=42``), each classifier is grid-searched with 5-fold CV on
    the remaining 80%, a classification report on the held-out part is
    printed for each, and the best estimators are written to
    ``best_random_forest_model.pkl`` and ``best_svm_model.pkl``.

    NOTE(review): the scaler is fitted on all samples before the split, so
    the held-out part influences the scaling, and the scaler itself is not
    saved with the models.  Left unchanged here because code that loads the
    saved models has to reproduce the same scaling.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels, one per row of ``X``.

    Returns:
        None.  Reports are printed and the models are written to disk.
    """
    # Standardize, then hold out 20% for the final reports
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Search spaces
    param_grid_rf = {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [10, 20, 30, None],
        'criterion': ['gini', 'entropy']
    }
    param_grid_svm = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'kernel': ['linear', 'rbf']
    }

    # (grid search, report heading, output path, confirmation message),
    # random forest first and SVM second throughout.
    candidates = [
        (
            GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2),
            "RandomForest Classification Report:",
            'best_random_forest_model.pkl',
            "RandomForest model saved as 'best_random_forest_model.pkl'",
        ),
        (
            GridSearchCV(estimator=SVC(), param_grid=param_grid_svm, cv=5, n_jobs=-1, verbose=2),
            "SVM Classification Report:",
            'best_svm_model.pkl',
            "SVM model saved as 'best_svm_model.pkl'",
        ),
    ]

    # Phase 1: run both searches
    for search, _, _, _ in candidates:
        search.fit(X_train, y_train)

    # Phase 2: evaluate both best estimators on the held-out part
    for search, heading, _, _ in candidates:
        predictions = search.best_estimator_.predict(X_test)
        print(heading)
        print(classification_report(y_test, predictions, zero_division=0))

    # Phase 3: persist both models
    for search, _, model_path, saved_message in candidates:
        joblib.dump(search.best_estimator_, model_path)
        print(saved_message)

def main():
    """Load the CSV files in the cwd and hand them to the training routine.

    Every ``*.csv`` is windowed with a window size of 89 rows.  If that
    produced data, the sample count is printed and
    ``train_and_evaluate_model`` is run; otherwise a notice is printed.
    """
    all_X, all_y = load_and_process_files(glob.glob('*.csv'), 89)

    if all_X is not None and all_y is not None:
        print("Feature extraction and dataset preparation completed.")
        print(f"Total samples: {len(all_y)}")

        # Tune, evaluate and save the models
        train_and_evaluate_model(all_X, all_y)
    else:
        print("No data to process.")

# Run the pipeline right away: main() is invoked unconditionally (no __name__ guard).
main()


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib
import glob

def extract_features_from_window(window):
    """Compute twelve features per column for one window of sensor readings.

    Per column, in this order:

    * max, min, mean and population standard deviation of the "pulse"
      samples (samples strictly above the column mean within the window),
    * ``number_of_pulse_samples / len(column)``,
    * signal energy (sum of squares), mean, standard deviation, variance and
      root mean square over all samples of the column,
    * skewness and kurtosis as computed by pandas.

    The last above-mean sample of every column is left out of the five pulse
    values.  Earlier revisions dropped it to align the heights with
    ``np.diff`` of their index; the resulting duration/area arrays were never
    used (and made the function fail on non-numeric indexes), so they are no
    longer computed.  The truncation itself is kept so that feature values
    stay identical to earlier runs.

    Args:
        window: DataFrame with numeric columns and at least one row.

    Returns:
        Flat list with twelve entries per column.

    Raises:
        ZeroDivisionError: if the window has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        above_mean = sensor_data[sensor_data > sensor_data.mean()]
        # Keep the historical behaviour: the final pulse sample is excluded.
        heights = above_mean.values[:-1]

        if len(heights) > 0:
            pulse_stats = [np.max(heights), np.min(heights), np.mean(heights), np.std(heights)]
        else:
            pulse_stats = [0, 0, 0, 0]
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics over every sample of the column
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)
        std = np.std(sensor_data)
        var = np.var(sensor_data)
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()
        kurtosis = sensor_data.kurt()

        features.extend([
            *pulse_stats, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def prepare_dataset_from_windows(df, state, window_size=89):
    """Cut ``df`` into consecutive, non-overlapping windows and featurize each.

    Windows start at rows 0, ``window_size``, ``2 * window_size``, ...; rows
    left over after the last complete window are ignored.

    Args:
        df: DataFrame to slice by row position.
        state: Label attached to every window taken from ``df``.
        window_size: Number of rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one feature row per window and a
        label vector of the same length.  Both are empty when ``df`` has
        fewer than ``window_size`` rows.
    """
    last_start = len(df) - window_size
    rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in range(0, last_start + 1, window_size)
    ]
    labels = [state] * len(rows)
    return np.array(rows), np.array(labels)

def load_and_process_files(file_paths, window_size=89):
    """Load CSV files and group their windowed features by state.

    The state of a file is the part of its base name before the first
    underscore.  Each file is read with ``pd.read_csv``, its columns are
    renamed ``s0, s1, ...`` and it is windowed via
    ``prepare_dataset_from_windows``.

    Several files may share a state; their windows are stacked together.
    (Assigning ``data[state]`` directly, as before, kept only the last file
    of each state.)  Unreadable files and files too short for one complete
    window are reported and skipped, so every returned entry is non-empty.

    Args:
        file_paths: Iterable of CSV paths.
        window_size: Rows per window, forwarded to the windowing helper.

    Returns:
        Dict mapping state -> ``(X, y)`` where ``X`` is the stacked feature
        matrix and ``y`` the matching label array.  Empty when nothing could
        be loaded.
    """
    import os

    per_state = {}
    for file_path in file_paths:
        # basename() understands the platform's separators, unlike split('/').
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Best effort: one unreadable file must not abort the whole run.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        df.columns = [f's{i}' for i in range(len(df.columns))]

        X, y = prepare_dataset_from_windows(df, state, window_size)

        if len(X) == 0:
            print(f"Skipping {file_path}: no complete window of {window_size} rows.")
            continue

        feature_parts, label_parts = per_state.setdefault(state, ([], []))
        feature_parts.append(X)
        label_parts.append(y)

    return {
        state: (np.vstack(feature_parts), np.concatenate(label_parts))
        for state, (feature_parts, label_parts) in per_state.items()
    }

def leave_one_out_cross_validation(data):
    """Hold out each state in turn and evaluate a tuned SVM on it.

    For every key of ``data`` the corresponding ``(X, y)`` pair is used as the
    validation set and all other entries are stacked into the training set.
    Features are standardized with a scaler fitted on the training set, an
    ``SVC`` is grid-searched with 5-fold CV, the best estimator's 5-fold
    cross-validation scores on the training set are printed, and it is then
    scored on the held-out entry.

    NOTE(review): if the labels of each entry are all equal to its key (which
    is how load_and_process_files builds them), the held-out label never
    occurs in the training data, so it cannot be predicted - confirm that
    this is the intended experiment.

    Args:
        data: Dict mapping state -> ``(X, y)``; needs at least two entries,
            otherwise there is nothing to stack for training.

    Returns:
        Dict mapping state -> classification report (``output_dict=True``).
    """
    # The SVM search space does not depend on the fold, so build it once.
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'kernel': ['linear', 'rbf']
    }

    all_states = list(data.keys())
    results = {}

    for state in all_states:
        # Held-out fold
        X_val, y_val = data[state]

        # Everything else becomes training data
        training_states = [other for other in all_states if other != state]
        X_train = np.vstack([data[other][0] for other in training_states])
        y_train = np.concatenate([data[other][1] for other in training_states])

        # Scale with training statistics only
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # Tune a fresh classifier for this fold
        grid_search = GridSearchCV(
            estimator=SVC(),
            param_grid=param_grid,
            cv=5,
            n_jobs=-1,
            verbose=2,
        )
        grid_search.fit(X_train, y_train)
        best_clf = grid_search.best_estimator_

        # Cross-validate the selected configuration on the training data
        cv_scores = cross_val_score(best_clf, X_train, y_train, cv=5)
        print(f"Cross-validation scores for state {state}: {cv_scores}")
        print(f"Mean cross-validation score for state {state}: {np.mean(cv_scores)}")

        # Score the held-out fold
        y_pred = best_clf.predict(X_val)
        results[state] = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
        print(f"Validation on state {state}:")
        print(classification_report(y_val, y_pred, zero_division=0))

        # How often each label was predicted
        unique, counts = np.unique(y_pred, return_counts=True)
        prediction_distribution = dict(zip(unique, counts))
        print(f"Prediction distribution for state {state}: {prediction_distribution}")

    return results

def main():
    """Run leave-one-state-out validation over the CSV files in the cwd.

    Loads every ``*.csv`` with a window size of 89 rows, runs
    ``leave_one_out_cross_validation`` on the result and prints the report
    of each held-out state.  Prints a notice and stops when nothing was
    loaded.
    """
    data = load_and_process_files(glob.glob('*.csv'), 89)

    if data:
        print("Feature extraction and dataset preparation completed.")

        # One report per held-out state
        results = leave_one_out_cross_validation(data)
        for state, report in results.items():
            print(f"Results for state {state}:")
            print(report)
    else:
        print("No data to process.")

# Run the pipeline right away: main() is invoked unconditionally (no __name__ guard).
main()


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib
import glob

def extract_features_from_window(window):
    """Compute twelve features per column for one window of sensor readings.

    Per column, in this order:

    * max, min, mean and population standard deviation of the "pulse"
      samples (samples strictly above the column mean within the window),
    * ``number_of_pulse_samples / len(column)``,
    * signal energy (sum of squares), mean, standard deviation, variance and
      root mean square over all samples of the column,
    * skewness and kurtosis as computed by pandas.

    The last above-mean sample of every column is left out of the five pulse
    values.  Earlier revisions dropped it to align the heights with
    ``np.diff`` of their index; the resulting duration/area arrays were never
    used (and made the function fail on non-numeric indexes), so they are no
    longer computed.  The truncation itself is kept so that feature values
    stay identical to what previously trained models were fitted on.

    Args:
        window: DataFrame with numeric columns and at least one row.

    Returns:
        Flat list with twelve entries per column.

    Raises:
        ZeroDivisionError: if the window has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        above_mean = sensor_data[sensor_data > sensor_data.mean()]
        # Keep the historical behaviour: the final pulse sample is excluded.
        heights = above_mean.values[:-1]

        if len(heights) > 0:
            pulse_stats = [np.max(heights), np.min(heights), np.mean(heights), np.std(heights)]
        else:
            pulse_stats = [0, 0, 0, 0]
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics over every sample of the column
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)
        std = np.std(sensor_data)
        var = np.var(sensor_data)
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()
        kurtosis = sensor_data.kurt()

        features.extend([
            *pulse_stats, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def prepare_dataset_from_windows(df, window_size=89):
    """Cut ``df`` into consecutive, non-overlapping windows and featurize each.

    Windows start at rows 0, ``window_size``, ``2 * window_size``, ...; rows
    left over after the last complete window are ignored.

    Args:
        df: DataFrame to slice by row position.
        window_size: Number of rows per window.

    Returns:
        NumPy array with one feature row per window; empty when ``df`` has
        fewer than ``window_size`` rows.
    """
    last_start = len(df) - window_size
    return np.array([
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in range(0, last_start + 1, window_size)
    ])

def load_and_process_files(file_paths, window_size=89):
    """Build one feature matrix and label vector from a set of CSV files.

    The label ("state") of a file is the part of its base name before the
    first underscore.  Each file is read with ``pd.read_csv``, its columns
    are renamed ``s0, s1, ...`` and it is windowed via
    ``prepare_dataset_from_windows``; every window of a file gets that
    file's state as its label.

    Files that cannot be read are reported and skipped.  Files too short to
    yield a single complete window are skipped as well; previously their
    empty result was passed on to ``np.vstack`` together with the real
    feature matrices, which fails on the shape mismatch.

    Args:
        file_paths: Iterable of CSV paths.
        window_size: Rows per window, forwarded to the windowing helper.

    Returns:
        ``(X, y)`` with all windows stacked row-wise and a label array of the
        same length, or ``(None, None)`` when no file contributed any window.
    """
    import os

    all_X = []
    all_y = []

    for file_path in file_paths:
        # basename() understands the platform's separators, unlike split('/').
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Best effort: one unreadable file must not abort the whole run.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        df.columns = [f's{i}' for i in range(len(df.columns))]

        X = prepare_dataset_from_windows(df, window_size)

        if len(X) == 0:
            print(f"Skipping {file_path}: no complete window of {window_size} rows.")
            continue

        all_X.append(X)
        all_y.extend([state] * len(X))

    if not all_X:
        print("No data to process.")
        return None, None

    return np.vstack(all_X), np.array(all_y)

def main():
    """Evaluate the saved SVM model on windowed features from local CSV files.

    Loads ``best_svm_model.pkl``, windows every ``*.csv`` in the working
    directory (window size 89), standardizes the features, predicts a state
    per window and prints a classification report plus the distribution of
    predicted labels.

    The classification report is now computed once for printing; the extra
    ``output_dict=True`` computation whose result was never used has been
    removed.

    Returns:
        None.  All results are printed.
    """
    # Load the previously saved model
    model = joblib.load('best_svm_model.pkl')
    print("Model loaded from 'best_svm_model.pkl'")

    # Load and process the new data
    file_paths = glob.glob('*.csv')  # Adjust the path as necessary
    window_size = 89  # Set the window size to 89 or 90 based on your preference

    all_X, all_y = load_and_process_files(file_paths, window_size)

    if all_X is None or all_y is None:
        print("No data to process.")
        return

    print("Feature extraction and dataset preparation completed.")
    print(f"Total samples: {len(all_y)}")

    # Standardize the features.
    # NOTE(review): this fits a new scaler on the data being evaluated; the
    # scaler used when the model was trained is not available here.  The two
    # scalings only agree when the evaluated data matches the training data -
    # confirm this is acceptable, or persist and reuse the training scaler.
    scaler = StandardScaler()
    all_X = scaler.fit_transform(all_X)

    # Predict with the loaded model
    y_pred = model.predict(all_X)

    # Evaluate model performance
    print("Classification Report:")
    print(classification_report(all_y, y_pred, zero_division=0))

    # Print the distribution of predicted labels
    unique, counts = np.unique(y_pred, return_counts=True)
    prediction_distribution = dict(zip(unique, counts))
    print(f"Prediction distribution: {prediction_distribution}")

# Run the main function right away: main() is invoked unconditionally (no __name__ guard).
main()


In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib
import glob

def extract_features_from_window(window):
    """Compute twelve features per column for one window of sensor readings.

    Per column, in this order:

    * max, min, mean and population standard deviation of the "pulse"
      samples (samples strictly above the column mean within the window),
    * ``number_of_pulse_samples / len(column)``,
    * signal energy (sum of squares), mean, standard deviation, variance and
      root mean square over all samples of the column,
    * skewness and kurtosis as computed by pandas.

    The last above-mean sample of every column is left out of the five pulse
    values.  Earlier revisions dropped it to align the heights with
    ``np.diff`` of their index; the resulting duration/area arrays were never
    used (and made the function fail on non-numeric indexes), so they are no
    longer computed.  The truncation itself is kept so that feature values
    stay identical to what previously trained models were fitted on.

    Args:
        window: DataFrame with numeric columns and at least one row.

    Returns:
        Flat list with twelve entries per column.

    Raises:
        ZeroDivisionError: if the window has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        above_mean = sensor_data[sensor_data > sensor_data.mean()]
        # Keep the historical behaviour: the final pulse sample is excluded.
        heights = above_mean.values[:-1]

        if len(heights) > 0:
            pulse_stats = [np.max(heights), np.min(heights), np.mean(heights), np.std(heights)]
        else:
            pulse_stats = [0, 0, 0, 0]
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics over every sample of the column
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)
        std = np.std(sensor_data)
        var = np.var(sensor_data)
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()
        kurtosis = sensor_data.kurt()

        features.extend([
            *pulse_stats, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def prepare_dataset_from_windows(df, window_size=89):
    """Cut ``df`` into consecutive, non-overlapping windows and featurize each.

    Windows start at rows 0, ``window_size``, ``2 * window_size``, ...; rows
    left over after the last complete window are ignored.

    Args:
        df: DataFrame to slice by row position.
        window_size: Number of rows per window.

    Returns:
        NumPy array with one feature row per window; empty when ``df`` has
        fewer than ``window_size`` rows.
    """
    last_start = len(df) - window_size
    return np.array([
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in range(0, last_start + 1, window_size)
    ])

def load_and_process_files(file_paths, window_size=89):
    """Load CSV files and group their window features by state label.

    The state label of a file is the part of its base name before the first
    underscore.  Files that cannot be read, or that are too short to yield
    a single window, are reported and skipped.  When several files share a
    state their windows are merged instead of the later file replacing the
    earlier one.

    Args:
        file_paths: Iterable of CSV file paths.
        window_size: Number of rows per window.

    Returns:
        dict: ``state -> (X, y)`` where ``X`` is the feature array and ``y``
        is a list holding ``state`` once per row of ``X``.
    """
    import os  # local import: only needed for portable base-name extraction

    data = {}
    for file_path in file_paths:
        # Extract state from the file name
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Load the CSV file; an unreadable file is reported and skipped.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # Give every file the same positional column labels (s0, s1, ...).
        df.columns = [f's{i}' for i in range(len(df.columns))]

        # Prepare dataset from windows
        X = prepare_dataset_from_windows(df, window_size)
        if len(X) == 0:
            # An empty result has no feature columns and cannot be stacked
            # or scaled together with the other states later on.
            print(f"Skipping {file_path}: fewer than {window_size} rows.")
            continue

        if state in data:
            # Merge with windows already collected for this state.
            prev_X, prev_y = data[state]
            data[state] = (np.vstack([prev_X, X]), prev_y + [state] * len(X))
        else:
            data[state] = (X, [state] * len(X))

    return data

def leave_one_out_cross_validation_with_loaded_model(data, model_path):
    """Evaluate a saved model on each state's windows in turn.

    For every state the feature scaler is fitted on the windows of all
    *other* states and then applied to the held-out state's windows, which
    are classified by the model loaded from ``model_path``.  The loaded
    model itself is never re-fitted here.

    NOTE(review): the scaler fitted here is not necessarily the one used
    when the model was trained -- confirm that this matches the intended
    evaluation protocol.

    Args:
        data: Mapping ``state -> (X, y)`` of feature arrays and labels.
        model_path: Path of the joblib-serialised model.

    Returns:
        dict: ``state -> classification report`` in the ``output_dict=True``
        form of ``classification_report``.

    Raises:
        ValueError: If a held-out state has no other state to fit the
            scaler on (i.e. ``data`` contains a single state).
    """
    all_states = list(data.keys())
    results = {}

    # Load the pre-trained model
    model = joblib.load(model_path)
    print(f"Model loaded from '{model_path}'")

    for state in all_states:
        # Use the current state as the validation set
        X_val, y_val = data[state]

        # Features of every other state; only used to fit the scaler.
        other_features = [data[s][0] for s in all_states if s != state]
        if not other_features:
            raise ValueError(
                "Leave-one-out evaluation needs at least two states; "
                f"only '{state}' is available."
            )

        # Standardize the held-out features with statistics of the others
        scaler = StandardScaler().fit(np.vstack(other_features))
        X_val = scaler.transform(X_val)

        # Evaluate the loaded model on the validation set
        y_pred = model.predict(X_val)
        report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
        results[state] = report
        print(f"Validation on state {state}:")
        print(classification_report(y_val, y_pred, zero_division=0))

        # Print distribution of predictions
        unique, counts = np.unique(y_pred, return_counts=True)
        prediction_distribution = dict(zip(unique, counts))
        print(f"Prediction distribution for state {state}: {prediction_distribution}")

    return results

def main():
    """Run the leave-one-state-out evaluation of the saved SVM model.

    Featurises every ``*.csv`` file in the working directory, evaluates
    ``best_svm_model.pkl`` state by state and prints each report.
    """
    window_size = 89  # Set the window size to 89 or 90 based on your preference
    csv_files = glob.glob('*.csv')  # Adjust the path as necessary

    # Feature extraction for all files, grouped by state
    data = load_and_process_files(csv_files, window_size)
    if not data:
        print("No data to process.")
        return

    print("Feature extraction and dataset preparation completed.")

    # Evaluate the saved model with each state held out in turn
    results = leave_one_out_cross_validation_with_loaded_model(
        data, 'best_svm_model.pkl'
    )

    # Dump the per-state report dictionaries
    for state in results:
        print(f"Results for state {state}:")
        print(results[state])

# Run the main function
main()


In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib
import glob

def extract_features_from_window(window):
    """Turn one window of sensor samples into a flat feature vector.

    Twelve values are appended per column, in this order: max, min, mean
    and std of the "pulse" samples (samples above the column mean, with the
    last one dropped), the pulse frequency, then energy, mean, std,
    variance, RMS, skewness and kurtosis of the whole column.

    Args:
        window: DataFrame with one column per sensor channel and one row
            per sample.

    Returns:
        list: ``12 * len(window.columns)`` numeric features.

    Raises:
        ZeroDivisionError: If ``window`` has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        # "Pulses" are the samples lying above the column mean.
        pulses = sensor_data[sensor_data > sensor_data.mean()]

        # The last pulse sample is dropped on purpose: the original code did
        # so to line up with a per-pulse duration/area computation whose
        # result was never used (removed here as dead code).  The slice is
        # kept so the produced feature values stay exactly the same.
        # NOTE(review): presumably the saved models were trained on these
        # exact features -- confirm before changing the slice.
        heights = pulses.values[:-1]

        if len(heights) > 0:
            max_height = np.max(heights)
            min_height = np.min(heights)
            mean_height = np.mean(heights)
            std_height = np.std(heights)
        else:
            # No usable pulse samples: fall back to zeros.
            max_height = min_height = mean_height = std_height = 0
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics.
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)  # Mean value
        std = np.std(sensor_data)  # Standard deviation
        var = np.var(sensor_data)  # Variance
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()  # Skewness
        kurtosis = sensor_data.kurt()  # Kurtosis

        features.extend([
            max_height, min_height, mean_height, std_height, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def prepare_dataset_from_windows(df, window_size=89):
    """Cut ``df`` into back-to-back windows and featurise each one.

    Windows start every ``window_size`` rows and do not overlap; trailing
    rows that cannot fill a whole window are ignored.

    Args:
        df: DataFrame of sensor samples, one row per sample.
        window_size: Number of rows per window.

    Returns:
        numpy.ndarray: One row of features per complete window (an empty
        array when there is no complete window).
    """
    window_starts = range(0, len(df) - window_size + 1, window_size)
    feature_rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in window_starts
    ]
    return np.array(feature_rows)

def load_and_process_files(file_paths, window_size=89):
    """Load CSV files and group their window features by state label.

    The state label of a file is the part of its base name before the first
    underscore.  Files that cannot be read, or that are too short to yield
    a single window, are reported and skipped.  When several files share a
    state their windows are merged instead of the later file replacing the
    earlier one.

    Args:
        file_paths: Iterable of CSV file paths.
        window_size: Number of rows per window.

    Returns:
        dict: ``state -> (X, y)`` where ``X`` is the feature array and ``y``
        is a list holding ``state`` once per row of ``X``.
    """
    import os  # local import: only needed for portable base-name extraction

    data = {}
    for file_path in file_paths:
        # Extract state from the file name
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Load the CSV file; an unreadable file is reported and skipped.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # Give every file the same positional column labels (s0, s1, ...).
        df.columns = [f's{i}' for i in range(len(df.columns))]

        # Prepare dataset from windows
        X = prepare_dataset_from_windows(df, window_size)
        if len(X) == 0:
            # An empty result has no feature columns and cannot be stacked
            # or scaled together with the other states later on.
            print(f"Skipping {file_path}: fewer than {window_size} rows.")
            continue

        if state in data:
            # Merge with windows already collected for this state.
            prev_X, prev_y = data[state]
            data[state] = (np.vstack([prev_X, X]), prev_y + [state] * len(X))
        else:
            data[state] = (X, [state] * len(X))

    return data

def leave_one_out_cross_validation_with_loaded_model(data, model_path):
    """Evaluate a saved model on each state's windows in turn.

    For every state the feature scaler is fitted on the windows of all
    *other* states and then applied to the held-out state's windows, which
    are classified by the model loaded from ``model_path``.  The loaded
    model itself is never re-fitted here.

    NOTE(review): the scaler fitted here is not necessarily the one used
    when the model was trained -- confirm that this matches the intended
    evaluation protocol.

    Args:
        data: Mapping ``state -> (X, y)`` of feature arrays and labels.
        model_path: Path of the joblib-serialised model.

    Returns:
        dict: ``state -> classification report`` in the ``output_dict=True``
        form of ``classification_report``.

    Raises:
        ValueError: If a held-out state has no other state to fit the
            scaler on (i.e. ``data`` contains a single state).
    """
    all_states = list(data.keys())
    results = {}

    # Load the pre-trained model
    model = joblib.load(model_path)
    print(f"Model loaded from '{model_path}'")

    for state in all_states:
        # Use the current state as the validation set
        X_val, y_val = data[state]

        # Features of every other state; only used to fit the scaler.
        other_features = [data[s][0] for s in all_states if s != state]
        if not other_features:
            raise ValueError(
                "Leave-one-out evaluation needs at least two states; "
                f"only '{state}' is available."
            )

        # Standardize the held-out features with statistics of the others
        scaler = StandardScaler().fit(np.vstack(other_features))
        X_val = scaler.transform(X_val)

        # Evaluate the loaded model on the validation set
        y_pred = model.predict(X_val)
        report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
        results[state] = report
        print(f"Validation on state {state}:")
        print(classification_report(y_val, y_pred, zero_division=0))

        # Print distribution of predictions
        unique, counts = np.unique(y_pred, return_counts=True)
        prediction_distribution = dict(zip(unique, counts))
        print(f"Prediction distribution for state {state}: {prediction_distribution}")

    return results

def main():
    """Run the leave-one-state-out evaluation of the saved SVM model.

    Featurises every ``*.csv`` file in the working directory, evaluates
    ``best_svm_model.pkl`` state by state and prints each report.
    """
    window_size = 89  # Set the window size to 89 or 90 based on your preference
    csv_files = glob.glob('*.csv')  # Adjust the path as necessary

    # Feature extraction for all files, grouped by state
    data = load_and_process_files(csv_files, window_size)
    if not data:
        print("No data to process.")
        return

    print("Feature extraction and dataset preparation completed.")

    # Evaluate the saved model with each state held out in turn
    results = leave_one_out_cross_validation_with_loaded_model(
        data, 'best_svm_model.pkl'
    )

    # Dump the per-state report dictionaries
    for state in results:
        print(f"Results for state {state}:")
        print(results[state])

# Run the main function
main()


## SVM交叉验证

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib
import glob

def extract_features_from_window(window):
    """Turn one window of sensor samples into a flat feature vector.

    Twelve values are appended per column, in this order: max, min, mean
    and std of the "pulse" samples (samples above the column mean, with the
    last one dropped), the pulse frequency, then energy, mean, std,
    variance, RMS, skewness and kurtosis of the whole column.

    Args:
        window: DataFrame with one column per sensor channel and one row
            per sample.

    Returns:
        list: ``12 * len(window.columns)`` numeric features.

    Raises:
        ZeroDivisionError: If ``window`` has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        # "Pulses" are the samples lying above the column mean.
        pulses = sensor_data[sensor_data > sensor_data.mean()]

        # The last pulse sample is dropped on purpose: the original code did
        # so to line up with a per-pulse duration/area computation whose
        # result was never used (removed here as dead code).  The slice is
        # kept so the produced feature values stay exactly the same.
        # NOTE(review): presumably the saved models were trained on these
        # exact features -- confirm before changing the slice.
        heights = pulses.values[:-1]

        if len(heights) > 0:
            max_height = np.max(heights)
            min_height = np.min(heights)
            mean_height = np.mean(heights)
            std_height = np.std(heights)
        else:
            # No usable pulse samples: fall back to zeros.
            max_height = min_height = mean_height = std_height = 0
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics.
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)  # Mean value
        std = np.std(sensor_data)  # Standard deviation
        var = np.var(sensor_data)  # Variance
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()  # Skewness
        kurtosis = sensor_data.kurt()  # Kurtosis

        features.extend([
            max_height, min_height, mean_height, std_height, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def prepare_dataset_from_windows(df, window_size=89):
    """Cut ``df`` into back-to-back windows and featurise each one.

    Windows start every ``window_size`` rows and do not overlap; trailing
    rows that cannot fill a whole window are ignored.

    Args:
        df: DataFrame of sensor samples, one row per sample.
        window_size: Number of rows per window.

    Returns:
        numpy.ndarray: One row of features per complete window (an empty
        array when there is no complete window).
    """
    window_starts = range(0, len(df) - window_size + 1, window_size)
    feature_rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in window_starts
    ]
    return np.array(feature_rows)

def load_and_process_files(file_paths, window_size=89):
    """Load CSV files and stack their window features into one data set.

    The label of every window is the part of its file's base name before
    the first underscore.  Files that cannot be read, or that are too short
    to yield a single window, are reported and skipped.

    Args:
        file_paths: Iterable of CSV file paths.
        window_size: Number of rows per window.

    Returns:
        tuple: ``(all_X, all_y)`` -- the stacked feature array and the
        matching label array -- or ``(None, None)`` when no file produced
        any window.
    """
    import os  # local import: only needed for portable base-name extraction

    all_X = []
    all_y = []

    for file_path in file_paths:
        # Extract state from the file name
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Load the CSV file; an unreadable file is reported and skipped.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # Give every file the same positional column labels (s0, s1, ...).
        df.columns = [f's{i}' for i in range(len(df.columns))]

        # Prepare dataset from windows
        X = prepare_dataset_from_windows(df, window_size)
        if len(X) == 0:
            # An empty result has no feature columns and would make the
            # np.vstack below fail on a shape mismatch.
            print(f"Skipping {file_path}: fewer than {window_size} rows.")
            continue

        all_X.append(X)
        all_y.extend([state] * len(X))

    if not all_X:
        print("No data to process.")
        return None, None

    # Combine data from all files
    return np.vstack(all_X), np.array(all_y)

def main():
    """Evaluate the saved SVM model on every CSV file in the working directory.

    Loads ``best_svm_model.pkl``, extracts window features from all
    ``*.csv`` files, standardises them, prints 5-fold cross-validation
    scores, and then prints a classification report and the prediction
    distribution of the loaded model on the full data set.
    """
    # Load the previously saved model
    model = joblib.load('best_svm_model.pkl')
    print("Model loaded from 'best_svm_model.pkl'")

    # Load and process the data
    file_paths = glob.glob('*.csv')  # Adjust the path as necessary
    window_size = 89  # Set the window size to 89 or 90 based on your preference

    all_X, all_y = load_and_process_files(file_paths, window_size)

    if all_X is None or all_y is None:
        print("No data to process.")
        return

    print("Feature extraction and dataset preparation completed.")
    print(f"Total samples: {len(all_y)}")

    # Standardise the features.
    # NOTE(review): this scaler is fitted on the evaluation data itself, not
    # restored from training -- confirm this matches how the model was built.
    scaler = StandardScaler()
    all_X = scaler.fit_transform(all_X)

    # Cross-validated score of the estimator on this data
    cv_scores = cross_val_score(model, all_X, all_y, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean cross-validation score: {np.mean(cv_scores)}")

    # Predict with the loaded model
    y_pred = model.predict(all_X)

    # Report performance.  The dict-form report that used to be computed
    # here was never read, so only the printable report is produced.
    print("Classification Report:")
    print(classification_report(all_y, y_pred, zero_division=0))

    # Print the distribution of the predicted labels
    unique, counts = np.unique(y_pred, return_counts=True)
    prediction_distribution = dict(zip(unique, counts))
    print(f"Prediction distribution: {prediction_distribution}")

# 运行主函数
main()


## 随机森林交叉验证

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib
import glob

def extract_features_from_window(window):
    """Turn one window of sensor samples into a flat feature vector.

    Twelve values are appended per column, in this order: max, min, mean
    and std of the "pulse" samples (samples above the column mean, with the
    last one dropped), the pulse frequency, then energy, mean, std,
    variance, RMS, skewness and kurtosis of the whole column.

    Args:
        window: DataFrame with one column per sensor channel and one row
            per sample.

    Returns:
        list: ``12 * len(window.columns)`` numeric features.

    Raises:
        ZeroDivisionError: If ``window`` has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        # "Pulses" are the samples lying above the column mean.
        pulses = sensor_data[sensor_data > sensor_data.mean()]

        # The last pulse sample is dropped on purpose: the original code did
        # so to line up with a per-pulse duration/area computation whose
        # result was never used (removed here as dead code).  The slice is
        # kept so the produced feature values stay exactly the same.
        # NOTE(review): presumably the saved models were trained on these
        # exact features -- confirm before changing the slice.
        heights = pulses.values[:-1]

        if len(heights) > 0:
            max_height = np.max(heights)
            min_height = np.min(heights)
            mean_height = np.mean(heights)
            std_height = np.std(heights)
        else:
            # No usable pulse samples: fall back to zeros.
            max_height = min_height = mean_height = std_height = 0
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics.
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)  # Mean value
        std = np.std(sensor_data)  # Standard deviation
        var = np.var(sensor_data)  # Variance
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()  # Skewness
        kurtosis = sensor_data.kurt()  # Kurtosis

        features.extend([
            max_height, min_height, mean_height, std_height, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def prepare_dataset_from_windows(df, window_size=89):
    """Cut ``df`` into back-to-back windows and featurise each one.

    Windows start every ``window_size`` rows and do not overlap; trailing
    rows that cannot fill a whole window are ignored.

    Args:
        df: DataFrame of sensor samples, one row per sample.
        window_size: Number of rows per window.

    Returns:
        numpy.ndarray: One row of features per complete window (an empty
        array when there is no complete window).
    """
    window_starts = range(0, len(df) - window_size + 1, window_size)
    feature_rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in window_starts
    ]
    return np.array(feature_rows)

def load_and_process_files(file_paths, window_size=89):
    """Load CSV files and stack their window features into one data set.

    The label of every window is the part of its file's base name before
    the first underscore.  Files that cannot be read, or that are too short
    to yield a single window, are reported and skipped.

    Args:
        file_paths: Iterable of CSV file paths.
        window_size: Number of rows per window.

    Returns:
        tuple: ``(all_X, all_y)`` -- the stacked feature array and the
        matching label array -- or ``(None, None)`` when no file produced
        any window.
    """
    import os  # local import: only needed for portable base-name extraction

    all_X = []
    all_y = []

    for file_path in file_paths:
        # Extract state from the file name
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Load the CSV file; an unreadable file is reported and skipped.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # Give every file the same positional column labels (s0, s1, ...).
        df.columns = [f's{i}' for i in range(len(df.columns))]

        # Prepare dataset from windows
        X = prepare_dataset_from_windows(df, window_size)
        if len(X) == 0:
            # An empty result has no feature columns and would make the
            # np.vstack below fail on a shape mismatch.
            print(f"Skipping {file_path}: fewer than {window_size} rows.")
            continue

        all_X.append(X)
        all_y.extend([state] * len(X))

    if not all_X:
        print("No data to process.")
        return None, None

    # Combine data from all files
    return np.vstack(all_X), np.array(all_y)

def main():
    """Evaluate the saved random-forest model on every CSV file here.

    Loads ``best_random_forest_model.pkl``, extracts window features from
    all ``*.csv`` files in the working directory, standardises them, prints
    5-fold cross-validation scores, and then prints a classification report
    and the prediction distribution of the loaded model on the full data.
    """
    # Load the previously saved model.  The path now matches the log message
    # below; this random-forest cell used to load the SVM pickle instead.
    model = joblib.load('best_random_forest_model.pkl')
    print("Model loaded from 'best_random_forest_model.pkl'")

    # Load and process the data
    file_paths = glob.glob('*.csv')  # Adjust the path as necessary
    window_size = 89  # Set the window size to 89 or 90 based on your preference

    all_X, all_y = load_and_process_files(file_paths, window_size)

    if all_X is None or all_y is None:
        print("No data to process.")
        return

    print("Feature extraction and dataset preparation completed.")
    print(f"Total samples: {len(all_y)}")

    # Standardise the features.
    # NOTE(review): this scaler is fitted on the evaluation data itself, not
    # restored from training -- confirm this matches how the model was built.
    scaler = StandardScaler()
    all_X = scaler.fit_transform(all_X)

    # Cross-validated score of the estimator on this data
    cv_scores = cross_val_score(model, all_X, all_y, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean cross-validation score: {np.mean(cv_scores)}")

    # Predict with the loaded model
    y_pred = model.predict(all_X)

    # Report performance.  The dict-form report that used to be computed
    # here was never read, so only the printable report is produced.
    print("Classification Report:")
    print(classification_report(all_y, y_pred, zero_division=0))

    # Print the distribution of the predicted labels
    unique, counts = np.unique(y_pred, return_counts=True)
    prediction_distribution = dict(zip(unique, counts))
    print(f"Prediction distribution: {prediction_distribution}")

# 运行主函数
main()


## SVM和随机森林一起验证

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib
import glob

def extract_features_from_window(window):
    """Turn one window of sensor samples into a flat feature vector.

    Twelve values are appended per column, in this order: max, min, mean
    and std of the "pulse" samples (samples above the column mean, with the
    last one dropped), the pulse frequency, then energy, mean, std,
    variance, RMS, skewness and kurtosis of the whole column.

    Args:
        window: DataFrame with one column per sensor channel and one row
            per sample.

    Returns:
        list: ``12 * len(window.columns)`` numeric features.

    Raises:
        ZeroDivisionError: If ``window`` has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        # "Pulses" are the samples lying above the column mean.
        pulses = sensor_data[sensor_data > sensor_data.mean()]

        # The last pulse sample is dropped on purpose: the original code did
        # so to line up with a per-pulse duration/area computation whose
        # result was never used (removed here as dead code).  The slice is
        # kept so the produced feature values stay exactly the same.
        # NOTE(review): presumably the saved models were trained on these
        # exact features -- confirm before changing the slice.
        heights = pulses.values[:-1]

        if len(heights) > 0:
            max_height = np.max(heights)
            min_height = np.min(heights)
            mean_height = np.mean(heights)
            std_height = np.std(heights)
        else:
            # No usable pulse samples: fall back to zeros.
            max_height = min_height = mean_height = std_height = 0
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics.
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)  # Mean value
        std = np.std(sensor_data)  # Standard deviation
        var = np.var(sensor_data)  # Variance
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()  # Skewness
        kurtosis = sensor_data.kurt()  # Kurtosis

        features.extend([
            max_height, min_height, mean_height, std_height, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def prepare_dataset_from_windows(df, window_size=89):
    """Cut ``df`` into back-to-back windows and featurise each one.

    Windows start every ``window_size`` rows and do not overlap; trailing
    rows that cannot fill a whole window are ignored.

    Args:
        df: DataFrame of sensor samples, one row per sample.
        window_size: Number of rows per window.

    Returns:
        numpy.ndarray: One row of features per complete window (an empty
        array when there is no complete window).
    """
    window_starts = range(0, len(df) - window_size + 1, window_size)
    feature_rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in window_starts
    ]
    return np.array(feature_rows)

def load_and_process_files(file_paths, window_size=89):
    """Load CSV files and stack their window features into one data set.

    The label of every window is the part of its file's base name before
    the first underscore.  Files that cannot be read, or that are too short
    to yield a single window, are reported and skipped.

    Args:
        file_paths: Iterable of CSV file paths.
        window_size: Number of rows per window.

    Returns:
        tuple: ``(all_X, all_y)`` -- the stacked feature array and the
        matching label array -- or ``(None, None)`` when no file produced
        any window.
    """
    import os  # local import: only needed for portable base-name extraction

    all_X = []
    all_y = []

    for file_path in file_paths:
        # Extract state from the file name
        state = os.path.basename(file_path).split('_')[0]
        print(f"Processing file: {file_path} with state: {state}")

        # Load the CSV file; an unreadable file is reported and skipped.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # Give every file the same positional column labels (s0, s1, ...).
        df.columns = [f's{i}' for i in range(len(df.columns))]

        # Prepare dataset from windows
        X = prepare_dataset_from_windows(df, window_size)
        if len(X) == 0:
            # An empty result has no feature columns and would make the
            # np.vstack below fail on a shape mismatch.
            print(f"Skipping {file_path}: fewer than {window_size} rows.")
            continue

        all_X.append(X)
        all_y.extend([state] * len(X))

    if not all_X:
        print("No data to process.")
        return None, None

    # Combine data from all files
    return np.vstack(all_X), np.array(all_y)

def main():
    """Evaluate the saved random-forest and SVM models side by side.

    Loads both pickled models, extracts window features from all ``*.csv``
    files in the working directory, standardises them, and prints for each
    model its 5-fold cross-validation scores, a classification report on
    the full data set and the distribution of its predictions.
    """
    # Load the models
    rf_model = joblib.load('best_random_forest_model.pkl')
    svm_model = joblib.load('best_svm_model.pkl')
    print("Models loaded from 'best_random_forest_model.pkl' and 'best_svm_model.pkl'")

    # Load and process the data
    file_paths = glob.glob('*.csv')  # Adjust the path as necessary
    window_size = 89  # Set the window size to 89 or 90 based on your preference

    all_X, all_y = load_and_process_files(file_paths, window_size)

    if all_X is None or all_y is None:
        print("No data to process.")
        return

    print("Feature extraction and dataset preparation completed.")
    print(f"Total samples: {len(all_y)}")

    # Standardise the features.
    # NOTE(review): this scaler is fitted on the evaluation data itself, not
    # restored from training -- confirm this matches how the models were built.
    scaler = StandardScaler()
    all_X = scaler.fit_transform(all_X)

    # Cross-validated score of the random-forest estimator
    rf_cv_scores = cross_val_score(rf_model, all_X, all_y, cv=5)
    print(f"Random Forest Cross-validation scores: {rf_cv_scores}")
    print(f"Random Forest Mean cross-validation score: {np.mean(rf_cv_scores)}")

    # Cross-validated score of the SVM estimator
    svm_cv_scores = cross_val_score(svm_model, all_X, all_y, cv=5)
    print(f"SVM Cross-validation scores: {svm_cv_scores}")
    print(f"SVM Mean cross-validation score: {np.mean(svm_cv_scores)}")

    # Predict with the loaded models
    y_pred_rf = rf_model.predict(all_X)
    y_pred_svm = svm_model.predict(all_X)

    # Report performance.  The dict-form reports that used to be computed
    # here were never read, so only the printable reports are produced.
    print("Random Forest Classification Report:")
    print(classification_report(all_y, y_pred_rf, zero_division=0))

    print("SVM Classification Report:")
    print(classification_report(all_y, y_pred_svm, zero_division=0))

    # Print the distribution of the predicted labels
    unique_rf, counts_rf = np.unique(y_pred_rf, return_counts=True)
    prediction_distribution_rf = dict(zip(unique_rf, counts_rf))
    print(f"Random Forest Prediction distribution: {prediction_distribution_rf}")

    unique_svm, counts_svm = np.unique(y_pred_svm, return_counts=True)
    prediction_distribution_svm = dict(zip(unique_svm, counts_svm))
    print(f"SVM Prediction distribution: {prediction_distribution_svm}")

# 运行主函数
main()


下面详细解释分类报告中的各个参数及其意义。

### 分类报告参数解释

分类报告是评估分类模型性能的工具，主要包括以下几个指标：

1. **Precision（精度）**：
   - 定义：精度是正确预测的正样本数占所有预测为正样本的样本数的比例。
   - 计算公式：`Precision = TP / (TP + FP)`，其中TP为真阳性，FP为假阳性。
   - 意义：精度反映了模型在预测正样本时的准确性，即模型预测为正的样本中有多少是正确的。

2. **Recall（召回率）**：
   - 定义：召回率是正确预测的正样本数占所有实际为正样本的样本数的比例。
   - 计算公式：`Recall = TP / (TP + FN)`，其中FN为假阴性。
   - 意义：召回率反映了模型对正样本的识别能力，即模型能够识别出多少实际为正的样本。

3. **F1-Score（F1分数）**：
   - 定义：F1分数是精度和召回率的调和平均数。
   - 计算公式：`F1-Score = 2 * (Precision * Recall) / (Precision + Recall)`
   - 意义：F1分数综合了精度和召回率，是模型在精度和召回率之间的一种平衡。

4. **Support（支持度）**：
   - 定义：支持度是每个类别在测试集真实标签（`y_true`）中出现的次数。
   - 意义：支持度反映了每个类别在测试集中的样本数量。

5. **Accuracy（准确率）**：
   - 定义：准确率是正确预测的样本数占所有样本数的比例。
   - 计算公式：`Accuracy = 正确预测的样本数 / 样本总数`（二分类时等价于 `(TP + TN) / (TP + TN + FP + FN)`）
   - 意义：准确率反映了模型总体的正确率。

6. **Macro Avg（宏平均）**：
   - 定义：宏平均是所有类别精度、召回率和F1分数的算术平均值。
   - 意义：宏平均对每个类别给予相同的权重，不考虑类别的不平衡。

7. **Weighted Avg（加权平均）**：
   - 定义：加权平均是所有类别精度、召回率和F1分数的加权平均值，权重为每个类别的支持度。
   - 意义：加权平均考虑了类别的不平衡，更能反映整体性能。

### 示例分类报告

```plaintext
              precision    recall  f1-score   support

        down       1.00      0.99      0.99       100
        left       0.98      1.00      0.99       100
       right       1.00      0.99      0.99       100
          up       1.00      1.00      1.00       100

    accuracy                           0.99       400
   macro avg       1.00      0.99      1.00       400
weighted avg       1.00      0.99      1.00       400
```

### 解读示例分类报告

1. **down** 类别：
   - 精度：1.00
     - 模型预测为“down”的样本中，100%是正确的。
   - 召回率：0.99
     - 实际为“down”的样本中，99%被模型正确识别。
   - F1分数：0.99
     - 精度和召回率的调和平均数为0.99。
   - 支持度：100
     - 测试集中“down”样本的数量为100。

2. **left** 类别：
   - 精度：0.98
     - 模型预测为“left”的样本中，98%是正确的。
   - 召回率：1.00
     - 实际为“left”的样本中，100%被模型正确识别。
   - F1分数：0.99
     - 精度和召回率的调和平均数为0.99。
   - 支持度：100
     - 测试集中“left”样本的数量为100。

3. **overall accuracy**：
   - 准确率：0.99
     - 总体上，模型在400个样本中有99%的预测是正确的。

4. **macro avg**：
   - 宏平均：
     - 精度：1.00
     - 召回率：0.99
     - F1分数：1.00
   - 宏平均值反映了所有类别的总体表现。

5. **weighted avg**：
   - 加权平均：
     - 精度：1.00
     - 召回率：0.99
     - F1分数：1.00
   - 加权平均值反映了考虑类别不平衡后的总体表现。

这些指标可以帮助你全面了解模型的性能，识别模型在特定类别上的强项和弱项，并做出相应的调整和改进。如果有任何问题或需要进一步的解释，请告诉我！

## 制作相似数据，测试SVM

In [32]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler

def analyze_data(file_path):
    """Return the per-column mean and standard deviation of a CSV file.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: One row per column summarised by
        ``DataFrame.describe``, with the columns ``mean`` and ``std``.
    """
    summary = pd.read_csv(file_path).describe().T
    return summary.loc[:, ['mean', 'std']]

def generate_synthetic_data(stats, num_samples=89):
    """Draw Gaussian samples per column from a mean/std table.

    Args:
        stats: DataFrame indexed by column name with ``mean`` and ``std``
            columns.
        num_samples: Number of rows to generate.

    Returns:
        pandas.DataFrame: ``num_samples`` rows and one column per entry of
        ``stats.index``, each drawn with ``np.random.normal``.
    """
    columns = {
        name: np.random.normal(
            stats.loc[name, 'mean'], stats.loc[name, 'std'], num_samples
        )
        for name in stats.index
    }
    return pd.DataFrame(columns)

def extract_features_from_window(window):
    """Turn one window of sensor samples into a flat feature vector.

    Twelve values are appended per column, in this order: max, min, mean
    and std of the "pulse" samples (samples above the column mean, with the
    last one dropped), the pulse frequency, then energy, mean, std,
    variance, RMS, skewness and kurtosis of the whole column.

    Args:
        window: DataFrame with one column per sensor channel and one row
            per sample.

    Returns:
        list: ``12 * len(window.columns)`` numeric features.

    Raises:
        ZeroDivisionError: If ``window`` has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        # "Pulses" are the samples lying above the column mean.
        pulses = sensor_data[sensor_data > sensor_data.mean()]

        # The last pulse sample is dropped on purpose: the original code did
        # so to line up with a per-pulse duration/area computation whose
        # result was never used (removed here as dead code).  The slice is
        # kept so the produced feature values stay exactly the same.
        # NOTE(review): presumably the saved models were trained on these
        # exact features -- confirm before changing the slice.
        heights = pulses.values[:-1]

        if len(heights) > 0:
            max_height = np.max(heights)
            min_height = np.min(heights)
            mean_height = np.mean(heights)
            std_height = np.std(heights)
        else:
            # No usable pulse samples: fall back to zeros.
            max_height = min_height = mean_height = std_height = 0
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics.
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)  # Mean value
        std = np.std(sensor_data)  # Standard deviation
        var = np.var(sensor_data)  # Variance
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()  # Skewness
        kurtosis = sensor_data.kurt()  # Kurtosis

        features.extend([
            max_height, min_height, mean_height, std_height, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def main():
    """Classify one synthetic window modelled on ``left_processed.csv``.

    Generates a synthetic window from the file's per-column mean/std,
    extracts its features, standardises them and prints the prediction of
    the saved SVM model.
    """
    # Summarise the reference file and synthesise a look-alike window
    synthetic_data = generate_synthetic_data(analyze_data('left_processed.csv'))
    print("Generated synthetic data:")
    print(synthetic_data)

    # Load the model
    model = joblib.load('best_svm_model.pkl')
    print("Model loaded from 'best_svm_model.pkl'")

    # Extract the features as a single-row 2-D array
    features = np.array([extract_features_from_window(synthetic_data)])

    # Standardise the features.
    # NOTE(review): the scaler is fitted on this single sample, so every
    # feature is centred on itself and becomes 0 before prediction; the
    # scaler fitted at training time is presumably what should be applied
    # here -- confirm and persist it next to the model.
    features = StandardScaler().fit_transform(features)

    # Predict and print the result for the single sample
    prediction = model.predict(features)
    print(f"Prediction: {prediction[0]}")

# 运行主函数
main()


In [68]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame and return it."""
    return pd.read_csv(file_path)

def extract_features_from_window(window):
    """Turn one window of sensor samples into a flat feature vector.

    Twelve values are appended per column, in this order: max, min, mean
    and std of the "pulse" samples (samples above the column mean), the
    pulse frequency, then energy, mean, std, variance, RMS, skewness and
    kurtosis of the whole column.  When a column has more than one pulse
    sample the last one is left out of the pulse statistics.

    Args:
        window: DataFrame with one column per sensor channel and one row
            per sample.

    Returns:
        list: ``12 * len(window.columns)`` numeric features.

    Raises:
        ZeroDivisionError: If ``window`` has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        # "Pulses" are the samples lying above the column mean.
        pulses = sensor_data[sensor_data > sensor_data.mean()]
        heights = pulses.values

        # The last pulse sample is dropped (unless it is the only one).  The
        # original code did so to line up with a per-pulse duration/area
        # computation whose result was never used (removed here as dead
        # code); the slice is kept so the feature values stay the same.
        # NOTE(review): presumably the saved models were trained on these
        # exact features -- confirm before changing the slice.
        if len(heights) > 1:
            heights = heights[:-1]

        if len(heights) > 0:
            max_height = np.max(heights)
            min_height = np.min(heights)
            mean_height = np.mean(heights)
            std_height = np.std(heights)
        else:
            # No pulse samples at all: fall back to zeros.
            max_height = min_height = mean_height = std_height = 0
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics.
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)  # Mean value
        std = np.std(sensor_data)  # Standard deviation
        var = np.var(sensor_data)  # Variance
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()  # Skewness
        kurtosis = sensor_data.kurt()  # Kurtosis

        features.extend([
            max_height, min_height, mean_height, std_height, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def extract_features_from_data(df, window_size=89):
    """Cut ``df`` into back-to-back windows and featurise each one.

    Windows start every ``window_size`` rows and do not overlap; trailing
    rows that cannot fill a whole window are ignored.

    Args:
        df: DataFrame of sensor samples, one row per sample.
        window_size: Number of rows per window.

    Returns:
        numpy.ndarray: One row of features per complete window (an empty
        array when there is no complete window).
    """
    window_starts = range(0, len(df) - window_size + 1, window_size)
    feature_rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in window_starts
    ]
    return np.array(feature_rows)

def main():
    """Classify the windows of ``left_processed.csv`` with the saved SVM.

    Prints how often each state was predicted and a classification report
    against the true state, which is taken from the file-name prefix.
    """
    # Load the model
    model = joblib.load('best_svm_model.pkl')
    print("Model loaded from 'best_svm_model.pkl'")

    # Load the data file.  Its true state is the part of the file name before
    # the first underscore, the naming convention used elsewhere in this file.
    file_path = 'left_processed.csv'
    true_state = file_path.split('/')[-1].split('_')[0]
    df = load_data(file_path)

    # Extract the features
    features = extract_features_from_data(df)
    if len(features) == 0:
        # Too few rows for a single window: nothing to scale or classify.
        print("No data to process.")
        return

    # Standardise the features.
    # NOTE(review): this scaler is fitted on the evaluation data itself, not
    # restored from training -- confirm this matches how the model was built.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    # Predict
    predictions = model.predict(features)

    # Count how often each state was predicted
    unique, counts = np.unique(predictions, return_counts=True)
    prediction_counts = dict(zip(unique, counts))

    # Print the prediction counts
    print("Prediction counts:")
    for state, count in prediction_counts.items():
        print(f"{state}: {count}")

    # Print the classification report against the true label.  The report
    # used to compare the predictions with themselves, which always looks
    # perfect regardless of what the model predicted.
    y_true = [true_state] * len(predictions)
    print("Classification Report:")
    print(classification_report(y_true, predictions, zero_division=0))

# 运行主函数
main()


In [69]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import glob

def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame and return it."""
    return pd.read_csv(file_path)

def extract_features_from_window(window):
    """Turn one window of sensor samples into a flat feature vector.

    Twelve values are appended per column, in this order: max, min, mean
    and std of the "pulse" samples (samples above the column mean), the
    pulse frequency, then energy, mean, std, variance, RMS, skewness and
    kurtosis of the whole column.  When a column has more than one pulse
    sample the last one is left out of the pulse statistics.

    Args:
        window: DataFrame with one column per sensor channel and one row
            per sample.

    Returns:
        list: ``12 * len(window.columns)`` numeric features.

    Raises:
        ZeroDivisionError: If ``window`` has no rows.
    """
    features = []
    for column in window.columns:
        sensor_data = window[column]

        # "Pulses" are the samples lying above the column mean.
        pulses = sensor_data[sensor_data > sensor_data.mean()]
        heights = pulses.values

        # The last pulse sample is dropped (unless it is the only one).  The
        # original code did so to line up with a per-pulse duration/area
        # computation whose result was never used (removed here as dead
        # code); the slice is kept so the feature values stay the same.
        # NOTE(review): presumably the saved models were trained on these
        # exact features -- confirm before changing the slice.
        if len(heights) > 1:
            heights = heights[:-1]

        if len(heights) > 0:
            max_height = np.max(heights)
            min_height = np.min(heights)
            mean_height = np.mean(heights)
            std_height = np.std(heights)
        else:
            # No pulse samples at all: fall back to zeros.
            max_height = min_height = mean_height = std_height = 0
        pulse_frequency = len(heights) / len(sensor_data)

        # Whole-window statistics.
        energy = np.sum(sensor_data ** 2)  # Signal energy
        mean = np.mean(sensor_data)  # Mean value
        std = np.std(sensor_data)  # Standard deviation
        var = np.var(sensor_data)  # Variance
        rms = np.sqrt(np.mean(sensor_data ** 2))  # Root mean square
        skewness = sensor_data.skew()  # Skewness
        kurtosis = sensor_data.kurt()  # Kurtosis

        features.extend([
            max_height, min_height, mean_height, std_height, pulse_frequency,
            energy, mean, std, var, rms, skewness, kurtosis
        ])

    return features

def extract_features_from_data(df, window_size=89):
    """Cut ``df`` into back-to-back windows and featurise each one.

    Windows start every ``window_size`` rows and do not overlap; trailing
    rows that cannot fill a whole window are ignored.

    Args:
        df: DataFrame of sensor samples, one row per sample.
        window_size: Number of rows per window.

    Returns:
        numpy.ndarray: One row of features per complete window (an empty
        array when there is no complete window).
    """
    window_starts = range(0, len(df) - window_size + 1, window_size)
    feature_rows = [
        extract_features_from_window(df.iloc[begin:begin + window_size])
        for begin in window_starts
    ]
    return np.array(feature_rows)

def main():
    """Evaluate the saved random-forest and SVM models on the processed CSVs.

    Every file matching ``*_processed.csv`` in the working directory is
    loaded, cut into windows, featurised, standardised and classified by
    both models. The true label of a file is the part of its name before
    the first underscore. Per-class prediction counts and a classification
    report are printed for each model.

    Files that are too short to yield a single full window are skipped with
    a message, and nothing is reported when no windows were extracted at all.
    """
    import os  # local import: only needed here for portable basename handling

    # Load the previously trained models
    rf_model = joblib.load('best_random_forest_model.pkl')
    svm_model = joblib.load('best_svm_model.pkl')
    print("Models loaded from 'best_random_forest_model.pkl' and 'best_svm_model.pkl'")

    # Collect the paths of all data files
    file_paths = glob.glob('*_processed.csv')

    all_predictions_rf = []
    all_predictions_svm = []
    all_labels = []

    for file_path in file_paths:
        # The state label is the file-name prefix before the first underscore
        state = os.path.basename(file_path).split('_')[0]

        # Load the file and extract one feature row per window
        df = load_data(file_path)
        features = extract_features_from_data(df)

        # A file shorter than one window produces an empty 1-D array, which
        # the scaler and the models cannot handle; skip it instead of crashing.
        if features.size == 0:
            print(f"Skipping {file_path}: not enough rows for one full window")
            continue

        # Standardise the features.
        # NOTE(review): a new scaler is fitted on each file at prediction
        # time, so the scaling depends on the file being evaluated. If the
        # models were trained on features scaled by a fitted scaler, that
        # same scaler should be persisted and reused here - TODO confirm.
        scaler = StandardScaler()
        features = scaler.fit_transform(features)

        # Predict with the random-forest model
        predictions_rf = rf_model.predict(features)
        all_predictions_rf.extend(predictions_rf)

        # Predict with the SVM model
        predictions_svm = svm_model.predict(features)
        all_predictions_svm.extend(predictions_svm)

        # Record the true label once per window
        all_labels.extend([state] * len(predictions_rf))

    if not all_labels:
        # Nothing was classified, so there is nothing to count or report.
        print("No windows were extracted from any file; nothing to evaluate.")
        return

    # Count how often each state was predicted (random forest)
    unique_rf, counts_rf = np.unique(all_predictions_rf, return_counts=True)
    prediction_counts_rf = dict(zip(unique_rf, counts_rf))

    # Count how often each state was predicted (SVM)
    unique_svm, counts_svm = np.unique(all_predictions_svm, return_counts=True)
    prediction_counts_svm = dict(zip(unique_svm, counts_svm))

    # Print the prediction statistics
    print("Random Forest Prediction counts:")
    for state, count in prediction_counts_rf.items():
        print(f"{state}: {count}")

    print("SVM Prediction counts:")
    for state, count in prediction_counts_svm.items():
        print(f"{state}: {count}")

    # Print the classification reports
    print("Random Forest Classification Report:")
    print(classification_report(all_labels, all_predictions_rf, zero_division=0))

    print("SVM Classification Report:")
    print(classification_report(all_labels, all_predictions_svm, zero_division=0))
# Run the main function
main()


## CNN classification

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

def load_data(file_path):
    """Parse the CSV file ``file_path`` with pandas and return the DataFrame."""
    return pd.read_csv(file_path)

def extract_features_from_data(df, window_size=89):
    """Cut ``df`` into consecutive, non-overlapping windows of raw values.

    Each window holds ``window_size`` rows; trailing rows that do not fill a
    whole window are dropped.

    Returns:
        ``np.array`` stacking the ``.values`` of every window (empty when no
        full window fits).
    """
    window_starts = range(0, len(df) - window_size + 1, window_size)
    windows = [df.iloc[begin:begin + window_size].values for begin in window_starts]
    return np.array(windows)

def load_and_process_files(file_paths, window_size=89):
    """Load several CSV files and return their windows with per-window labels.

    The label of a file is the part of its file name before the first
    underscore; every window cut from that file receives this label.

    Args:
        file_paths: Iterable of CSV paths to load.
        window_size: Number of rows per window.

    Returns:
        Tuple ``(all_X, all_y)``: the windows of all files stacked along the
        first axis, and an array with one label per window.

    Raises:
        ValueError: If no file yields at least one complete window (this
            includes an empty ``file_paths``).
    """
    import os  # local import: only needed here for portable basename handling

    all_X = []
    all_y = []

    for file_path in file_paths:
        # Extract state from the file name
        state = os.path.basename(file_path).split('_')[0]

        # Load the CSV file and cut it into windows
        df = load_data(file_path)
        X = extract_features_from_data(df, window_size)

        # A file shorter than one window gives an empty 1-D array whose
        # shape cannot be stacked with real windows; leave it out.
        if len(X) == 0:
            print(f"Skipping {file_path}: not enough rows for one full window")
            continue

        all_X.append(X)
        all_y.extend([state] * len(X))

    if not all_X:
        raise ValueError("No complete windows could be extracted from the given files")

    # Combine data from all files
    return np.vstack(all_X), np.array(all_y)

# Convert state labels into integer class codes
def encode_labels(labels):
    """Map each label to the position of its value among the sorted unique labels.

    Returns:
        Tuple ``(codes, label_to_index)``: an array with one integer code per
        entry of ``labels``, and the dict translating each unique label to
        its code.
    """
    classes = np.unique(labels)
    label_to_index = dict(zip(classes, range(len(classes))))
    codes = np.array([label_to_index[item] for item in labels])
    return codes, label_to_index

def main():
    """Train, evaluate and save a 1-D CNN on windows of the processed CSVs.

    Steps: load every ``*_processed.csv`` in the working directory, cut it
    into windows, encode the file-name labels as one-hot vectors, split into
    train and test sets, standardise using statistics of the training set
    only, fit the network with early stopping, print the test accuracy, save
    the model to ``cnn_model.h5`` and print a classification report.
    """
    # Imported locally so this cell does not rely on names that were only
    # imported by an earlier notebook cell.
    import glob
    from sklearn.metrics import classification_report

    # Paths of the data files
    file_paths = glob.glob('*_processed.csv')  # Adjust the path as necessary
    window_size = 89  # Set the window size to 89 or 90 based on your preference

    # Load and window the files
    all_X, all_y = load_and_process_files(file_paths, window_size)

    # Encode the state labels as integers, then one-hot
    all_y, label_to_index = encode_labels(all_y)
    all_y = to_categorical(all_y)

    # Take the channel count from the data instead of assuming 4 columns.
    n_channels = all_X.shape[2]
    flat_width = window_size * n_channels

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=0.2, random_state=42)

    # Standardise the features. The scaler is fitted on the training split
    # only, so no test-set statistics leak into training.
    # NOTE(review): the fitted scaler is not saved with the model; anyone
    # reusing 'cnn_model.h5' needs the same scaling - TODO confirm/persist.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.reshape(-1, flat_width)).reshape(-1, window_size, n_channels)
    X_test = scaler.transform(X_test.reshape(-1, flat_width)).reshape(-1, window_size, n_channels)

    # Build the CNN model
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(window_size, n_channels)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(128, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(label_to_index), activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    # NOTE(review): the test split doubles as the validation set for early
    # stopping, so the accuracy reported below is measured on data that
    # influenced which weights were kept.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32, callbacks=[early_stopping])

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {accuracy}")

    # Save the model
    model.save('cnn_model.h5')
    print("Model saved as 'cnn_model.h5'")

    # Print the classification report
    y_pred = model.predict(X_test)
    y_pred_labels = np.argmax(y_pred, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)

    # Pass the full list of class ids with their names so the report still
    # works when a class happens to be absent from the test split.
    class_ids = list(range(len(label_to_index)))
    class_names = [str(name) for name in label_to_index]

    print("Classification Report:")
    print(classification_report(
        y_test_labels,
        y_pred_labels,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    ))

# Run the main function
main()
