In [None]:
import os
import pandas as pd

def convert_csv_to_data(csv_file_path):
    # Load the CSV file into a DataFrame (no header assumed here)
    df = pd.read_csv(csv_file_path, header=None)  
    
    # Generate the output .data file path
    data_file_path = csv_file_path.replace('.csv', '.data')
    
    # Format columns based on conditions
    formatted_columns = []
    for col in df.columns:
        if col >= 1024 and col <= 1025:
            # Retain full precision for columns 44 to 59 (no formatting change)
            formatted_column = df[col]
        else:
            # Apply 3 decimal places for other columns
            formatted_column = df[col].map(lambda x: '%.3f' % x if pd.notnull(x) else '')
        
        formatted_columns.append(formatted_column)
    
    # Concatenate all formatted columns back into a DataFrame
    formatted_df = pd.concat(formatted_columns, axis=1)
    
    # Save the formatted DataFrame to a .data file
    formatted_df.to_csv(data_file_path, index=False, header=False, sep=',', quoting=None)
    
    print(f"Converted {csv_file_path} to {data_file_path}")

# Example usage
csv_file = "C:\\Notebooks\\rrl_source\\dataset_raw\\train_segment_W2V2_full.csv"  # Path to the CSV file
convert_csv_to_data(csv_file)

In [None]:
import os

# Define the directory path where you want to save the file
directory_path = "C:\\Notebooks\\rrl_source\\dataset\\"  # Change this to your desired path

# Ensure the directory exists
os.makedirs(directory_path, exist_ok=True)

# Define the full file path
file_path = os.path.join(directory_path, "train_segment_W2V2_full.INFO")

# Open the file in write mode
with open(file_path, "w") as file:
    # Write 1024 continuous features
    for i in range(1, 1025):
        file.write(f"{i} continuous\n")
    
    # Write the remaining lines
    file.write("class discrete\n")
    file.write("LABEL_POS -1\n")

print(f"{file_path} has been created successfully.")

In [1]:
import pandas as pd

# Path to the large CSV file
file_path = r"C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_1024.csv"

# Parameters for chunking
chunk_size = 150000  # Number of rows to read in each chunk, adjust if needed
output_size = 80000  # Total rows per output file
samples_per_label = output_size // 2  # Samples per label (40,000 for each label)

# Initialize counters
part_num = 1

# Create an iterator for reading the CSV file in chunks
csv_iterator = pd.read_csv(file_path, chunksize=chunk_size)

for chunk in csv_iterator:
    # Drop any rows with NaN in the 'Label' column (if present)
    chunk = chunk.dropna(subset=['Label'])
    
    # Separate data by label
    ones = chunk[chunk['Label'] == 1]
    zeros = chunk[chunk['Label'] == 0]
    
    # Check if we have enough samples for both labels
    if len(ones) < samples_per_label or len(zeros) < samples_per_label:
        continue  # Skip this chunk if it doesn't have enough samples
    
    # Sample 40,000 from each label to create a balanced set
    balanced_chunk = pd.concat([ones.sample(samples_per_label, random_state=42),
                                zeros.sample(samples_per_label, random_state=42)])
    
    # Shuffle the balanced data
    balanced_chunk = balanced_chunk.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Save to a new CSV file
    output_file = rf"C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_part{part_num}.csv"
    balanced_chunk.to_csv(output_file, index=False)
    print(f"Saved {output_file} with {len(balanced_chunk)} samples.")
    
    part_num += 1

print("Finished processing and saving parts.")

Saved C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_part1.csv with 80000 samples.
Saved C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_part2.csv with 80000 samples.
Saved C:\Notebooks\rrl_source\dataset_raw\train_segment_Wav2Vec2_part3.csv with 80000 samples.
Finished processing and saving parts.


In [6]:
import os
import pandas as pd

def convert_csv_to_data_and_info(csv_file_path):
    # Load CSV file, removing header if it exists
    df = pd.read_csv(csv_file_path, header=0)
    
    # Remove FILEID column if present
    if 'FILEID' in df.columns[0] or df.columns[0].lower() == 'fileid' or df.columns[0].lower() == 'FileID':
        df = df.iloc[:, 1:]
        
    # Remove polarity column if present in the last position
    if df.columns[-1].lower() in ['polarity', 'positive', 'negative']:
        df = df.iloc[:, :-1]
    
    # Generate the output .data file path
    data_file_path = csv_file_path.replace('.csv', '.data')
    
    # Format all feature columns to 3 decimal places, ensure label is integer (0 or 1)
    formatted_df = df.copy()
    for col in df.columns[:-1]:  # Apply formatting to all except the label column
        formatted_df[col] = df[col].map(lambda x: '%.3f' % x if pd.notnull(x) else '')
    
    # Ensure the label column is integer without floating point
    formatted_df.iloc[:, -1] = df.iloc[:, -1].astype(int)
    
    # Save the formatted DataFrame to a .data file
    formatted_df.to_csv(data_file_path, index=False, header=False, sep=',', float_format='%.3f')
    print(f"Converted {csv_file_path} to {data_file_path}")
    
    # Generate the .info file
    num_features = len(df.columns) - 1  # Subtract label column
    info_file_path = csv_file_path.replace('.csv', '.INFO')
    with open(info_file_path, "w") as file:
        # Write continuous types for all features
        for i in range(1, num_features + 1):
            file.write(f"{i} continuous\n")
        # Write the label type as discrete
        file.write("class discrete\n")
        file.write("LABEL_POS -1\n")
    
    print(f"{info_file_path} has been created successfully.")

# Example usage
csv_file = "C:\\Notebooks\\rrl_source\\Spectnet_model_Halftruth_embedding\\feature_emb_256_norm.csv"  # Path to the CSV file
convert_csv_to_data_and_info(csv_file)


Converted C:\Notebooks\rrl_source\Spectnet_model_Halftruth_embedding\feature_emb_256_norm.csv to C:\Notebooks\rrl_source\Spectnet_model_Halftruth_embedding\feature_emb_256_norm.data
C:\Notebooks\rrl_source\Spectnet_model_Halftruth_embedding\feature_emb_256_norm.INFO has been created successfully.


In [3]:
import os
import pandas as pd

# Function to process and convert the merged CSV to .data and .info files
def convert_csv_to_data_and_info(csv_file_path, output_prefix):
    # Load the merged CSV file
    df = pd.read_csv(csv_file_path, header=0)
    
    # Remove the FILEID column if present
    if df.columns[0].lower() in ['fileid', 'file_id']:
        print(f"Removing FILEID column: {df.columns[0]}")
        df = df.iloc[:, 1:]
        
    # Generate the output .data file path
    data_file_path = f"{output_prefix}.data"
    
    # Format all feature columns to 3 decimal places, ensure label is integer (0 or 1)
    formatted_df = df.copy()
    for col in df.columns[:-1]:  # Format all except the label column
        formatted_df[col] = df[col].map(lambda x: f"{x:.3f}" if pd.notnull(x) else '')
    
    # Ensure the label column is integer
    formatted_df.iloc[:, -1] = df.iloc[:, -1].astype(int)
    
    # Save the formatted DataFrame to a .data file
    formatted_df.to_csv(data_file_path, index=False, header=False, sep=',', float_format='%.3f')
    print(f"Converted merged data to {data_file_path}")
    
    # Generate the .info file
    num_features = len(df.columns) - 1  # Subtract label column
    info_file_path = f"{output_prefix}.INFO"
    with open(info_file_path, "w") as file:
        # Write continuous types for all features
        for i in range(1, num_features + 1):
            file.write(f"{i} continuous\n")
        # Write the label type as discrete
        file.write("class discrete\n")
        file.write("LABEL_POS -1\n")
    
    print(f"{info_file_path} has been created successfully.")

# Merging all cleaned parts
def merge_cleaned_csv_files(input_dir, output_prefix):
    # Find all cleaned CSV files (cleaned_part_1.csv to cleaned_part_12.csv)
    csv_files = [os.path.join(input_dir, f"extracted_train_segment_merged_part_{i}.csv") for i in range(1, 13)]
    
    # Verify that files exist
    existing_files = [file for file in csv_files if os.path.exists(file)]
    if not existing_files:
        raise FileNotFoundError("No cleaned CSV files found in the specified directory.")
    print(f"Found {len(existing_files)} files to merge: {existing_files}")
    
    # Read and merge all CSVs
    data_frames = [pd.read_csv(file) for file in existing_files]
    merged_df = pd.concat(data_frames, ignore_index=True)
    print(f"Merged data shape: {merged_df.shape}")
    
    # Save the merged file temporarily
    merged_csv_path = f"{output_prefix}_merged.csv"
    merged_df.to_csv(merged_csv_path, index=False)
    print(f"Merged data saved temporarily to {merged_csv_path}")
    
    # Process the merged CSV to generate .data and .info files
    convert_csv_to_data_and_info(merged_csv_path, output_prefix)

    # Remove the temporary merged CSV
    os.remove(merged_csv_path)
    print(f"Temporary merged CSV removed: {merged_csv_path}")

# Example usage
input_directory = r"F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings"  # Directory with cleaned parts
output_file_prefix = os.path.join(input_directory, "train_segment_specnet_emb")  # Prefix for .data and .info files

merge_cleaned_csv_files(input_directory, output_file_prefix)


Found 7 files to merge: ['F:\\Awais_data\\Datasets\\PartialSpoof\\Specnet\\train\\embeddings\\extracted_train_segment_merged_part_1.csv', 'F:\\Awais_data\\Datasets\\PartialSpoof\\Specnet\\train\\embeddings\\extracted_train_segment_merged_part_2.csv', 'F:\\Awais_data\\Datasets\\PartialSpoof\\Specnet\\train\\embeddings\\extracted_train_segment_merged_part_3.csv', 'F:\\Awais_data\\Datasets\\PartialSpoof\\Specnet\\train\\embeddings\\extracted_train_segment_merged_part_4.csv', 'F:\\Awais_data\\Datasets\\PartialSpoof\\Specnet\\train\\embeddings\\extracted_train_segment_merged_part_5.csv', 'F:\\Awais_data\\Datasets\\PartialSpoof\\Specnet\\train\\embeddings\\extracted_train_segment_merged_part_6.csv', 'F:\\Awais_data\\Datasets\\PartialSpoof\\Specnet\\train\\embeddings\\extracted_train_segment_merged_part_7.csv']
Merged data shape: (558407, 259)
Merged data saved temporarily to F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\train_segment_specnet_emb_merged.csv
Removing FILEID colu

In [5]:
# Define the file path
file_path = r"C:\Notebooks\rrl_source\dataset\train_segment_specnet_emb.data"

# Open the file, process, and save the updated content
with open(file_path, "r") as file:
    lines = file.readlines()

# Remove the leading comma from each line
updated_lines = [line.lstrip(',') for line in lines]

# Write the updated lines back to the file
with open(file_path, "w") as file:
    file.writelines(updated_lines)

print("Leading commas removed successfully!")


Leading commas removed successfully!


In [32]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler

save_path = r"C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\segment\\"
extract_utterance_level = False

# Load the features file
features_file = f"{save_path}train_{'utterance' if extract_utterance_level else 'segment'}_merged_part_1.csv"
data = pd.read_csv(features_file)

# Exclude the FileID column and separate features and labels
file_ids = data.iloc[:, 0]  # First column with audio names (FileID)
features = data.iloc[:, 1:-1]  # All columns except the first (FileID) and last (label)
labels = data.iloc[:, -1].astype(int)  # The last column is assumed to be the label

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Perform feature selection based on mutual information
mi_scores = mutual_info_classif(features_scaled, labels)
feature_importance = sorted(enumerate(mi_scores), key=lambda x: x[1], reverse=True)
selected_feature_indices = [idx for idx, score in feature_importance if score > 0.01]  # Adjust threshold as needed

# Extract effective features
effective_features = features.iloc[:, selected_feature_indices]

# Add the FileID and label columns back to the effective features
effective_features.insert(0, 'FileID', file_ids)  # Add FileID as the first column
effective_features['Label'] = labels.values  # Add label as the last column

# Save the effective features to a new CSV file
effective_features_save_name = f"{save_path}effective.csv"
effective_features.to_csv(effective_features_save_name, index=False)

print("Effective features extraction and saving completed.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  effective_features['Label'] = labels.values  # Add label as the last column


Effective features extraction and saving completed.


In [30]:

import pandas as pd

def balance_dataset(input_csv, output_csv='C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\segment\\train_segment_merged_part_1_balance.csv'):
    # Read the CSV file
    data = pd.read_csv(input_csv)
    
    # Check if the label column exists and identify the classes
    if 'Label' not in data.columns:
        raise ValueError("The CSV file must contain a 'label' column with 1 and 0 as classes.")
    
    # Separate the data based on class labels
    class_0 = data[data['Label'] == 0]
    class_1 = data[data['Label'] == 1]
    
    # Determine the smaller class size
    min_class_size = min(len(class_0), len(class_1))
    
    # Randomly sample to balance the classes
    balanced_class_0 = class_0.sample(n=min_class_size, random_state=42)
    balanced_class_1 = class_1.sample(n=min_class_size, random_state=42)
    
    # Concatenate the balanced classes
    balanced_data = pd.concat([balanced_class_0, balanced_class_1])
    
    # Shuffle the data to mix the classes
    balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Save the balanced data to a new CSV file
    balanced_data.to_csv(output_csv, index=False)
    print(f"Balanced dataset saved to {output_csv}")

# Usage example
balance_dataset('C:\\Notebooks\\rrl_source\\dataset_raw\\merge\\segment\\train_segment_merged_part_1.csv')  # Replace 'input.csv' with the path to your CSV file


Balanced dataset saved to C:\Notebooks\rrl_source\dataset_raw\merge\segment\train_segment_merged_part_1_balance.csv


data analysis

In [6]:
import os
import pandas as pd
import glob

# Define the directory where CSV files are stored
csv_dir =  r"F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings"

# Ensure the output directory exists
output_path = csv_dir  # Save in the same directory
os.makedirs(output_path, exist_ok=True)

# Read all CSV files
all_files = glob.glob(os.path.join(csv_dir, "*.csv"))
print(f"Found {len(all_files)} files to process.")

# Combine all CSVs into a single DataFrame
data_frames = []
for file in all_files:
    df = pd.read_csv(file)
    data_frames.append(df)
merged_data = pd.concat(data_frames, ignore_index=True)
print(f"Merged data shape: {merged_data.shape}")

# ---- Data Analysis ----
# Identify columns with all zero values
zero_columns = [col for col in merged_data.columns if (merged_data[col] == 0).all()]
print(f"Number of columns with all zero values: {len(zero_columns)}")
if zero_columns:
    print(f"Columns with all zero values: {zero_columns}")

# Check for duplicate columns (columns with identical values)
duplicate_columns = set()
for i, col1 in enumerate(merged_data.columns):
    for col2 in merged_data.columns[i + 1:]:
        if merged_data[col1].equals(merged_data[col2]):
            duplicate_columns.add(col2)
print(f"Number of duplicate columns: {len(duplicate_columns)}")
if duplicate_columns:
    print(f"Duplicate columns: {list(duplicate_columns)}")

# Identify duplicate rows based on all features except 'file_id' and 'label'
if "file_id" in merged_data.columns and "label" in merged_data.columns:
    duplicates = merged_data[merged_data.duplicated(subset=merged_data.columns.difference(["file_id", "label"]), keep=False)]
    print(f"Number of duplicate rows: {duplicates.shape[0]}")
    if not duplicates.empty:
        print("Sample duplicate rows (file_id and label):")
        print(duplicates[["file_id", "label"]].head(10))  # Display the first 10 duplicate rows
else:
    print("Columns 'file_id' or 'label' not found for duplicate detection.")

# Summary of data
print("Data summary:")
print(merged_data.describe())

# ---- Data Cleaning ----
# Remove columns with all zero values
non_zero_columns = merged_data.loc[:, (merged_data != 0).any(axis=0)]
cleaned_data = non_zero_columns

# Ensure the label column is at the end
if "label" in cleaned_data.columns:
    label_column = cleaned_data.pop("label")
    cleaned_data["label"] = label_column

print(f"Cleaned data shape: {cleaned_data.shape}")

# ---- Save Cleaned Data in Parts ----
part_size = 50000
for i in range(0, len(cleaned_data), part_size):
    part_data = cleaned_data.iloc[i:i + part_size]
    part_name = os.path.join(output_path, f"cleaned_part_{i // part_size + 1}.csv")
    part_data.to_csv(part_name, index=False)
    print(f"Saved {part_name}")

print("Data cleaning and saving completed!")


Found 7 files to process.
Merged data shape: (558407, 259)
Number of columns with all zero values: 179
Columns with all zero values: ['feature_0', 'feature_1', 'feature_2', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_12', 'feature_15', 'feature_17', 'feature_20', 'feature_21', 'feature_23', 'feature_24', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_40', 'feature_42', 'feature_44', 'feature_46', 'feature_48', 'feature_49', 'feature_50', 'feature_52', 'feature_54', 'feature_55', 'feature_57', 'feature_58', 'feature_59', 'feature_61', 'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66', 'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feature_71', 'feature_73', 'feature_74', 'feature_76', 'feature_77', 'feature_79', 'feature_80', 'feature_82', 'feature_83', 'feature_84', 'featur

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data["label"] = label_column


Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_1.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_2.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_3.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_4.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_5.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_6.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_7.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_8.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_9.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_10.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings\cleaned_part_11.csv
Saved F:\Awais_data\Datasets\PartialSpoof\Specnet\tr

In [None]:
import os
import pandas as pd
import glob
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler

# Define the directory where CSV files are stored
csv_dir = r"F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings"

# Ensure the output directory exists
output_path = csv_dir  # Save in the same directory
os.makedirs(output_path, exist_ok=True)

# Read all CSV files
all_files = glob.glob(os.path.join(csv_dir, "*.csv"))
print(f"Found {len(all_files)} files to process.")

# Combine all CSVs into a single DataFrame
data_frames = []
for file in all_files:
    df = pd.read_csv(file)
    data_frames.append(df)
merged_data = pd.concat(data_frames, ignore_index=True)
print(f"Merged data shape: {merged_data.shape}")

# ---- Data Analysis ----
# Identify columns with all zero values
zero_columns = [col for col in merged_data.columns if (merged_data[col] == 0).all()]
print(f"Number of columns with all zero values: {len(zero_columns)}")

# Drop zero-value columns
merged_data.drop(columns=zero_columns, inplace=True)

# Drop columns with NaN values
nan_columns = merged_data.columns[merged_data.isna().any()].tolist()
print(f"Number of columns with NaN values: {len(nan_columns)}")
merged_data.dropna(axis=1, inplace=True)

# Drop duplicate columns
duplicate_columns = set()
for i, col1 in enumerate(merged_data.columns):
    for col2 in merged_data.columns[i + 1:]:
        if merged_data[col1].equals(merged_data[col2]):
            duplicate_columns.add(col2)
print(f"Number of duplicate columns: {len(duplicate_columns)}")
merged_data.drop(columns=list(duplicate_columns), inplace=True)

# ---- Feature Selection ----
# Ensure file_id and label columns are present
if "fileid" not in merged_data.columns or "label" not in merged_data.columns:
    raise ValueError("Columns 'file_id' and 'label' must be present in the dataset.")

# Separate features, file_id, and label
file_ids = merged_data["fileid"]
labels = merged_data["label"]
features = merged_data.drop(columns=["fileid", "label"])

# Normalize features for mutual information computation
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)

# Compute mutual information
mutual_info = mutual_info_classif(normalized_features, labels, discrete_features=False)
feature_importance = pd.Series(mutual_info, index=features.columns).sort_values(ascending=False)
top_features = feature_importance.head(64).index.tolist()

print(f"Selected top 64 features based on mutual information: {top_features}")

# Keep only selected features along with file_id and label
selected_data = pd.concat([file_ids, features[top_features], labels], axis=1)

# ---- Save Cleaned Data ----
output_file = os.path.join(output_path, "selected_66d_data.csv")
selected_data.to_csv(output_file, index=False)
print(f"Saved selected features to {output_file}")


In [12]:
import os
import pandas as pd
import glob

# Define the directory where CSV files are stored
csv_dir = r"F:\Awais_data\Datasets\PartialSpoof\Specnet\train\embeddings"

# Ensure the output directory exists
output_path = csv_dir  # Save in the same directory
os.makedirs(output_path, exist_ok=True)

# Read all CSV files
all_files = glob.glob(os.path.join(csv_dir, "*.csv"))
print(f"Found {len(all_files)} files to process.")

# Combine all CSVs into a single DataFrame
data_frames = []
for file in all_files:
    df = pd.read_csv(file)
    data_frames.append(df)
merged_data = pd.concat(data_frames, ignore_index=True)
print(f"Merged data shape: {merged_data.shape}")

# ---- Data Analysis ----
# Identify and remove columns with all zero values
zero_columns = [col for col in merged_data.columns if (merged_data[col] == 0).all()]
print(f"Number of columns with all zero values: {len(zero_columns)}")
merged_data.drop(columns=zero_columns, inplace=True)

# Remove columns with NaN values
nan_columns = merged_data.columns[merged_data.isna().any()].tolist()
print(f"Number of columns with NaN values: {len(nan_columns)}")
merged_data.dropna(axis=1, inplace=True)

# Remove duplicate columns
duplicate_columns = set()
for i, col1 in enumerate(merged_data.columns):
    for col2 in merged_data.columns[i + 1:]:
        if merged_data[col1].equals(merged_data[col2]):
            duplicate_columns.add(col2)
print(f"Number of duplicate columns: {len(duplicate_columns)}")
merged_data.drop(columns=list(duplicate_columns), inplace=True)

# Keep the "file_id" and "label" columns for later
if "fileid" not in merged_data.columns or "label" not in merged_data.columns:
    raise ValueError("Columns 'fileid' and 'label' must be present in the dataset.")

file_id = merged_data.pop("fileid")
label = merged_data.pop("label")

# Select the top 64 distinct columns based on variance
variance = merged_data.var()
top_features = variance.nlargest(64).index
print(f"Selected top 64 features based on variance: {top_features.tolist()}")

# Combine the cleaned data
cleaned_data = pd.concat([file_id, merged_data[top_features], label], axis=1)
print(f"Cleaned data shape: {cleaned_data.shape}")

# ---- Save Cleaned Data in Parts ----
part_size = 50000
for i in range(0, len(cleaned_data), part_size):
    part_data = cleaned_data.iloc[i:i + part_size]
    part_name = os.path.join(output_path, f"cleaned_part_{i // part_size + 1}.csv")
    part_data.to_csv(part_name, index=False)
    print(f"Saved {part_name}")

print("Data cleaning and saving completed!")


Found 7 files to process.
Merged data shape: (558407, 259)
Number of columns with all zero values: 179
Number of columns with NaN values: 1
Number of duplicate columns: 0
Selected top 64 features based on variance: ['Unnamed: 242', 'Unnamed: 158', 'feature_100', 'Unnamed: 139', 'feature_75', 'Unnamed: 188', 'feature_92', 'Unnamed: 133', 'Unnamed: 138', 'Unnamed: 176', 'feature_72', 'feature_18', 'feature_22', 'Unnamed: 197', 'Unnamed: 228', 'feature_3', 'feature_127', 'feature_41', 'Unnamed: 150', 'feature_101', 'Unnamed: 256', 'Unnamed: 257', 'feature_94', 'Unnamed: 251', 'feature_105', 'feature_26', 'Unnamed: 160', 'feature_43', 'feature_60', 'feature_56', 'feature_121', 'Unnamed: 208', 'feature_11', 'feature_120', 'Unnamed: 229', 'Unnamed: 205', 'feature_47', 'Unnamed: 172', 'Unnamed: 224', 'feature_39', 'feature_96', 'Unnamed: 239', 'feature_78', 'feature_45', 'Unnamed: 235', 'feature_111', 'feature_53', 'Unnamed: 180', 'Unnamed: 194', 'Unnamed: 252', 'Unnamed: 143', 'Unnamed: 141'

data shuffle and normalization


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

# Input and output CSV file paths
input_csv = "C:\\Notebooks\\rrl_source\\Spectnet_model_Halftruth_embedding\\feature_emb_256.csv"  # Replace with your input file path
output_csv = "C:\\Notebooks\\rrl_source\\Spectnet_model_Halftruth_embedding\\normalized_shuffled_output.csv"  # Replace with your desired output file path

# Read the CSV file
df = pd.read_csv(input_csv)

# Separate the columns
file_ids = df.iloc[:, 0]  # File IDs (first column)
labels = df.iloc[:, -1]  # Labels (last column)
features = df.iloc[:, 1:-1]  # Feature columns

# Normalize features to [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
features_normalized = scaler.fit_transform(features)

# Create a new DataFrame with normalized features
normalized_df = pd.DataFrame(features_normalized, columns=features.columns)
normalized_df.insert(0, "FileID", file_ids)  # Add FileID as the first column
normalized_df["Label"] = labels  # Add Label as the last column

# Shuffle the DataFrame rows
shuffled_df = shuffle(normalized_df, random_state=42)

# Save the shuffled DataFrame to a new CSV file
shuffled_df.to_csv(output_csv, index=False)

print(f"Normalized and shuffled data saved to {output_csv}")


Normalized and shuffled data saved to C:\Notebooks\rrl_source\Spectnet_model_Halftruth_embedding\normalized_shuffled_output.csv
