In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
from collections import Counter
import re

In [35]:
def hex_to_int(hex_value):
    """Parse a base-16 string into an integer.

    ``int(..., 16)`` already accepts an optional ``0x``/``0X`` prefix and
    surrounding whitespace, so no manual prefix handling is needed.

    Args:
        hex_value: Text holding a hexadecimal number, e.g. ``"0x1A"`` or ``"ff"``.

    Returns:
        The parsed integer, or ``None`` when the value cannot be parsed:
        empty or malformed text raises ``ValueError``, while ``None`` and
        numeric inputs raise ``TypeError`` (an explicit base requires text).
    """
    try:
        parsed = int(hex_value, 16)
    except (TypeError, ValueError):
        parsed = None
    return parsed
    
def extract_filename(filepath):
    """Return the part of ``filepath`` that follows its last ``/``.

    A path without any ``/`` is returned unchanged, and a path that ends in
    ``/`` yields an empty string.
    """
    _, _, filename = filepath.rpartition('/')
    return filename

def count_imports(imports):
    """Count the import names in a ``' | '``-separated string.

    A fragment is counted only if something remains after removing stray
    ``|`` characters and surrounding whitespace, so doubled or trailing
    separators and blank fragments do not inflate the total.

    Args:
        imports: Import names joined by ``' | '``.

    Returns:
        Number of non-blank import entries.
    """
    return sum(
        1
        for fragment in imports.split(' | ')
        # Ignore fragments made up solely of whitespace and/or separators.
        if fragment.replace('|', '').strip()
    )

def parse_imports(imports):
    """Split a ``' | '``-separated import string into normalized names.

    Every name is stripped and upper-cased. A fragment that still starts with
    a stray ``|`` (left behind by a doubled separator) has its pipes removed,
    and fragments that end up empty are dropped. Names containing a
    single-digit ``N-N-N.DLL`` version triple have the triple replaced by
    ``X-X-X`` so that different versions of the same DLL share one name.

    Args:
        imports: Import names joined by ``' | '``.

    Returns:
        List of cleaned, upper-cased import names in their original order.
    """
    imports_list = []
    for fragment in imports.split(' | '):
        name = fragment.strip().upper()
        if name.startswith("|"):
            # "A | | B" splits into ["A", "| B"]; remove the leftover separator.
            name = name.replace("|", "").strip()
        if not name:
            # Blank or separator-only fragment: there is no import to record.
            continue
        # The dot has to be escaped: an unescaped "." followed by "\D" would
        # match any character plus any non-digit instead of the literal ".D".
        # re.sub returns the name unchanged when the pattern does not match.
        imports_list.append(re.sub(r"(.*)\d-\d-\d\.DLL", r"\1X-X-X.DLL", name))
    return imports_list

In [None]:

# Load the malware samples and label every row as malware
malware_file_path = '../result/malware32.csv'
df_malware = pd.read_csv(malware_file_path)
df_malware['disposition'] = 'malware'

# Load the benign samples and label every row as goodware
bening_file_path = '../result/good32.csv'
df_goodware = pd.read_csv(bening_file_path)
df_goodware['disposition'] = 'goodware'

# Stack the two DataFrames into one with a fresh 0..n-1 index
df = pd.concat([df_malware, df_goodware], ignore_index=True)

# Show the size and the first few rows of the combined dataset.
# Only the last expression of a cell is rendered automatically, so the
# preview has to be printed explicitly to appear under its heading.
print(f"Amount of Rows {df.shape[0]}")
print("First few rows of the dataset:")
print(df.head())

# Handling missing values: drop every row that has a missing value in any column
df = df.dropna()
print(f"Amount of Rows {df.shape[0]} after drop missing values")

# Report how many fully identical rows exist
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Remove the duplicate rows (the first occurrence is kept)
df = df.drop_duplicates()
print(f"Amount of Rows {df.shape[0]} After dropping duplicates")

# Convert column names to lowercase so later lookups use one spelling
df.columns = df.columns.str.lower()

# Remove leading/trailing whitespace from the values of object-dtype columns
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

#df = df['file_name'].apply(extract_filename)

# Columns converted from hexadecimal text to integers
hex_columns = [
    'table_pointer', 'size_of_uninitialized_data', 'address_of_entry_point',
    'base_of_code', 'image_base'
]

# hex_to_int returns None for values it cannot parse, so these columns can
# contain missing values again even though dropna() ran earlier.
for col in hex_columns:
    df[col] = df[col].apply(hex_to_int)

# Derived features: number of imports and the normalized list of import names
df['import_count'] = df['import_directory'].apply(count_imports)
df['import_directory_list'] = df['import_directory'].apply(parse_imports)


In [None]:
# Print the schema summary: column names, non-null counts, dtypes, memory usage
df.info()
# Spot-check a single record: keep the second row (position 1) in `row` and print it
row = df.iloc[1]
print(row)

In [38]:
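# Disabled experiment: one-hot encode the 200 most frequent imports.
# # Step 1: Count occurrences of each item in the lists
# all_imports = [item for sublist in df['import_directory_list'] for item in sublist]
# item_counts = Counter(all_imports)

# # Step 2: Keep the names of the 200 most frequent imports
# top_items = [item for item, count in item_counts.most_common(200)]
# # Step 3: Reduce each row's import list to those frequent imports
# df['filtered_imports'] = df['import_directory_list'].apply(lambda x: [item for item in x if item in top_items])

# # Step 4: One-Hot Encode the filtered lists
# mlb = MultiLabelBinarizer()
# one_hot_encoded = mlb.fit_transform(df['filtered_imports'])

# # Step 5: Create a DataFrame with meaningful column names
# one_hot_df = pd.DataFrame(one_hot_encoded, columns=[f'import_{item}' for item in mlb.classes_])

# # Step 6: Concatenate with the original DataFrame
# NOTE(review): df keeps its original index labels after dropna()/drop_duplicates(),
# while one_hot_df gets a fresh 0..n-1 index, so this axis=1 concat would align rows
# on mismatched labels. Build one_hot_df with index=df.index (or reset df's index)
# before re-enabling this cell.
# final_df = pd.concat([df, one_hot_df], axis=1)
# df = final_df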


In [None]:
# Destination of the cleaned dataset, relative to the notebook's working directory
parquet_file_path = 'filtered_dataset.parquet'

# Write the frame with the pyarrow engine; index=False leaves the row index out of the file
df.to_parquet(parquet_file_path, engine='pyarrow', index=False)

# Confirm where the file was written
print(f"DataFrame successfully saved to {parquet_file_path}")