In [94]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import os

# Read the file-metadata table used for feature engineering and training below
df = pd.read_csv(filepath_or_buffer="new_metadata.csv")

# Feature Engineering
def extract_features(df):
    """Derive model features and an integer-encoded target from file metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'File Name', 'File Path', 'File Type' and
        'Predicted Sensitivity'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the caller's frame is not modified) in which:
        - 'File Type' is lower-cased,
        - 'Path Contains Sensitive Keyword' and 'Name Contains Sensitive
          Keyword' hold 0/1 flags,
        - 'Predicted Sensitivity' is replaced by integer codes from a
          ``LabelEncoder``.

    Side effects
    ------------
    Prints the 'File Name' / 'Predicted Sensitivity' columns of every row whose
    file name contains "bank" (diagnostic output, shown before encoding).
    """
    # Work on a copy so the caller's DataFrame is never mutated in place.
    df = df.copy()

    # Diagnostic: show the raw labels of rows whose name mentions "bank".
    sensitive_files = df[df['File Name'].str.lower().str.contains("bank", na=False)]
    print(sensitive_files[['File Name', 'Predicted Sensitivity']])

    # Normalise the file type to lower case. Non-string cells (e.g. NaN) are
    # passed through unchanged instead of raising AttributeError.
    df['File Type'] = df['File Type'].apply(
        lambda value: value.lower() if isinstance(value, str) else value
    )

    # Substrings that mark a path or file name as likely sensitive.
    sensitive_keywords = ["payroll", "tax", "insurance", "contract", "agreement", "aadhar", "bank"]

    def _keyword_flag(text):
        """Return 1 if ``text`` is a string containing any sensitive keyword, else 0."""
        if not isinstance(text, str):
            # Missing / non-text cells cannot contain a keyword.
            return 0
        lowered = text.lower()
        return 1 if any(keyword in lowered for keyword in sensitive_keywords) else 0

    df['Path Contains Sensitive Keyword'] = df['File Path'].apply(_keyword_flag)
    df['Name Contains Sensitive Keyword'] = df['File Name'].apply(_keyword_flag)

    # Encode the target variable (Predicted Sensitivity) as integers.
    # NOTE(review): the fitted encoder is discarded, so the label <-> code
    # mapping is implicit. LabelEncoder orders classes by sorted value, which
    # presumably yields Non-Sensitive=0 / Sensitive=1 for this dataset -- confirm.
    le = LabelEncoder()
    df['Predicted Sensitivity'] = le.fit_transform(df['Predicted Sensitivity'])

    return df

# Run feature engineering (keyword flags, lower-cased file type, encoded target)
df = extract_features(df)

# Feature matrix: raw predictors with 'File Type' expanded into indicator columns.
# drop_first=True omits the indicator for the first file-type level.
feature_columns = [
    'File Type',
    'File Size (Bytes)',
    'Path Contains Sensitive Keyword',
    'Name Contains Sensitive Keyword',
]
X = pd.get_dummies(df[feature_columns], columns=['File Type'], drop_first=True)

# Target vector: the integer-encoded sensitivity label
y = df['Predicted Sensitivity']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a 100-tree random forest on the training portion
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision/recall breakdown
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:\n", report)

# Example function to predict sensitivity of a given file path
def predict_sensitivity(file_path, classifier=None, feature_columns=None):
    """Classify the file at ``file_path`` as "Sensitive" or "Non-Sensitive".

    The same features used for training are rebuilt for this single file:
    its size on disk, 0/1 flags for sensitive keywords in the full path and in
    the file name, and a one-hot indicator for its extension.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to an existing file. Backslashes are converted to forward slashes.
    classifier : object with ``predict(DataFrame)``, optional
        Model used for the prediction. Defaults to the notebook-level ``model``.
    feature_columns : sequence of str, optional
        Column names, in order, that the classifier was trained on. Defaults to
        ``X_train.columns``.

    Returns
    -------
    str
        "Sensitive" if the classifier predicts 1, otherwise "Non-Sensitive".
        Also "Non-Sensitive" (after printing a notice) when the file does not
        exist, because its size cannot be measured.
    """
    # Accept pathlib.Path as well as str, and normalise Windows separators.
    file_path = os.fspath(file_path).replace("\\", "/")

    # Resolve the defaults lazily so the notebook globals are only required
    # when the caller does not supply a model / column list explicitly.
    clf = model if classifier is None else classifier
    columns = X_train.columns if feature_columns is None else feature_columns

    file_name = file_path.split('/')[-1].lower()
    # Take the extension from the file name only, so dots in directory names
    # or an extension-less file can never be mistaken for a file type.
    file_extension = os.path.splitext(file_name)[1].lstrip('.')

    # Keyword flags, computed the same way as during training.
    sensitive_keywords = ["payroll", "tax", "insurance", "contract", "agreement", "aadhar", "bank"]
    lowered_path = file_path.lower()
    path_sensitive_keyword = 1 if any(keyword in lowered_path for keyword in sensitive_keywords) else 0
    name_sensitive_keyword = 1 if any(keyword in file_name for keyword in sensitive_keywords) else 0

    # File size is a model feature, so the file must actually exist.
    try:
        file_size = os.path.getsize(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return "Non-Sensitive"  # Default to non-sensitive if file is not found

    features = {
        'File Size (Bytes)': file_size,
        'Path Contains Sensitive Keyword': path_sensitive_keyword,
        'Name Contains Sensitive Keyword': name_sensitive_keyword,
    }
    if file_extension:
        # Named like the get_dummies columns; reindex below discards it when
        # the classifier has no column for this extension.
        features[f"File Type_{file_extension}"] = 1

    # Align with the training layout: same columns, same order, absent ones = 0.
    input_data = pd.DataFrame([features]).reindex(columns=columns, fill_value=0)

    sensitivity_prediction = clf.predict(input_data)
    # NOTE(review): treats code 1 as "Sensitive"; this relies on how the target
    # was label-encoded at training time -- confirm against the encoder.
    if sensitivity_prediction[0] == 1:
        return "Sensitive"
    return "Non-Sensitive"

                       File Name Predicted Sensitivity
9     bank_statement_2027_5.xlsx             Sensitive
16     bank_statement_2023_9.pdf             Sensitive
18    bank_statement_2029_10.jpg             Sensitive
84    bank_statement_2020_2.xlsx             Sensitive
128    bank_statement_2024_3.txt             Sensitive
133    bank_statement_2025_1.txt             Sensitive
137    bank_statement_2028_1.jpg             Sensitive
154  bank_statement_2020_12.docx             Sensitive
160   bank_statement_2029_12.jpg             Sensitive
175   bank_statement_2022_2.xlsx             Sensitive
187    bank_statement_2020_8.jpg             Sensitive
234   bank_statement_2030_1.docx             Sensitive
265    bank_statement_2026_5.jpg             Sensitive
269    bank_statement_2029_7.pdf             Sensitive
282   bank_statement_2026_11.txt             Sensitive
329    bank_statement_2020_3.txt             Sensitive
347  bank_statement_2024_10.xlsx             Sensitive
349    ban

In [95]:
# Try the predictor on a Windows-style path (raw string keeps the backslashes literal)
test_file_path = r"C:\Users\ASUS\OneDrive\Desktop\bank.docx"
predicted_label = predict_sensitivity(test_file_path)
print("Predicted Sensitivity for file:", predicted_label)

Predicted Sensitivity for file: Sensitive 


In [97]:
import pickle

# Persist the trained classifier to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() would leave the handle dangling.
# NOTE: pickle files can execute arbitrary code when loaded -- only unpickle
# model.pkl from a source you trust.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)