In [None]:
# preprocessing.ipynb

# Data Preprocessing

# This notebook covers data loading, validation, and feature engineering for the developer-role classification task.

# Import Libraries
import pandas as pd
import numpy as np
import ast
import nltk
from datetime import datetime

# **Data Loading and Validation**

In [None]:
# Scikit-learn imports for potential preprocessing steps (though mostly handled in functions)
from sklearn.preprocessing import OneHotEncoder, RobustScaler


# Define a function to load and validate the dataset.
def load_and_validate_data(file_path):
    """Load the dataset from a CSV file and run basic validation checks.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.

    Raises
    ------
    AssertionError
        If the ``role`` or ``commitmessage`` column is absent.

    Side effects
    ------------
    Prints the frame's shape, column dtypes, and per-column missing-value
    counts.
    """
    # Read from the caller-supplied path rather than a hard-coded location.
    df = pd.read_csv(file_path)

    # Validation checks
    assert 'role' in df.columns, "Target column 'role' missing"
    assert 'commitmessage' in df.columns, "Commit message column missing"

    print(f"Dataset shape: {df.shape}")
    print("\nColumn types:")
    print(df.dtypes)

    print("\nMissing values:")
    print(df.isnull().sum())

    return df

# **Data Preprocessing Functions**

In [None]:
# Define functions for file extension processing and time feature extraction.
def preprocess_file_extensions(ext_str):
    """Convert a string representation of a list into an actual list.

    Parameters
    ----------
    ext_str : object
        Normally a string such as ``"['py', 'js']"``. Values that are
        already a list/tuple/set are accepted and returned as a list.

    Returns
    -------
    list
        The parsed extensions. A single string literal (``"'py'"``) is
        wrapped into a one-element list. Anything that cannot be parsed,
        or that parses to a non-sequence value (number, dict, None, ...),
        yields an empty list so downstream code can always iterate over
        the result.
    """
    # Already-parsed containers need no evaluation.
    if isinstance(ext_str, (list, tuple, set)):
        return list(ext_str)

    try:
        parsed = ast.literal_eval(ext_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input; a
        # non-string argument such as NaN also ends up here.
        return []

    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    if isinstance(parsed, str):
        # A lone extension literal is treated as a one-item list rather
        # than being iterated character by character later on.
        return [parsed]
    return []

def categorize_extensions(ext_list):
    """Count how many extensions fall into each coarse category.

    Each entry is lower-cased and stripped of dots before matching.
    Categories are tried in the order frontend, backend, config, test;
    anything unmatched is counted as ``other``.

    Parameters
    ----------
    ext_list : iterable of str
        File extensions, with or without dots.

    Returns
    -------
    dict
        Counts keyed by ``frontend``, ``backend``, ``config``, ``test``
        and ``other`` (in that order).
    """
    grouped = (
        ('frontend', ('js', 'css', 'html', 'vue', 'ts', 'jsx', 'tsx')),
        ('backend', ('py', 'java', 'php', 'rb', 'go', 'c', 'cpp', 'h')),
        ('config', ('json', 'yml', 'yaml', 'xml', 'config', 'properties')),
    )
    # Flatten into a direct extension -> category lookup; the groups do
    # not overlap, so lookup order cannot change the outcome.
    label_for = {name: label for label, names in grouped for name in names}

    # The dotted entries can never match exactly because dots are removed
    # below; such inputs are still caught by the substring check.
    test_markers = ('test', 'spec', 'test.js', 'test.py')

    counts = dict.fromkeys(('frontend', 'backend', 'config', 'test', 'other'), 0)

    for raw in ext_list:
        normalized = raw.lower().replace('.', '')
        label = label_for.get(normalized)
        if label is None:
            looks_like_test = normalized in test_markers or 'test' in normalized
            label = 'test' if looks_like_test else 'other'
        counts[label] += 1

    return counts

def extract_time_features(datetime_str):
    """Derive cyclical time-of-day and day-of-week features from a timestamp.

    Parameters
    ----------
    datetime_str : object
        Value parsed with ``pd.to_datetime(..., errors='coerce')``.

    Returns
    -------
    dict
        ``hour_sin``/``hour_cos`` (hour on a 24-step circle),
        ``day_sin``/``day_cos`` (weekday on a 7-step circle) and
        ``is_weekend`` (1 when ``weekday() >= 5``, else 0). An empty dict
        is returned when the value cannot be parsed.
    """
    parsed = pd.to_datetime(datetime_str, errors='coerce')
    if pd.isna(parsed):
        return {}

    day_index = parsed.weekday()

    # Map hour and weekday onto angles so that adjacent values across the
    # wrap-around (23h -> 0h, day 6 -> day 0) stay close together.
    hour_angle = 2 * np.pi * parsed.hour / 24
    day_angle = 2 * np.pi * day_index / 7

    features = {
        'hour_sin': np.sin(hour_angle),
        'hour_cos': np.cos(hour_angle),
        'day_sin': np.sin(day_angle),
        'day_cos': np.cos(day_angle),
    }
    features['is_weekend'] = int(day_index >= 5)
    return features

# **Main Preprocessing Pipeline** 

In [None]:
# Define the main function to apply all preprocessing steps.
def preprocess_data(df):
    """Apply all preprocessing steps and return an enriched copy of ``df``.

    Steps:
      1. Parse ``fileextensions`` into lists and add per-category counts.
      2. Add the time features derived from ``timeofcommit``.
      3. Add ``message_length`` and ``word_count`` from ``commitmessage``.
      4. Add one ``has_<keyword>`` 0/1 flag per keyword.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fileextensions``, ``timeofcommit`` and
        ``commitmessage`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the engineered columns appended.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    required = ('fileextensions', 'timeofcommit', 'commitmessage')
    absent = [name for name in required if name not in df.columns]
    if absent:
        raise KeyError(f"Missing required column(s): {absent}")

    # Create a copy to avoid modifying the original
    processed_df = df.copy()

    # Process file extensions
    processed_df['fileextensions'] = processed_df['fileextensions'].apply(preprocess_file_extensions)
    ext_categories = processed_df['fileextensions'].apply(categorize_extensions)
    # Build the frame on processed_df's own index: concat(axis=1) aligns
    # on index labels, so a fresh 0..n-1 index would misalign rows when
    # df has a non-default index (e.g. after filtering).
    ext_categories_df = pd.DataFrame(ext_categories.tolist(), index=processed_df.index)
    processed_df = pd.concat([processed_df, ext_categories_df], axis=1)

    # Process time features (rows with unparseable timestamps produce an
    # empty dict and therefore NaN in every time column).
    time_features = processed_df['timeofcommit'].apply(extract_time_features)
    time_features_df = pd.DataFrame(time_features.tolist(), index=processed_df.index)
    processed_df = pd.concat([processed_df, time_features_df], axis=1)

    # Treat a missing commit message as empty text so the features below
    # are defined (length 0, no keywords) instead of raising on NaN.
    messages = processed_df['commitmessage'].fillna('')

    # Add simple text features
    processed_df['message_length'] = messages.str.len()
    processed_df['word_count'] = messages.str.split().str.len()

    # Add keyword features (plain, case-insensitive substring match)
    keywords = ['bug', 'fix', 'feature', 'refactor', 'test', 'doc', 'merge']
    for keyword in keywords:
        processed_df[f'has_{keyword}'] = messages.str.contains(
            keyword, case=False, regex=False, na=False).astype(int)

    return processed_df

# **Execute Preprocessing**

In [None]:
# Run the pipeline end to end: read the raw CSV, then derive the engineered features.
df = load_and_validate_data('/content/final_dataset.csv')
processed_df = preprocess_data(df)

# List every column now present, original and engineered alike.
print("New features after preprocessing:")
print(list(processed_df.columns))

# Preview the leading rows using the notebook's rich table rendering.
print("\nProcessed DataFrame head:")
display(processed_df.head())