## Exploratory data analysis (EDA)

###
Perform exploratory data analysis to understand the distribution of developer roles, the relationship between features and roles, and identify potential risk factors or biases in the dataset. This will involve visualizations and statistical analysis.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Step 1 - class balance: number of rows per value of `role`.
role_counts = df_processed['role'].value_counts()
print("Distribution of Developer Roles:")
display(role_counts)

# Step 2 - summary statistics (as produced by `describe`) of each numeric
# feature, computed separately for every role.
numerical_cols = ['numfileschanged', 'linesadded', 'linesdeleted', 'numcommentsadded']
role_numerical_stats = df_processed.groupby('role')[numerical_cols].describe()
print("\nDescriptive Statistics of Numerical Features per Role:")
display(role_numerical_stats)

# Step 3 - one box plot per numeric feature, with roles along the x-axis.
print("\nVisualizing Distribution of Numerical Features per Role:")
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df_processed, x='role', y=col, ax=ax)
    ax.set_title(f'Distribution of {col} by Role')
    ax.set_xlabel('Role')
    ax.set_ylabel(col)
    plt.show()

# Step 4 - per-role proportions of the boolean columns.
# Only boolean-dtype columns are considered, and of those only the ones whose
# name contains one of the markers below.
categorical_cols = df_processed.select_dtypes(include='bool').columns.tolist()
name_markers = ('timeofcommit', 'committype', 'fileextension')

print("\nAnalyzing Distribution of Categorical Features per Role:")
for col in categorical_cols:
    if not any(marker in col for marker in name_markers):
        continue

    # Rows: roles; columns: the values observed in `col`; cells: within-role share.
    within_role_share = df_processed.groupby('role')[col].value_counts(normalize=True)
    role_category_distribution = within_role_share.unstack().fillna(0)
    print(f"\nProportion of {col} per Role:")
    display(role_category_distribution)

    # A bar chart is drawn only when both True and False occur (two columns);
    # it shows the share of True per role.
    if role_category_distribution.shape[1] != 2:
        continue

    ax = role_category_distribution[True].plot(kind='bar', figsize=(10, 6))
    ax.set_title(f'Proportion of {col} (True) by Role')
    ax.set_xlabel('Role')
    ax.set_ylabel('Proportion')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()




## Feature engineering

###
Based on the EDA, new features such as the word count of the commit message and the day of week and hour of day extracted from 'timeofcommit' could improve model performance. I will also create an interaction term between 'linesadded' and 'linesdeleted', as they seem to have different distributions across roles.



In [None]:
# Feature engineering: adds derived columns to `df_processed`, using the raw
# `df` for text/time columns and `df_processed` for the numeric columns.

# Commit messages as plain strings; missing values become '' so that they
# contribute 0 words and no keyword hits (instead of being stringified to 'nan').
raw_messages = df['commitmessage']
messages = raw_messages.astype(object).where(raw_messages.notna(), '').astype(str)

# New feature: number of whitespace-separated words in the commit message
df_processed['commitmessage_length'] = messages.str.split().str.len()

# Split 'timeofcommit' on single spaces: the first token becomes `day_of_week`,
# and the integer before the first ':' of the second token becomes `hour_of_day`.
# Non-string values (and, for the hour, values without a second token) yield None.
# NOTE(review): presumes values look like '<day> <HH>:<MM>' - confirm against the
# data; a non-numeric second token would raise ValueError in int().
df_processed['day_of_week'] = df['timeofcommit'].apply(lambda x: x.split(' ')[0] if isinstance(x, str) else None)
df_processed['hour_of_day'] = df['timeofcommit'].apply(lambda x: int(x.split(' ')[1].split(':')[0]) if isinstance(x, str) and len(x.split(' ')) > 1 else None)

# One-hot encode 'day_of_week'.
# Indicator columns left over from a previous run of this cell are dropped first,
# otherwise re-running the cell would create duplicate `day_of_week_*` columns.
stale_day_cols = [c for c in df_processed.columns if str(c).startswith('day_of_week_')]
df_processed = df_processed.drop(columns=stale_day_cols)
df_processed = pd.get_dummies(df_processed, columns=['day_of_week'], drop_first=False)

# Interaction term between 'linesadded' and 'linesdeleted'
# (per the original note, these df_processed columns are already scaled)
lines_added = df_processed['linesadded']
lines_deleted = df_processed['linesdeleted']
df_processed['lines_added_deleted_interaction'] = lines_added * lines_deleted

# Additional Feature Engineering based on EDA:

# 1. Ratio of lines added to lines deleted; rows where 'linesdeleted' == 0 get 0.
# Vectorised: zero denominators are masked to NaN before dividing, then those
# rows are filled with 0.
# NOTE(review): if these columns are scaled, `== 0` tests the scaled value, not
# "no lines deleted" - confirm that this is the intended guard.
has_deletions = lines_deleted != 0
df_processed['lines_added_to_deleted_ratio'] = (
    lines_added / lines_deleted.where(has_deletions)
).where(has_deletions, 0)

# 2. Total lines changed (same columns as above)
df_processed['total_lines_changed'] = lines_added + lines_deleted

# 3. Keyword flags: 1 if the keyword occurs anywhere in the lower-cased commit
# message, else 0. The message is lower-cased once, outside the loop.
# NOTE(review): this is plain substring matching, so e.g. 'ui' also hits 'build'
# and 'test' hits 'latest'; consider token-based matching if that is unintended.
keywords = ['fix', 'feat', 'refactor', 'test', 'ui', 'api', 'database']
lowered_messages = messages.str.lower()
for keyword in keywords:
    df_processed[f'msg_has_{keyword}'] = lowered_messages.str.contains(keyword, regex=False).astype(int)

# Display the head of the df_processed DataFrame to show the newly added features
display(df_processed.head())