In [42]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [24]:
df = pd.read_csv('../data/raw/function_features.csv')

In [25]:
df.isnull().sum()

name                             2
node_type                        0
file_path                        0
code_snippet                     0
repo_name                        0
repo_stars                       0
repo_forks                       0
repo_watchers                    0
repo_language                    0
repo_created_at                  0
repo_last_updated                0
repo_topics                      0
loc                              0
num_args                         0
num_returns                      0
num_variables                    0
num_function_calls               0
has_decorators                   0
uses_globals                     0
is_recursive                     0
estimated_branches          108641
estimated_difficulty             0
estimated_bugs                   0
has_docstring                    0
docstring_length                 0
num_comments                     0
name_length                      0
is_name_well_formed              0
bad_variable_names_c

In [26]:
columns_to_drop = [
    "name",
    "node_type",
    "file_path",
    "code_snippet",
    "repo_name",
    "repo_stars",
    "repo_forks",
    "repo_watchers",
    "repo_language",
    "repo_created_at",
    "repo_last_updated",
    "repo_topics",
    "estimated_branches",  # all values null
    "quality"              # all values null will add it later when the model is finished
]

In [27]:
df = df.drop(columns=columns_to_drop)

In [28]:
df.dtypes

loc                           int64
num_args                      int64
num_returns                   int64
num_variables                 int64
num_function_calls            int64
has_decorators                 bool
uses_globals                   bool
is_recursive                   bool
estimated_difficulty        float64
estimated_bugs              float64
has_docstring                  bool
docstring_length              int64
num_comments                  int64
name_length                   int64
is_name_well_formed            bool
bad_variable_names_count      int64
max_return_length             int64
estimated_complexity          int64
dtype: object

In [29]:
df.isnull().sum()

loc                         0
num_args                    0
num_returns                 0
num_variables               0
num_function_calls          0
has_decorators              0
uses_globals                0
is_recursive                0
estimated_difficulty        0
estimated_bugs              0
has_docstring               0
docstring_length            0
num_comments                0
name_length                 0
is_name_well_formed         0
bad_variable_names_count    0
max_return_length           0
estimated_complexity        0
dtype: int64

In [34]:
duplicates = df.duplicated()
df[duplicates]

Unnamed: 0,loc,num_args,num_returns,num_variables,num_function_calls,has_decorators,uses_globals,is_recursive,estimated_difficulty,estimated_bugs,has_docstring,docstring_length,num_comments,name_length,is_name_well_formed,bad_variable_names_count,max_return_length,estimated_complexity
38,3,2,1,1,1,False,False,False,0.0,0.000000,False,0,0,8,True,0,1,1
45,7,2,1,0,3,False,False,False,0.5,0.004644,False,0,0,6,True,0,107,1
46,2,2,1,0,0,False,False,False,1.0,0.003870,False,0,0,6,True,0,17,1
50,4,1,1,1,4,False,False,False,0.0,0.000000,False,0,0,11,True,1,29,2
163,2,1,1,0,0,False,False,False,0.0,0.000000,False,0,0,9,True,0,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108504,2,4,0,1,0,False,False,False,0.0,0.000000,False,0,0,8,True,0,0,1
108535,3,2,1,0,3,False,False,False,0.0,0.000000,True,1,0,9,True,0,21,1
108544,2,2,1,0,0,False,False,False,0.5,0.001585,False,0,0,12,True,0,21,1
108556,2,1,1,0,2,False,False,False,0.5,0.001585,False,0,0,7,True,0,32,1


In [35]:
# Drop duplicate rows
df = df.drop_duplicates()

---

### 🧮 Step 4: Creating a Heuristic-Based Quality Score

Since our dataset doesn't come with predefined labels, we need to create them ourselves. To do this, we'll use a **heuristic scoring system** based on the features we have.

This score will act as a proxy for code quality, helping us assign labels such as `excellent`, `good`, `moderate`, or `bad`.

We'll define a custom formula that weights certain features more heavily than others based on the following assumptions:

- **Worse quality** is associated with:
  - More `estimated_bugs`
  - Higher `estimated_difficulty`
  - A greater number of `bad_variable_names`
  - More `globals`
  - Longer `return` values

- **Better quality** is associated with:
  - Having a `docstring`
  - Longer `docstrings`
  - Shorter `return` values
  - Fewer `globals`

This scoring mechanism helps us "bootstrap" our labels before we apply machine learning models. In future iterations, we could refine this formula or even replace it with a learned labeling model.

---


In [38]:
#  the score that we will use to train the model
df['quality_score'] = (
    df["estimated_bugs"] * 3 +
    df["estimated_difficulty"] * 2 +
    df["bad_variable_names_count"] * 2 +
    df["num_returns"] * 1 +
    df["max_return_length"] * 1 +
    df["uses_globals"].astype(int) * 3 -
    df["has_docstring"].astype(int) * 3 -
    df["docstring_length"] * 0.5
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['quality_score'] = (


In [39]:
# bin the scores into labels

df['quality_label'] = pd.cut(
    df['quality_score'],
    bins=[-float('inf'), 2, 5, 8, float('inf')],
    labels=['excellent', 'good', 'moderate', 'bad']
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['quality_label'] = pd.cut(


In [41]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['quality_label', 'quality_score'])
y = df['quality_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


---

## Step 7: Feature Preprocessing – Scaling and Encoding

Now that we’ve labeled our dataset, the next step is to **prepare the features** for use in machine learning models.

### Why this step is important:
- Some models, like **Neural Networks** and **Transformers**, are sensitive to the scale of input features.
- **Random Forests** don’t need scaling, but we’ll preprocess everything uniformly to keep workflows consistent.
- Boolean features like `has_docstring` and `uses_globals` can be used directly (0 or 1), but must be properly handled in the pipeline.

### What we’ll do:
- **Scale** all numerical features using `StandardScaler` (mean = 0, std = 1).
- **Pass through** all boolean features (they're already numeric).

We use `ColumnTransformer` to apply different preprocessing steps to different columns.

---


In [44]:
# Separate numeric and boolean features
numeric_features = df.select_dtypes(include=['int64', 'float64']).drop(columns=['quality_score']).columns.tolist()
boolean_features = df.select_dtypes(include=['bool']).columns.tolist()

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),  # Scale numeric features
        ('bool', 'passthrough', boolean_features)     # Keep boolean features as-is
    ]
)



## Step 8: Train-Test Split and Dataset Saving

Once the features are ready, we’ll split the dataset into **training** and **testing** sets.

### Why we do this:
- To **evaluate** model performance fairly.
- To **prevent data leakage** — the model should never "see" the test data during training.
- To **reuse** the same splits for all models and experiments.

### What we’ll do:
- Split 80% for training, 20% for testing using `train_test_split()`.
- Use `stratify=y` to preserve class proportions across splits.
- Save the resulting datasets (`X_train`, `X_test`, `y_train`, `y_test`) to the `data/processed/` folder so they can be easily loaded later in the training and evaluation notebooks.

---


In [49]:
# Separating the features and target 
X = df.drop(columns=['quality_score', 'quality_label'])
y = df['quality_label']


X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


X_train.to_csv("../data/processed/X_train.csv", index=False)
X_test.to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)
