In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from multiprocessing import Pool, cpu_count
from sklearn.pipeline import Pipeline
from tqdm.notebook import tqdm
import pandas as pd
import subprocess
import tempfile
import re

In [2]:
df = pd.read_csv('../data/raw/function_features.csv')

In [3]:
df.isnull().sum()

name                             2
node_type                        0
file_path                        0
code_snippet                     0
repo_name                        0
repo_stars                       0
repo_forks                       0
repo_watchers                    0
repo_language                    0
repo_created_at                  0
repo_last_updated                0
repo_topics                      0
loc                              0
num_args                         0
num_returns                      0
num_variables                    0
num_function_calls               0
has_decorators                   0
uses_globals                     0
is_recursive                     0
estimated_branches          108641
estimated_difficulty             0
estimated_bugs                   0
has_docstring                    0
docstring_length                 0
num_comments                     0
name_length                      0
is_name_well_formed              0
bad_variable_names_c

In [4]:
columns_to_drop = [
    "name",
    "node_type",
    "file_path",
    "repo_name",
    "repo_stars",
    "repo_forks",
    "repo_watchers",
    "repo_language",
    "repo_created_at",
    "repo_last_updated",
    "repo_topics",
    "estimated_branches",  # all values null
    "quality"              # all values null will add it later when the model is finished
]

In [5]:
df = df.drop(columns=columns_to_drop)

In [6]:
df.dtypes

code_snippet                 object
loc                           int64
num_args                      int64
num_returns                   int64
num_variables                 int64
num_function_calls            int64
has_decorators                 bool
uses_globals                   bool
is_recursive                   bool
estimated_difficulty        float64
estimated_bugs              float64
has_docstring                  bool
docstring_length              int64
num_comments                  int64
name_length                   int64
is_name_well_formed            bool
bad_variable_names_count      int64
max_return_length             int64
estimated_complexity          int64
dtype: object

In [7]:
df.isnull().sum()

code_snippet                0
loc                         0
num_args                    0
num_returns                 0
num_variables               0
num_function_calls          0
has_decorators              0
uses_globals                0
is_recursive                0
estimated_difficulty        0
estimated_bugs              0
has_docstring               0
docstring_length            0
num_comments                0
name_length                 0
is_name_well_formed         0
bad_variable_names_count    0
max_return_length           0
estimated_complexity        0
dtype: int64

In [8]:
duplicates = df.duplicated()
df[duplicates]

Unnamed: 0,code_snippet,loc,num_args,num_returns,num_variables,num_function_calls,has_decorators,uses_globals,is_recursive,estimated_difficulty,estimated_bugs,has_docstring,docstring_length,num_comments,name_length,is_name_well_formed,bad_variable_names_count,max_return_length,estimated_complexity
45,"def __eq__(self, other):\n return all(\...",7,2,1,0,3,False,False,False,0.5,0.004644,False,0,0,6,True,0,107,1
46,"def __ne__(self, other):\n return not s...",2,2,1,0,0,False,False,False,1.0,0.003870,False,0,0,6,True,0,17,1
163,def __enter__(self):\n return self,2,1,1,0,0,False,False,False,0.0,0.000000,False,0,0,9,True,0,4,1
164,"def __exit__(self, *args):\n self.close()",2,1,0,0,1,False,False,False,0.0,0.000000,False,0,0,8,True,0,0,1
525,def response_handler(sock):\n consu...,7,1,0,0,2,False,False,False,0.0,0.000000,False,0,0,16,True,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108110,def cedar_dec(func):\n @wraps(func)\n ...,5,1,2,0,2,False,False,False,0.0,0.000000,False,0,0,9,True,0,37,1
108117,"def example(req, test='default', *, loop='lol'...",2,2,1,0,0,False,False,False,0.0,0.000000,False,0,0,7,True,0,4,1
108122,"def cedar_wrapper(*a, **kw):\n retu...",2,0,1,0,2,True,False,False,0.0,0.000000,False,0,0,13,True,0,37,1
108291,def client_connect():\n clientsock = so...,4,0,1,2,2,False,False,False,0.0,0.000000,False,0,0,14,True,0,6,1


In [9]:
# Drop duplicate rows
df = df.drop_duplicates()

---

## Step 4: Generate Code Quality Scores Using `pylint`

Since our dataset now includes the actual code for each function, we can use `pylint` to objectively assess code quality.

### Why use `pylint`?
- It's a **widely-used Python linter** that detects code smells, complexity, unused variables, and more.
- It gives a **numeric score out of 10** summarizing the overall code quality.
- This gives us an **automated, data-driven way** to assign quality scores instead of relying on hand-crafted heuristics.

### What we’ll do:
- Write each function’s code to a temporary Python file.
- Run `pylint` on that file.
- Parse the output to extract the numeric score.
- Store the score in a new column called `quality_score`.

---


In [10]:
def get_pylint_score(code_string):
    try:
        with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False, encoding='utf-8') as tmp:
            tmp.write(code_string)
            tmp_path = tmp.name

        result = subprocess.run(
            ['pylint', tmp_path, '--score=y'], 
            capture_output=True,
            text=True
        )

        output = result.stdout
        for line in output.splitlines():
            match = re.search(r"rated at ([0-9.]+)/10", line)
            if match:
                return float(match.group(1))

    except Exception as e:
        print(f"Error: {e}")

    return None


---

## Parallelized Code Quality Scoring with Pylint

Since scoring each function using `pylint` is slow (each one runs as a separate subprocess), we use Python’s `multiprocessing` module to run the scoring in parallel across multiple CPU cores.

This significantly speeds up the process of generating `quality_score` for every `code_snippet` in the dataset.

---


In [11]:
# tqdm used for progress bar for my sanity sake 
def run_parallel_with_tqdm(func, data, num_workers=None):
    from multiprocessing import Pool, cpu_count
    from tqdm import tqdm

    num_workers = num_workers or cpu_count()
    results = []
    
    with Pool(processes=num_workers) as pool:
        with tqdm(total=len(data)) as pbar:
            for result in pool.imap(func, data):
                results.append(result)
                pbar.update(1)
    
    return results


# Run and assign\
df_small = df.head(50)

df_small['quality_score'] = run_parallel_with_tqdm(get_pylint_score, df_small['code_snippet'].tolist())


  0%|          | 0/50 [00:00<?, ?it/s]

---

## Step 5: Bin Scores into Quality Labels

Once we have `quality_score`, we classify it into discrete quality levels:
- **0–3** → `bad`
- **3–6** → `moderate`
- **6–8** → `good`
- **8–10** → `excellent`

These categories will be stored in a new column: `quality_label`.

This prepares our dataset for classification tasks, where the model will learn to predict the label based on features.

---

In [None]:
df['quality_label'] = pd.cut(
    df['quality_score'],
    bins=[-float('inf'), 3, 6, 8, float('inf')],
    labels=['bad', 'moderate', 'good', 'excellent']
)


KeyError: 'quality_score'

In [None]:
print(len(df))
print(df['quality_label'].value_counts())


10
quality_label
bad          8
moderate     1
good         1
excellent    0
Name: count, dtype: int64


---

## Step 6: Train-Test Split

After scoring and labeling, we divide the dataset into **training** and **testing** subsets:
- **80%** of the data is used for training,
- **20%** is reserved for testing,
- We use `stratify=y` to maintain class proportions.

This split ensures that we can evaluate model performance fairly and avoid data leakage.

We will later save the splits into `data/processed/` for reuse during model training and evaluation.

---

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['quality_score', 'quality_label'])
y = df['quality_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

---

## Step 7: Feature Preprocessing – Scaling and Encoding

Now that we’ve labeled our dataset, the next step is to **prepare the features** for use in machine learning models.

### Why this step is important:
- Some models, like **Neural Networks** and **Transformers**, are sensitive to the scale of input features.
- **Random Forests** don’t need scaling, but we’ll preprocess everything uniformly to keep workflows consistent.
- Boolean features like `has_docstring` and `uses_globals` can be used directly (0 or 1), but must be properly handled in the pipeline.

### What we’ll do:
- **Scale** all numerical features using `StandardScaler` (mean = 0, std = 1).
- **Pass through** all boolean features (they're already numeric).

We use `ColumnTransformer` to apply different preprocessing steps to different columns.

---


In [None]:
# Separate numeric and boolean features
numeric_features = df.select_dtypes(include=['int64', 'float64']).drop(columns=['quality_score']).columns.tolist()
boolean_features = df.select_dtypes(include=['bool']).columns.tolist()

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),  # Scale numeric features
        ('bool', 'passthrough', boolean_features)     # Keep boolean features as-is
    ]
)



## Step 8: Train-Test Split and Dataset Saving

Once the features are ready, we’ll split the dataset into **training** and **testing** sets.

### Why we do this:
- To **evaluate** model performance fairly.
- To **prevent data leakage** — the model should never "see" the test data during training.
- To **reuse** the same splits for all models and experiments.

### What we’ll do:
- Split 80% for training, 20% for testing using `train_test_split()`.
- Use `stratify=y` to preserve class proportions across splits.
- Save the resulting datasets (`X_train`, `X_test`, `y_train`, `y_test`) to the `data/processed/` folder so they can be easily loaded later in the training and evaluation notebooks.

---


In [None]:
# Separating the features and target 
X = df.drop(columns=['quality_score', 'quality_label'])
y = df['quality_label']


X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


X_train.to_csv("../data/processed/X_train.csv", index=False)
X_test.to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)
