# 03 - Checking Data Consistency

## Setting Up Project Directory

In [1]:
from jupyter_init import setup

setup()

from src_code.config import *

## Loading Dataset

In [4]:
import pandas as pd
import numpy as np

TRANSFORMED_DF = EXTRACTED_DATA_DIR / "train_labeled_features_partial_copy.feather"

# ---- LOAD ----
df = pd.read_feather(TRANSFORMED_DF)
print(f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns\n")

df.dtypes

Loaded dataframe with 23479 rows and 31 columns



datetime                      datetime64[us, pytz.FixedOffset(-120)]
commit                                                        object
repo                                                          object
filepath                                                      object
content                                                       object
methods                                                       object
lines                                                         object
author_email                                                  object
canonical_datetime                               datetime64[ns, UTC]
author_exp_pre                                                 int64
author_recent_activity_pre                                     int64
label                                                          int64
loc_added                                                      int64
loc_deleted                                                    int64
files_changed                     

## Converting NumpyArray -> List



In [None]:
#Convert the NumPy arrays back to Python lists
for col in ['code_embed', 'msg_embed', 'methods', 'lines']:
    # Use .apply(list) or .apply(lambda x: x.tolist()) for robustness
    df[col] = df[col].apply(list)

# print(df['content'].head(5))

## Missing Value Audit

In [None]:
print("## 1. Missing Values per Column")
nulls = df.isnull().sum().sort_values(ascending=False)
print(nulls.to_markdown())

## 1. Missing Values per Column
|                            |   0 |
|:---------------------------|----:|
| datetime                   |   0 |
| commit                     |   0 |
| repo                       |   0 |
| filepath                   |   0 |
| content                    |   0 |
| methods                    |   0 |
| lines                      |   0 |
| author_email               |   0 |
| canonical_datetime         |   0 |
| author_exp_pre             |   0 |
| author_recent_activity_pre |   0 |
| label                      |   0 |
| loc_added                  |   0 |
| loc_deleted                |   0 |
| files_changed              |   0 |
| hunks_count                |   0 |
| msg_len                    |   0 |
| has_fix_kw                 |   0 |
| has_bug_kw                 |   0 |
| ast_delta                  |   0 |
| complexity_delta           |   0 |
| max_func_change            |   0 |
| time_since_last_change     |   0 |
| todo                       |   0 |
| fixm

## Primary Key Integrity

In [None]:
print("## 2. Primary Key Uniqueness Check")
key_cols = ["repo", "commit", "filepath"]

dupes = df.duplicated(subset=key_cols).sum()
print(f"Duplicate key rows: {dupes}")

## 2. Primary Key Uniqueness Check
Duplicate key rows: 0


## Label Distribution

In [None]:
print("## 3. Label Distribution")
print(df['label'].value_counts(normalize=True).to_markdown())

## 3. Label Distribution
|   label |   proportion |
|--------:|-------------:|
|       1 |     0.517859 |
|       0 |     0.482141 |


## Repository Distribution (Imbalance Check)

In [None]:
print("## 4. Repository Distribution")
repo_dist = df['repo'].value_counts(normalize=True)
print(repo_dist.to_markdown())

## 4. Repository Distribution
| repo    |   proportion |
|:--------|-------------:|
| pandas  |     0.644913 |
| airflow |     0.355087 |


## Value Range Scan for Numeric Columns

Automatically detects:

- negatives where not allowed
- max values
- suspicious spikes

In [None]:
print("## 5. Numeric Column Range Scan")
num_cols = df.select_dtypes(include=[np.number]).columns

ranges = pd.DataFrame({
    "min": df[num_cols].min(),
    "median": df[num_cols].median(),
    "mean": df[num_cols].mean(),
    "max": df[num_cols].max()
})

print(ranges.to_markdown())

## 5. Numeric Column Range Scan
|                            |     min |   median |          mean |              max |
|:---------------------------|--------:|---------:|--------------:|-----------------:|
| author_exp_pre             |       0 |     84   |   412.41      |   3557           |
| author_recent_activity_pre |       0 |     14   |    33.0938    |    238           |
| label                      |       0 |      1   |     0.517859  |      1           |
| loc_added                  |       0 |     36   |    85.0857    |  11824           |
| loc_deleted                |       0 |     63   |   124.98      |  12067           |
| files_changed              |       0 |      7   |    16.1322    |    214           |
| hunks_count                |       0 |     32   |    76.5818    |   1104           |
| msg_len                    |       7 |     65   |   137.041     |   3814           |
| has_fix_kw                 |       0 |      0   |     0.2004    |      1           |
| has_bug_k

## Check Columns Expected to Be Non-Negative

In [None]:
non_negative_cols = [
    "loc_added", "loc_deleted",
    "files_changed", "hunks_count",
    "msg_len", "ast_delta",
    "complexity_delta", "max_func_change",
    "author_exp_pre", "author_recent_activity_pre",
    "todo", "fixme", "try", "except", "raise",
    "recent_churn"
]

print("## 6. Negative Value Check")
for col in non_negative_cols:
    bad = (df[col] < 0).sum()
    print(f"{col}: {bad} negative values")

## 6. Negative Value Check
loc_added: 0 negative values
loc_deleted: 0 negative values
files_changed: 0 negative values
hunks_count: 0 negative values
msg_len: 0 negative values
ast_delta: 0 negative values
complexity_delta: 0 negative values
max_func_change: 0 negative values
author_exp_pre: 0 negative values
author_recent_activity_pre: 0 negative values
todo: 0 negative values
fixme: 0 negative values
try: 0 negative values
except: 0 negative values
raise: 0 negative values
recent_churn: 0 negative values


## Suspicious Feature Check: time_since_last_change

In [None]:
print("## 7. time_since_last_change Outliers")
tslc = df["time_since_last_change"]

print(f"Negative values: {(tslc < 0).sum()}")
print(f"99.9% quantile: {tslc.quantile(0.999)}")
print(f"Min: {tslc.min()}")
print(f"Max: {tslc.max()}")

## 7. time_since_last_change Outliers
Negative values: 3
99.9% quantile: 981249.0
Min: -396818
Max: 20466389


### Understanding the Feature

*time_since_last_change = c.committed_date - last_time*

Where:
- c.committed_date = current commit timestamp (UNIX seconds)
- last_time = timestamp of first parent commit

So the feature = time difference between consecutive commits.

This represents how much time passed between commits in a repo.

### Why Negative Values?

Meaning:

- Some commits appear to be ~4.6 days negative (-396,818 sec)
- Some commits appear to be ~77 days ahead (6.6M sec)

This is expected when real Git data is used.

Git timestamps can go backwards because:

#### (1) Rebased or rewritten history

During rebases, old commits appear “later” than newer ones.

*WHY?*

When you merge/rebase, Git does not reorder commits chronologically.
Instead, it preserves the logical order of development.

#### (2) Merge parents

You use only the first parent:

if c.parents:
    last_time = c.parents[0].committed_date


But merges may introduce non-linear time ordering.

#### (3) Clock drift

Different authors → different local machine clocks.

#### (4) Shallow clones or incomplete history

If the repo is shallow-fetched, parent commits may have weird timestamps.

None of this indicates your extraction is wrong.



## Binary Flag Integity

In [None]:
print("## 8. Binary Columns Integrity")
bin_cols = ["has_fix_kw", "has_bug_kw"]

for col in bin_cols:
    bad = df[~df[col].isin([0,1])]
    print(f"{col}: {len(bad)} invalid values")

## 8. Binary Columns Integrity
has_fix_kw: 0 invalid values
has_bug_kw: 0 invalid values


## Embedding Consistency Check

Ensure:

- no None
- all lists
- identical dimensionality

In [None]:
# # Convert the NumPy arrays back to Python lists
# for col in ['code_embed', 'msg_embed']:
#     # Use .apply(list) or .apply(lambda x: x.tolist()) for robustness
#     df[col] = df[col].apply(list)

print(type(df.loc[0, 'code_embed']))
print(type(df.loc[0, 'msg_embed']))

print("## 9. Embedding Structural Checks")

# None count
print("code_embed None count:", df['code_embed'].isna().sum())
print("msg_embed None count:", df['msg_embed'].isna().sum())

# Check if all are lists
print("\nNon-list code_embed rows:", (~df['code_embed'].apply(lambda x: isinstance(x, list))).sum())
print("Non-list msg_embed rows:", (~df['msg_embed'].apply(lambda x: isinstance(x, list))).sum())

# Check dimensionality
dims = df['code_embed'].apply(lambda x: len(x) if isinstance(x, list) else None)
print("\nEmbedding dimensionality distribution:")
print(dims.value_counts().head())

<class 'list'>
<class 'list'>
## 9. Embedding Structural Checks
code_embed None count: 0
msg_embed None count: 0

Non-list code_embed rows: 0
Non-list msg_embed rows: 0

Embedding dimensionality distribution:
code_embed
768    22510
Name: count, dtype: int64


## Datetime Consistency

Check for:

- NaT values
- ordering sanity (commit should not be older than file's previous record)

In [None]:
print("## 10. Datetime Columns Audit")

date_cols = ["datetime", "canonical_datetime"]

for col in date_cols:
    print(f"{col}: NaT count = {df[col].isna().sum()}")
    print(f"{col}: min = {df[col].min()}, max = {df[col].max()}")

## 10. Datetime Columns Audit
datetime: NaT count = 0
datetime: min = 2009-11-21 20:57:45-02:00, max = 2022-01-03 09:36:56-02:00
canonical_datetime: NaT count = 0
canonical_datetime: min = 2009-11-21 22:57:45+00:00, max = 2022-01-03 21:36:56+00:00


## Correlation Hot Spots
To detect redundant or suspiciously identical columns.

In [None]:
print("## 11. Correlations (Numeric Only)")
corr = df[num_cols].corr()
# display(corr.style.background_gradient(cmap='coolwarm'))
matrix = corr.style.background_gradient(cmap='coolwarm')
matrix

# print("## 11. Correlations (Numeric Only)")
# corr = df[num_cols].corr()
# print(corr.to_markdown())  # nicely formatted markdown table

## 11. Correlations (Numeric Only)


Unnamed: 0,author_exp_pre,author_recent_activity_pre,label,loc_added,loc_deleted,files_changed,hunks_count,msg_len,has_fix_kw,has_bug_kw,ast_delta,complexity_delta,max_func_change,time_since_last_change,todo,fixme,try,except,raise,recent_churn
author_exp_pre,1.0,0.704432,0.073225,-0.059444,-0.101316,-0.102655,-0.08866,-0.150136,-0.143799,0.007367,-0.000465,0.007772,0.017577,-0.037671,0.177089,0.065947,-0.024393,-0.057012,-0.010252,0.223938
author_recent_activity_pre,0.704432,1.0,0.110851,0.030255,-0.02852,0.102434,0.107056,-0.164335,-0.145771,-0.062814,0.060305,0.070844,0.07193,-0.045571,0.090275,0.014806,-0.015013,-0.026514,0.020245,0.598666
label,0.073225,0.110851,1.0,0.103065,0.091812,0.092245,0.124209,0.105458,-0.009564,-0.033249,0.151651,0.137073,0.27205,-0.044236,0.049289,0.011269,0.002316,0.030306,0.084033,0.070927
loc_added,-0.059444,0.030255,0.103065,1.0,0.534862,0.335417,0.521405,0.042831,-0.001122,-0.095351,0.231791,0.243495,0.202412,-0.016082,0.076058,0.012609,0.452076,0.336451,0.219361,0.266678
loc_deleted,-0.101316,-0.02852,0.091812,0.534862,1.0,0.420093,0.461152,0.144048,0.006834,-0.052422,0.151385,0.150414,0.113967,-0.00843,0.028524,0.018499,0.100895,0.214006,0.178085,0.325419
files_changed,-0.102655,0.102434,0.092245,0.335417,0.420093,1.0,0.7799,0.01204,-0.025043,-0.16466,0.088719,0.102018,0.310825,-0.021039,0.01048,-0.002683,0.032754,0.173386,0.127487,0.435691
hunks_count,-0.08866,0.107056,0.124209,0.521405,0.461152,0.7799,1.0,0.002165,-0.028878,-0.167584,0.184336,0.201897,0.418677,-0.022947,0.011396,-0.004983,0.100873,0.289478,0.157643,0.445888
msg_len,-0.150136,-0.164335,0.105458,0.042831,0.144048,0.01204,0.002165,1.0,0.265484,0.083857,0.011781,0.008092,-0.015201,0.022658,0.000751,0.073011,0.004614,0.018125,-0.008578,-0.050963
has_fix_kw,-0.143799,-0.145771,-0.009564,-0.001122,0.006834,-0.025043,-0.028878,0.265484,1.0,0.140631,-0.036909,-0.046012,0.025097,0.006644,-0.01045,0.051002,-0.006642,0.051726,0.017459,-0.073166
has_bug_kw,0.007367,-0.062814,-0.033249,-0.095351,-0.052422,-0.16466,-0.167584,0.083857,0.140631,1.0,-0.033896,-0.025382,-0.097119,0.00504,-0.017058,0.031366,-0.02526,-0.053329,-0.04334,-0.084359


In [None]:
print("## 12. Columns With Only One Unique Value")

for col in df.columns:
    # Convert arrays to tuples for uniqueness check
    if df[col].apply(lambda x: isinstance(x, (np.ndarray, list))).any():
        # Convert each element to a tuple (or a string representation)
        # print(col)
        unique_values = df[col].apply(lambda x: tuple(x) if isinstance(x, (np.ndarray, list)) else x)
    else:
        unique_values = df[col]

    if unique_values.nunique(dropna=True) == 1:
        print(f"⚠️ {col} has only one unique value")

## 12. Columns With Only One Unique Value


## Check Text Columns for Weirdness

Empty strings? Too short? Too long?

In [None]:
print("## 13. Text Field Checks")

if 'content' in df.columns:
    print("Empty content rows:", (df['content'].str.len() == 0).sum())
    print(df['content'].str.len().describe())

# Check 'methods' if it is a list column
if 'methods' in df.columns:
    # Use len() on the Python lists
    print("\nEmpty methods rows:", (df['methods'].apply(len) == 0).sum())
    print(df['methods'].apply(len).describe())

## 13. Text Field Checks
Empty content rows: 54
count     22510.000000
mean       1786.281253
std        3229.427225
min           0.000000
25%         455.250000
50%         858.000000
75%        1870.000000
max      109324.000000
Name: content, dtype: float64

Empty methods rows: 4863
count    22510.000000
mean         2.472768
std          4.766058
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        148.000000
Name: methods, dtype: float64


## Check For Impossible Values

In [None]:
print("## 14. Logical Consistency Checks")

# msg_len should match commit message length
if "msg_len" in df.columns:
    print("msg_len outliers (msg_len <= 0):", (df['msg_len'] <= 0).sum())

## 14. Logical Consistency Checks
msg_len outliers (msg_len <= 0): 0


## Filepath Sanity
Check for Windows vs POSIX weirdness.

In [None]:
# Normalize all filepaths to use forward slashes
df['filepath'] = df['filepath'].str.replace('\\', '/', regex=False)

# Check again for paths without a slash
bad_paths = df[~df['filepath'].str.contains('/')]
print("Filepaths without / (unexpected after normalization):", len(bad_paths))
print(df.loc[~df['filepath'].str.contains('/'), 'filepath'].head(20))

Filepaths without / (unexpected after normalization): 400
11        setup.py
80        setup.py
81     setupegg.py
122       setup.py
123       setup.py
138       setup.py
144       setup.py
168       setup.py
278       setup.py
461       setup.py
462       setup.py
479       setup.py
497       setup.py
513       setup.py
514       setup.py
523       setup.py
524       setup.py
531       setup.py
562       setup.py
570       setup.py
Name: filepath, dtype: object


## Check Recent Churn for Extreme Outliers

In [None]:
print("## 16. recent_churn Outlier Scan")
print(df['recent_churn'].describe())
print("99.9% quantile:", df['recent_churn'].quantile(0.999))

## 16. recent_churn Outlier Scan
count     22510.000000
mean       6193.076455
std       10214.507757
min           0.000000
25%         194.000000
50%        1882.500000
75%        8539.750000
max      110348.000000
Name: recent_churn, dtype: float64
99.9% quantile: 90616.47900000121


## Check Distribution of Code Activity Keywords

(todo, fixme, try/except/raise)

In [None]:
print("## 17. Keyword Column Distributions")
kw_cols = ["todo", "fixme", "try", "except", "raise"]

print(df[kw_cols].describe().T.to_markdown())

## 17. Keyword Column Distributions
|        |   count |      mean |       std |   min |   25% |   50% |   75% |   max |
|:-------|--------:|----------:|----------:|------:|------:|------:|------:|------:|
| todo   |   22510 | 0.132474  |  0.643505 |     0 |     0 |     0 |     0 |    26 |
| fixme  |   22510 | 0.0137272 |  0.228647 |     0 |     0 |     0 |     0 |    17 |
| try    |   22510 | 1.15806   | 12.6487   |     0 |     0 |     0 |     0 |   660 |
| except |   22510 | 1.58992   |  8.20646  |     0 |     0 |     0 |     0 |   128 |
| raise  |   22510 | 1.57606   |  6.64527  |     0 |     0 |     0 |     1 |   213 |
