In [6]:
import sys
import pandas as pd
import sklearn
import matplotlib

print("Python executable:", sys.executable)
print("Setup successful")



Python executable: c:\Users\duamq\OneDrive\Desktop\code-complexity-classifier\venv\Scripts\python.exe
Setup successful


In [None]:
df = pd.read_csv("data/snippets.tsv", sep="\t")
df.head()





Unnamed: 0,code,label
0,"print(""Hello World"")",Simple
1,x = 5 + 3,Simple
2,for i in range(5): print(i),Simple
3,"nums = [1, 2, 3]",Simple
4,total = sum(nums),Simple


In [22]:
df["label"].value_counts()


label
Simple            20
Reasonable        20
Overengineered    20
Name: count, dtype: int64

In [23]:
def extract_features(code):
    lines = code.split("\n")
    return {
        "num_lines": len(lines),
        "num_defs": code.count("def "),
        "num_classes": code.count("class "),
        "num_imports": code.count("import "),
        "avg_line_length": sum(len(l) for l in lines) / max(1, len(lines))
    }


In [24]:
features = df["code"].apply(extract_features)
X = pd.DataFrame(list(features))
y = df["label"]

X.head()


Unnamed: 0,num_lines,num_defs,num_classes,num_imports,avg_line_length
0,1,0,0,0,20.0
1,1,0,0,0,9.0
2,1,0,0,0,27.0
3,1,0,0,0,16.0
4,1,0,0,0,17.0


In [25]:
X.describe()


Unnamed: 0,num_lines,num_defs,num_classes,num_imports,avg_line_length
count,141.0,141.0,141.0,141.0,141.0
mean,1.0,0.297872,0.205674,0.0,18.609929
std,0.0,0.458953,0.405634,0.0,5.985426
min,1.0,0.0,0.0,0.0,5.0
25%,1.0,0.0,0.0,0.0,14.0
50%,1.0,0.0,0.0,0.0,18.0
75%,1.0,1.0,0.0,0.0,22.0
max,1.0,1.0,1.0,0.0,40.0


how to read, 

count: number of data points which exists (so 141 code snippets)

mean: the average value containing x

std: how spread out the values are (low ->values are similar, high ->values vary a lot)

min: smallest value ccontaining x

first quartile: 25% of values are below this number

median: half of the values are below it and other half are above it

third quartile: 75% of values are below this number

max: largest value containing x



column 1: not useful in training since it cannot help distinguish classes and adds no info

column 2: good feature, helps seperate into 3 categories (simple -> no functions, reasonable -> often funcs, overengineered -> sometimes func+classes)

column 3: important, classes are rare in simple code but important in overnegineered code

column 4: not useful feature

column 5: kinda useful since overengineered code tends to be longer lines while simple code tends to be shorter

Conclusion: drop num_lines and num_imports

In [26]:
X_clean = X.drop(columns=["num_lines", "num_imports"])
X_clean.head()


Unnamed: 0,num_defs,num_classes,avg_line_length
0,0,0,20.0
1,0,0,9.0
2,0,0,27.0
3,0,0,16.0
4,0,0,17.0


In [30]:
X_clean.isna().sum()


num_defs           0
num_classes        0
avg_line_length    0
dtype: int64

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


In [33]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)


ValueError: Input contains NaN

In [1]:
import pandas as pd

df = pd.read_csv("data/snippets.tsv", sep="\t")


In [2]:
df = df[df["code"].notna() & (df["code"].str.strip() != "")]
df = df.reset_index(drop=True)


In [3]:
def extract_features(code):
    lines = code.split("\n")
    return {
        "num_defs": code.count("def "),
        "num_classes": code.count("class "),
        "avg_line_length": sum(len(l) for l in lines) / max(1, len(lines))
    }


In [4]:
features = df["code"].apply(extract_features)
X = pd.DataFrame(list(features))
y = df["label"]


In [5]:
print(X.isna().sum())
print(y.isna().sum())
print(X.shape, y.shape)


num_defs           0
num_classes        0
avg_line_length    0
dtype: int64
81
(141, 3) (141,)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


In [7]:
print(X_train.isna().sum())
print(y_train.isna().sum())


num_defs           0
num_classes        0
avg_line_length    0
dtype: int64
65


In [8]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)


ValueError: Input contains NaN

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/snippets.tsv", sep="\t")

df = df[df["code"].notna() & (df["code"].str.strip() != "")]
df = df.reset_index(drop=True)


In [2]:
def extract_features(code):
    lines = code.split("\n")
    avg_len = sum(len(l) for l in lines) / max(1, len(lines))
    return {
        "num_defs": float(code.count("def ")),
        "num_classes": float(code.count("class ")),
        "avg_line_length": float(avg_len)
    }


In [3]:
features = df["code"].apply(extract_features)
X = pd.DataFrame(list(features))
y = df["label"]


In [None]:
X = X.apply(pd.to_numeric, errors="coerce")

X = X.replace([np.inf, -np.inf], np.nan)

X = X.fillna(0)


In [5]:
print(X.isna().sum())
print(np.isinf(X.values).sum())
print(X.dtypes)


num_defs           0
num_classes        0
avg_line_length    0
dtype: int64
0
num_defs           float64
num_classes        float64
avg_line_length    float64
dtype: object


In [6]:
# Force numeric dtype (this is the missing step)
X = X.astype(float)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)


ValueError: Input contains NaN

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/snippets.tsv", sep="\t")
df = df[df["code"].notna() & (df["code"].str.strip() != "")]
df = df.reset_index(drop=True)


In [2]:
def extract_features(code):
    lines = code.split("\n")
    avg_len = sum(len(l) for l in lines) / max(1, len(lines))
    return [
        float(code.count("def ")),
        float(code.count("class ")),
        float(avg_len)
    ]


In [3]:
X = np.array(df["code"].apply(extract_features).to_list())


In [4]:
print(X.shape)
print(np.isnan(X).sum())
print(np.isinf(X).sum())


(141, 3)
0
0


In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df["label"])


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [8]:
from sklearn.metrics import classification_report

preds = model.predict(X_test)
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.43      0.50      0.46         6
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         3
           3       0.82      0.88      0.85        16

    accuracy                           0.59        29
   macro avg       0.31      0.34      0.33        29
weighted avg       0.54      0.59      0.56        29



precision → “When the model predicts this class, how often is it right?”

recall → “Out of all real examples of this class, how many did it catch?”

f1-score → balance of precision & recall

support → how many true examples of this class were in the test se

In [None]:
df["binary_label"] = df["label"].apply(
    lambda x: "Overengineered" if x == "Overengineered" else "Not_Overengineered"
)

df["binary_label"].value_counts()


binary_label
Not_Overengineered    121
Overengineered         20
Name: count, dtype: int64

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_binary = le.fit_transform(df["binary_label"])

# See mapping
dict(zip(le.classes_, le.transform(le.classes_)))


{'Not_Overengineered': np.int64(0), 'Overengineered': np.int64(1)}

In [11]:
print(X.shape)
print(y_binary.shape)


(141, 3)
(141,)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
    stratify=y_binary
)


In [13]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [14]:
from sklearn.metrics import classification_report

preds = model.predict(X_test)
print(classification_report(y_test, preds, target_names=le.classes_))


                    precision    recall  f1-score   support

Not_Overengineered       0.96      0.88      0.92        25
    Overengineered       0.50      0.75      0.60         4

          accuracy                           0.86        29
         macro avg       0.73      0.81      0.76        29
      weighted avg       0.89      0.86      0.87        29



## Feature Extraction
We convert raw code snippets into simple, interpretable numeric features that
approximate structural complexity.


In [15]:
def extract_features(code):
    """
    Convert a code snippet into numeric features.

    Features:
    - Number of function definitions
    - Number of class definitions
    - Average line length
    """
    lines = code.split("\n")
    avg_len = sum(len(l) for l in lines) / max(1, len(lines))

    return [
        float(code.count("def ")),
        float(code.count("class ")),
        float(avg_len)
    ]

extract_features("def add(a, b): return a + b")



[1.0, 0.0, 27.0]