In [106]:
# Imports 
import pandas as pd
import numpy as np 
from io import StringIO

In [108]:
# Loading the Data set 
df = pd.read_json("dataset.jsonl", lines=True)

In [109]:
print(df.head())

                       title  \
0                        Uuu   
1             House Building   
2             Mario or Luigi   
3             The Wire Ghost   
4  Barking Up The Wrong Tree   

                                         description  \
0  Unununium (Uuu) was the name of the chemical\n...   
1  A number of eccentrics from central New York h...   
2  Mario and Luigi are playing a game where they ...   
3  Žofka is bending a copper wire. She starts wit...   
4  Your dog Spot is let loose in the park. Well, ...   

                                   input_description  \
0  The input consists of one line with two intege...   
1  The input consists of $10$ test cases, which a...   
2                                                      
3  The first line contains two integers $L$ and $...   
4  The first line of input consists of two intege...   

                                  output_description  \
0  The output consists of $M$ lines where the $i$...   
1  Print $K$ lines wi

In [112]:
text_columns = [
    "title",
    "description",
    "input_description",
    "output_description"
]
# Ignore sample IO and Url 

In [114]:
# Fill missing text 
for col in text_columns:
    df[col] = df[col].fillna("")

In [116]:
# Combine text
df["full_text"] = (
    df["title"] + " " +
    df["description"] + " " +
    df["input_description"] + " " +
    df["output_description"]
)

In [118]:
# Minimal cleaning
df["full_text"] = (
    df["full_text"]
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

In [119]:
# Final check
df[["full_text", "problem_class", "problem_score"]].head()

Unnamed: 0,full_text,problem_class,problem_score
0,uuu unununium (uuu) was the name of the chemic...,hard,9.7
1,house building a number of eccentrics from cen...,hard,9.7
2,mario or luigi mario and luigi are playing a g...,hard,9.6
3,the wire ghost žofka is bending a copper wire....,hard,9.6
4,barking up the wrong tree your dog spot is let...,hard,9.6


In [122]:
df[["full_text", "problem_class", "problem_score"]].tail()

Unnamed: 0,full_text,problem_class,problem_score
4107,tölvunarfræðingar telja computer scientists co...,easy,1.1
4108,velkomin! welcome to forritunarkeppni framhald...,easy,1.1
4109,til hamingju there is no input in this problem...,easy,1.1
4110,hipp hipp there is no input in this problem. p...,easy,1.1
4111,advanced causal measurements causality is a ve...,hard,6.5


## EDA

In [124]:
df[["problem_class", "problem_score"]].isnull().sum()


problem_class    0
problem_score    0
dtype: int64

In [126]:
df["problem_class"].value_counts()


problem_class
hard      1941
medium    1405
easy       766
Name: count, dtype: int64

In [128]:
df["problem_score"].describe()


count    4112.000000
mean        5.114689
std         2.177770
min         1.100000
25%         3.300000
50%         5.200000
75%         6.900000
max         9.700000
Name: problem_score, dtype: float64

In [130]:
df.groupby("problem_class")["problem_score"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
problem_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
easy,766.0,1.970888,0.433289,1.1,1.6,2.0,2.3,2.8
hard,1941.0,7.071149,1.049729,5.5,6.2,7.0,7.9,9.7
medium,1405.0,4.125836,0.774216,2.8,3.5,4.1,4.8,5.5


In [132]:
df["word_count"] = df["full_text"].str.split().apply(len)
df.groupby("problem_class")["word_count"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
problem_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
easy,766.0,217.143603,100.733767,1.0,146.0,206.5,275.0,671.0
hard,1941.0,296.022154,128.060335,38.0,208.0,278.0,363.0,1055.0
medium,1405.0,271.855516,123.637675,24.0,188.0,254.0,334.0,1226.0


In [134]:
df["char_count"] = df["full_text"].apply(len)
df.groupby("problem_class")["char_count"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
problem_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
easy,766.0,1208.193211,564.68147,8.0,807.0,1149.0,1532.75,3776.0
hard,1941.0,1643.62442,714.912806,204.0,1153.0,1529.0,2016.0,6329.0
medium,1405.0,1511.745196,695.582718,128.0,1039.0,1403.0,1866.0,6649.0


In [136]:
df["digit_count"] = df["full_text"].apply(lambda x: sum(c.isdigit() for c in x))
df.groupby("problem_class")["digit_count"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
problem_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
easy,766.0,14.250653,15.31089,0.0,5.0,10.5,18.0,160.0
hard,1941.0,17.618238,17.569217,0.0,8.0,13.0,22.0,282.0
medium,1405.0,17.794306,18.616185,0.0,7.0,13.0,22.0,192.0


In [138]:
symbols = "+-*/%<=>"
df["math_symbol_count"] = df["full_text"].apply(
    lambda x: sum(x.count(s) for s in symbols)
)
df.groupby("problem_class")["math_symbol_count"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
problem_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
easy,766.0,3.0,5.914201,0.0,0.0,1.0,4.0,119.0
hard,1941.0,3.960845,4.896718,0.0,1.0,2.0,5.0,49.0
medium,1405.0,3.692527,5.004782,0.0,0.0,2.0,5.0,45.0


In [140]:
df["has_dp"] = df["full_text"].str.contains(r"\bdp\b", regex=True)
pd.crosstab(df["problem_class"], df["has_dp"], normalize="index")


has_dp,False,True
problem_class,Unnamed: 1_level_1,Unnamed: 2_level_1
easy,1.0,0.0
hard,1.0,0.0
medium,0.999288,0.000712


In [142]:
df["has_constraint"] = df["full_text"].str.contains(r"\bconstraint", regex=True)
pd.crosstab(df["problem_class"], df["has_constraint"], normalize="index")


has_constraint,False,True
problem_class,Unnamed: 1_level_1,Unnamed: 2_level_1
easy,0.986945,0.013055
hard,0.964451,0.035549
medium,0.972954,0.027046


## Fitting Data and Feature Extraction 

In [144]:
from sklearn.model_selection import train_test_split

X_text = df["full_text"]
y_class = df["problem_class"]
y_score = df["problem_score"]

X_text_train, X_text_test, y_class_train, y_class_test, y_score_train, y_score_test = (
    train_test_split(
        X_text,
        y_class,
        y_score,
        test_size=0.2,
        random_state=42,
        stratify=y_class
    )
)


In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=8000,
    min_df=2,
    max_df=0.9
)

X_tfidf_train = tfidf.fit_transform(X_text_train)
X_tfidf_test = tfidf.transform(X_text_test)


In [148]:
import re

def extract_numeric_features(text):
    return np.array([
        len(text),                               # char count
        len(text.split()),                      # word count
        sum(c.isdigit() for c in text),         # digit count
        sum(text.count(s) for s in "+-*/%<=>"), # math symbol count
        len(re.findall(r"[.!?]", text))         # sentence count
    ], dtype=float)

X_numeric = np.vstack(df["full_text"].apply(extract_numeric_features))

X_num_train = X_numeric[X_text_train.index]
X_num_test = X_numeric[X_text_test.index]


# Tried Different Approaches 

## Linear SVC With Scaling 

In [150]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)


In [152]:
from scipy.sparse import hstack

X_train = hstack([X_tfidf_train, X_num_train_scaled])
X_test = hstack([X_tfidf_test, X_num_test_scaled])


In [154]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(
    class_weight="balanced",
    max_iter=5000
)

svm_clf.fit(X_train, y_class_train)



0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [156]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = svm_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_class_test, y_pred))
print(confusion_matrix(y_class_test, y_pred))
print(classification_report(y_class_test, y_pred))


Accuracy: 0.4787363304981774
[[ 65  41  47]
 [ 47 235 107]
 [ 43 144  94]]
              precision    recall  f1-score   support

        easy       0.42      0.42      0.42       153
        hard       0.56      0.60      0.58       389
      medium       0.38      0.33      0.36       281

    accuracy                           0.48       823
   macro avg       0.45      0.45      0.45       823
weighted avg       0.47      0.48      0.47       823



## Tried to do regression first and Then fit Problems into bins

In [158]:
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

reg.fit(X_train, y_score_train)


0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [160]:
y_score_pred = reg.predict(X_test)


In [164]:
def score_to_class(score):
    if score <= 2.8:
        return "easy"
    elif score <= 5.5:
        return "medium"
    else:
        return "hard"

y_class_pred_from_score = [score_to_class(s) for s in y_score_pred]


In [166]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_class_test, y_class_pred_from_score))
print(confusion_matrix(y_class_test, y_class_pred_from_score))
print(classification_report(y_class_test, y_class_pred_from_score))


Accuracy: 0.4507897934386391
[[  3   9 141]
 [  0 169 220]
 [  0  82 199]]
              precision    recall  f1-score   support

        easy       1.00      0.02      0.04       153
        hard       0.65      0.43      0.52       389
      medium       0.36      0.71      0.47       281

    accuracy                           0.45       823
   macro avg       0.67      0.39      0.34       823
weighted avg       0.61      0.45      0.41       823



In [168]:
X_num = np.vstack(df["full_text"].apply(extract_numeric_features))

X_num_train = X_num[X_text_train.index]
X_num_test = X_num[X_text_test.index]


## Tried Scaling with Ridge Regression for Regression 

In [170]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)


In [172]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=1.0)
reg.fit(X_num_train_scaled, y_score_train)


0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [174]:
y_score_pred = reg.predict(X_num_test_scaled)


In [216]:
def score_to_class(score):
    if score <= 0.8:
        return "easy"
    elif score <= 3.2:
        return "medium"
    else:
        return "hard"

y_class_pred = [score_to_class(s) for s in y_score_pred]


In [218]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_class_test, y_class_pred))
print(confusion_matrix(y_class_test, y_class_pred))
print(classification_report(y_class_test, y_class_pred))


Accuracy: 0.4726609963547995
[[  0 153   0]
 [  0 389   0]
 [  0 281   0]]
              precision    recall  f1-score   support

        easy       0.00      0.00      0.00       153
        hard       0.47      1.00      0.64       389
      medium       0.00      0.00      0.00       281

    accuracy                           0.47       823
   macro avg       0.16      0.33      0.21       823
weighted avg       0.22      0.47      0.30       823



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Numeric-only regression

In [220]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)


In [222]:
NUMERIC_WEIGHT = 5.0 

X_train = hstack([
    X_tfidf_train,
    X_num_train_scaled * NUMERIC_WEIGHT
])

X_test = hstack([
    X_tfidf_test,
    X_num_test_scaled * NUMERIC_WEIGHT
])


In [224]:
from sklearn.svm import LinearSVC

clf = LinearSVC(
    class_weight="balanced",
    C=1.0,
    max_iter=5000
)

clf.fit(X_train, y_class_train)




0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [225]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_class_test, y_pred))
print(confusion_matrix(y_class_test, y_pred))
print(classification_report(y_class_test, y_pred))


Accuracy: 0.48116646415552855
[[ 65  41  47]
 [ 48 236 105]
 [ 43 143  95]]
              precision    recall  f1-score   support

        easy       0.42      0.42      0.42       153
        hard       0.56      0.61      0.58       389
      medium       0.38      0.34      0.36       281

    accuracy                           0.48       823
   macro avg       0.45      0.46      0.45       823
weighted avg       0.47      0.48      0.48       823



## Ordinal Decomposition

In [228]:
# harder than easy?
y_gt_easy = (df["problem_class"] != "easy").astype(int)

# harder than medium?
y_gt_medium = (df["problem_class"] == "hard").astype(int)


In [232]:
y_gt_easy_train = y_gt_easy.loc[X_text_train.index]
y_gt_easy_test  = y_gt_easy.loc[X_text_test.index]

y_gt_medium_train = y_gt_medium.loc[X_text_train.index]
y_gt_medium_test  = y_gt_medium.loc[X_text_test.index]


In [234]:
clf_easy = LinearSVC(class_weight="balanced", max_iter=5000)
clf_medium = LinearSVC(class_weight="balanced", max_iter=5000)

clf_easy.fit(X_train, y_gt_easy_train)
clf_medium.fit(X_train, y_gt_medium_train)




0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [236]:
pred_easy = clf_easy.predict(X_test)      # 0 = easy, 1 = not easy
pred_medium = clf_medium.predict(X_test)  # 1 = hard


In [238]:
final_pred = []

for pe, pm in zip(pred_easy, pred_medium):
    if pe == 0:
        final_pred.append("easy")
    elif pm == 1:
        final_pred.append("hard")
    else:
        final_pred.append("medium")


In [240]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_class_test, final_pred))
print(confusion_matrix(y_class_test, final_pred))
print(classification_report(y_class_test, final_pred))


Accuracy: 0.48602673147023084
[[ 74  32  47]
 [ 39 216 134]
 [ 47 124 110]]
              precision    recall  f1-score   support

        easy       0.46      0.48      0.47       153
        hard       0.58      0.56      0.57       389
      medium       0.38      0.39      0.38       281

    accuracy                           0.49       823
   macro avg       0.47      0.48      0.48       823
weighted avg       0.49      0.49      0.49       823



## Trail On Sample 

In [248]:
from scipy.sparse import hstack
import numpy as np

def predict_problem_class_and_score(
    problem_text,
    tfidf,
    class_model,
    reg_model,
    extract_numeric_features,
    scaler=None
):
    """
    Predict difficulty class and difficulty score for a single problem.
    """

    # 1. Same preprocessing as training
    text = problem_text.lower()
    text = " ".join(text.split())

    # 2. TF-IDF features (for classification)
    X_tfidf = tfidf.transform([text])

    # 3. Numeric features
    num_features = extract_numeric_features(text).reshape(1, -1)

    if scaler is not None:
        num_features_scaled = scaler.transform(num_features)
    else:
        num_features_scaled = num_features

    # 4. Final feature matrix for classification
    X_class = hstack([X_tfidf, num_features_scaled])

    # 5. Predict class
    pred_class = class_model.predict(X_class)[0]

    # 6. Predict score (REGRESSION USES NUMERIC FEATURES ONLY)
    pred_score = reg_model.predict(num_features_scaled)[0]

    return pred_class, float(pred_score)


In [295]:
#  PASTE THE PROBLEM STATEMENT BELOW #

problem_text = """



Given n non-negative integers representing an elevation map where the width of each bar is 1, compute how much water it can trap after raining.

 

Example 1:


Input: height = [0,1,0,2,1,0,1,3,2,1,2,1]
Output: 6
Explanation: The above elevation map (black section) is represented by array [0,1,0,2,1,0,1,3,2,1,2,1]. In this case, 6 units of rain water (blue section) are being trapped.
Example 2:

Input: height = [4,2,0,3,2,5]
Output: 9
 

Constraints:

n == height.length
1 <= n <= 2 * 104
0 <= height[i] <= 105


"""

pred_class, pred_score = predict_problem_class_and_score(
    problem_text,
    tfidf=tfidf,
    class_model=clf,        
    reg_model=reg,          
    extract_numeric_features=extract_numeric_features,
    scaler=scaler
)

print("Predicted Difficulty Class :", pred_class.upper())
print("Predicted Difficulty Score :", round(pred_score, 2))


Predicted Difficulty Class : EASY
Predicted Difficulty Score : 4.05


In [282]:
features = extract_numeric_features(problem_text)

print("\nStructural signals:")
print(f"Characters       : {features[0]}")
print(f"Words            : {features[1]}")
print(f"Digits           : {features[2]}")
print(f"Math symbols     : {features[3]}")
print(f"Sentences        : {features[4]}")



Structural signals:
Characters       : 485.0
Words            : 74.0
Digits           : 10.0
Math symbols     : 11.0
Sentences        : 6.0
