In [11]:
import pandas as pd

In [12]:
# Read the training and test splits from the local data folder.
df_train = pd.read_csv(filepath_or_buffer="07_data/train_data.csv")
df_test = pd.read_csv(filepath_or_buffer="07_data/test_data.csv")

In [13]:
# Preview the first rows of the training frame (rendered as the cell output).
df_train.head()

Unnamed: 0,ID,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,...,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,1,27,79,Low,High,Yes,8,63,High,Yes,...,Low,Medium,Public,Negative,5,No,College,Moderate,Female,69
1,2,16,86,High,Medium,Yes,7,94,Medium,Yes,...,Low,High,Public,Neutral,3,No,High School,Moderate,Female,69
2,3,22,87,Low,Medium,No,8,83,Low,Yes,...,Low,Medium,Public,Neutral,1,No,College,Far,Male,66
3,4,18,100,High,Medium,Yes,10,86,Medium,Yes,...,Medium,Medium,Public,Neutral,3,No,High School,Near,Male,72
4,5,35,78,High,Low,Yes,10,99,Medium,Yes,...,Low,Medium,Private,Positive,2,No,High School,Near,Male,72


In [14]:
# Print per-column dtypes and non-null counts, which shows where values are missing.
df_train.info()
# Summary statistics for every column: count/unique/top/freq for the
# categorical ones, mean/std/quantiles for the numeric ones.
df_train.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5285 entries, 0 to 5284
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   ID                          5285 non-null   int64 
 1   Hours_Studied               5285 non-null   int64 
 2   Attendance                  5285 non-null   int64 
 3   Parental_Involvement        5285 non-null   object
 4   Access_to_Resources         5285 non-null   object
 5   Extracurricular_Activities  5285 non-null   object
 6   Sleep_Hours                 5285 non-null   int64 
 7   Previous_Scores             5285 non-null   int64 
 8   Motivation_Level            5285 non-null   object
 9   Internet_Access             5285 non-null   object
 10  Tutoring_Sessions           5285 non-null   int64 
 11  Family_Income               5285 non-null   object
 12  Teacher_Quality             5225 non-null   object
 13  School_Type                 5285 non-null   obje

Unnamed: 0,ID,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,...,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
count,5285.0,5285.0,5285.0,5285,5285,5285,5285.0,5285.0,5285,5285,...,5285,5225,5285,5285,5285.0,5285,5216,5234,5285,5285.0
unique,,,,3,3,2,,,3,2,...,3,3,2,3,,2,3,3,2,
top,,,,Medium,Medium,Yes,,,Medium,Yes,...,Low,Medium,Public,Positive,,No,High School,Near,Male,
freq,,,,2690,2663,3163,,,2688,4873,...,2146,3141,3683,2152,,4704,2591,3090,3054,
mean,2643.0,19.967077,79.873605,,,,7.024409,75.061116,,,...,,,,,2.96859,,,,,67.215137
std,1525.792417,5.993282,11.519161,,,,1.458215,14.406537,,,...,,,,,1.033942,,,,,3.922228
min,1.0,1.0,60.0,,,,4.0,50.0,,,...,,,,,0.0,,,,,56.0
25%,1322.0,16.0,70.0,,,,6.0,63.0,,,...,,,,,2.0,,,,,65.0
50%,2643.0,20.0,80.0,,,,7.0,75.0,,,...,,,,,3.0,,,,,67.0
75%,3964.0,24.0,90.0,,,,8.0,88.0,,,...,,,,,4.0,,,,,69.0


In [15]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with engineered feature columns appended.

    The input frame is left untouched. The following columns are added, in
    this order:

    * ``study_efficiency``: ``Hours_Studied / (Tutoring_Sessions + 1)``.
    * ``attendance_ratio``: ``Attendance / 100``.
    * ``sleep_quality``: ``Sleep_Hours * attendance_ratio``.
    * ``resource_and_parents``: ``"<Parental_Involvement>_<Access_to_Resources>"``,
      or missing (NaN) when either source value is missing.
    * ``Hours_Studied_sq`` and ``Previous_Scores_sq``: squared terms.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least ``Hours_Studied``, ``Tutoring_Sessions``,
        ``Attendance``, ``Sleep_Hours``, ``Parental_Involvement``,
        ``Access_to_Resources`` and ``Previous_Scores``.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns followed by the engineered ones.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    out = df.copy()

    # The +1 keeps the denominator non-zero for rows with zero tutoring sessions.
    out["study_efficiency"] = out["Hours_Studied"] / (out["Tutoring_Sessions"] + 1)
    out["attendance_ratio"] = out["Attendance"] / 100.0
    out["sleep_quality"] = out["Sleep_Hours"] * out["attendance_ratio"]

    parents = out["Parental_Involvement"]
    resources = out["Access_to_Resources"]
    combined = parents.astype(str) + "_" + resources.astype(str)
    # ``astype(str)`` can turn a missing value into a literal "nan"/"None"
    # token, which would then look like a real category and skip any
    # downstream imputation. Keep the combination missing whenever either
    # input is missing.
    out["resource_and_parents"] = combined.where(parents.notna() & resources.notna())

    out["Hours_Studied_sq"] = out["Hours_Studied"] ** 2
    out["Previous_Scores_sq"] = out["Previous_Scores"] ** 2

    return out


# Run both splits through the same feature builder so they end up with
# matching engineered columns.
df_train_fe = df_train.pipe(add_features)
df_test_fe = df_test.pipe(add_features)

In [16]:
TARGET = "Exam_Score"
ID_COL = "ID"

# Every column except the target and the row identifier is a model input.
feature_cols = [col for col in df_train_fe.columns if col not in (TARGET, ID_COL)]

# Split the inputs into numerical vs categorical: object-dtype columns are
# treated as categorical, everything else as numeric.
cat_cols = list(df_train_fe[feature_cols].select_dtypes("object").columns)
num_cols = [col for col in feature_cols if col not in cat_cols]

# Numeric branch: fill gaps with the median, then standardise.
numeric_trf = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_trf = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its own preprocessing branch.
pre = ColumnTransformer(
    transformers=[
        ("num", numeric_trf, num_cols),
        ("cat", cat_trf, cat_cols),
    ]
)

# Gradient boosting with Huber loss, seeded for reproducibility.
model = GradientBoostingRegressor(loss="huber", alpha=0.9, random_state=42)
pipe = Pipeline(steps=[("pre", pre), ("gb", model)])

In [17]:
# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn reports MAE as a negated score; flip the sign to get real MAE.
mae_scores = -cross_val_score(
    estimator=pipe,
    X=df_train_fe[feature_cols],
    y=df_train_fe[TARGET],
    scoring="neg_mean_absolute_error",
    cv=cv,
)
print(f"MAE (5‑fold): mean={mae_scores.mean():.3f} ± {mae_scores.std():.3f}")

MAE (5‑fold): mean=0.808 ± 0.046


In [18]:
# Refit the pipeline on the full training set and store its predictions for
# the test rows; ``fit`` returns the fitted pipeline, so the calls chain.
df_test["pred_exam_score"] = pipe.fit(
    df_train_fe[feature_cols], df_train_fe[TARGET]
).predict(df_test_fe[feature_cols])

In [19]:
# Reference statistic: mean study hours over the training set.
mean_hours = df_train["Hours_Studied"].mean()

# Subtask 1: absolute gap between each test row's study hours and the
# training mean, rounded to two decimals.
df_test["diff_hours_abs"] = df_test["Hours_Studied"].sub(mean_hours).abs().round(2)

# Subtask 2: flag test rows with fewer than 7 hours of sleep.
df_test["sleep_few"] = df_test["Sleep_Hours"].lt(7)

# Subtask 3: for each test row, how many training rows have a previous
# score greater than or equal to that row's previous score.
prev_scores_train = df_train["Previous_Scores"]
df_test["count_prev_ge"] = df_test["Previous_Scores"].apply(
    lambda score: int(prev_scores_train.ge(score).sum())
)

# Subtask 4: for each test row, how many training rows share its motivation
# level; levels absent from the training set count as 0.
mot_counts = df_train["Motivation_Level"].value_counts()
df_test["count_same_motivation"] = (
    df_test["Motivation_Level"]
    .map(mot_counts)
    .fillna(0)
    .astype(int)
)

In [20]:
# Build the long-format submission: one (subtaskID, datapointID, answer) row
# per subtask for every test datapoint, keeping the five answers of a given
# datapoint adjacent and in subtask order.
answer_cols = [
    ID_COL,
    "diff_hours_abs",
    "sleep_few",
    "count_prev_ge",
    "count_same_motivation",
    "pred_exam_score",
]

rows = []
# itertuples over just the needed columns avoids building a Series per row,
# which is what iterrows() does.
for raw_id, diff_hours, sleep_few, n_prev_ge, n_same_motivation, pred in df_test[
    answer_cols
].itertuples(index=False, name=None):
    idx = int(raw_id)
    rows.extend(
        [
            (1, idx, float(diff_hours)),
            (2, idx, bool(sleep_few)),
            (3, idx, int(n_prev_ge)),
            (4, idx, int(n_same_motivation)),
            # Round to the nearest integer score. The previous
            # int(round(pred, 2)) rounded to two decimals and then truncated
            # toward zero, so e.g. 67.9 was submitted as 67.
            (5, idx, int(round(float(pred)))),
        ]
    )

submission = pd.DataFrame(rows, columns=["subtaskID", "datapointID", "answer"])
submission.to_csv("07_data/submission.csv", index=False)