In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv("06_data/train_data.csv")
df_test = pd.read_csv("06_data/test_data.csv")

In [3]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,id,gen,judet,NT,MEV,MATE,MGIM,status_admitere
0,0,407,F,BT,4.91,8.82,6.23,6.85,0
1,1,444,F,BC,9.5,7.31,8.97,8.36,1
2,2,117,F,BH,6.23,8.63,5.4,8.54,0
3,3,30,M,SM,1.35,9.49,9.06,5.59,0
4,4,415,M,OT,1.82,6.71,8.67,8.34,0


In [4]:
df_train.info()
df_train.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       270 non-null    int64  
 1   id               270 non-null    int64  
 2   gen              270 non-null    object 
 3   judet            270 non-null    object 
 4   NT               270 non-null    float64
 5   MEV              270 non-null    float64
 6   MATE             270 non-null    float64
 7   MGIM             270 non-null    float64
 8   status_admitere  270 non-null    int64  
dtypes: float64(4), int64(3), object(2)
memory usage: 19.1+ KB


Unnamed: 0.1,Unnamed: 0,id,gen,judet,NT,MEV,MATE,MGIM,status_admitere
count,270.0,270.0,270,270,270.0,270.0,270.0,270.0,270.0
unique,,,2,42,,,,,
top,,,F,BH,,,,,
freq,,,141,12,,,,,
mean,134.5,224.992593,,,5.239259,8.086037,7.571444,7.590037,0.262963
std,78.086491,132.175702,,,2.49506,1.172909,1.418122,1.414046,0.44106
min,0.0,0.0,,,1.1,6.02,5.02,5.01,0.0
25%,67.25,111.25,,,3.005,6.975,6.34,6.41,0.0
50%,134.5,224.0,,,5.2,8.21,7.69,7.675,0.0
75%,201.75,341.5,,,7.2775,9.065,8.77,8.795,1.0


In [5]:
df_train["dif_NT_MEV"] = df_train.NT - df_train.MEV
df_train["ratio_MATE_MGIM"] = df_train.MATE / df_train.MGIM

df_test["dif_NT_MEV"] = df_test.NT - df_test.MEV
df_test["ratio_MATE_MGIM"] = df_test.MATE / df_test.MGIM

In [6]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_cols = ["NT", "MEV", "MATE", "MGIM", "dif_NT_MEV", "ratio_MATE_MGIM"]
cat_cols = ["gen", "judet"]

pre = ColumnTransformer(
    [
        (
            "num",
            Pipeline(
                [("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]
            ),
            num_cols,
        ),
        (
            "cat",
            Pipeline(
                [
                    ("imp", SimpleImputer(strategy="most_frequent")),
                    ("oh", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            cat_cols,
        ),
    ]
)
X = df_train[num_cols + cat_cols]
y = df_train.status_admitere
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = GradientBoostingClassifier(random_state=42)
pipe = Pipeline([("pre", pre), ("gb", model)])

pipe.fit(X_train, y_train)
y_pred_train = pipe.predict(X_train)
y_pred_val = pipe.predict(X_val)

print("Training Classification Report:")
print(classification_report(y_train, y_pred_train))

print("Validation Classification Report:")
print(classification_report(y_val, y_pred_val))

df_test["status_pred"] = pipe.predict(df_test[num_cols + cat_cols])

Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       158
           1       1.00      1.00      1.00        58

    accuracy                           1.00       216
   macro avg       1.00      1.00      1.00       216
weighted avg       1.00      1.00      1.00       216

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        41
           1       0.92      0.92      0.92        13

    accuracy                           0.96        54
   macro avg       0.95      0.95      0.95        54
weighted avg       0.96      0.96      0.96        54



In [7]:
# Subtask 1 & 2
df_test["dif_NT_MEV"] = (df_test.NT - df_test.MEV).round(2)
df_test["loc_MEV"] = df_test.MEV.rank(ascending=False, method="min").astype(int)

In [8]:
rows = []
for _, r in df_test.iterrows():
    rows += [
        (1, int(r.id), f"{r.dif_NT_MEV:.2f}"),
        (2, int(r.id), int(r.loc_MEV)),
        (3, int(r.id), int(r.status_pred)),
    ]

submission = pd.DataFrame(rows, columns=["subtaskID", "datapointID", "answer"])

submission.to_csv("06_data/submission.csv", index=False)