In [1]:
import sys  # System-specific parameters and functions
import numpy as np  # Fundamental package for scientific computing with Python
import pandas as pd  # Powerful data structures for data manipulation and analysis
from datetime import datetime  # Basic date and time types
import warnings  # Warning control
warnings.filterwarnings('ignore')  # Ignore warnings

In [2]:
df = pd.read_csv('/kaggle/input/competition3/data-tbtl/annonimized.csv')
df = df[(df['is_final'] != 0) | (df['pre_score'] != 10000)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 288863 entries, 0 to 295197
Data columns (total 11 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   concat('it001',`assignment_id`)  288863 non-null  object
 1   concat('it001',`problem_id`)     288863 non-null  object
 2   concat('it001', username)        288863 non-null  object
 3   is_final                         288863 non-null  int64 
 4   status                           288863 non-null  object
 5   pre_score                        288863 non-null  int64 
 6   coefficient                      288863 non-null  int64 
 7   concat('it001',`language_id`)    288863 non-null  object
 8   created_at                       288863 non-null  object
 9   updated_at                       288863 non-null  object
 10  judgement                        288863 non-null  object
dtypes: int64(3), object(8)
memory usage: 26.4+ MB


In [3]:
df = df.rename(columns={"concat('it001',`assignment_id`)": 'assignment_id'})
df = df.rename(columns={"concat('it001',`problem_id`)":'problem_id'})
df = df.rename(columns={"concat('it001', username)":'username'})

In [4]:
df = df.drop(["concat('it001',`language_id`)", 'updated_at'], axis=1)

# Tần xuất submit mỗi giờ trong ngày

In [5]:
def calculate_frequency_vector(student_df):
    a = pd.to_datetime(student_df['created_at'], format='%m-%d %H:%M:%S', errors='coerce').dt.hour
    a = a.dropna()  # Drop rows with NaT (errors in parsing)

    hour_counts = a.value_counts().sort_index()
    frequency_vector = np.zeros(24)
    for hour, count in hour_counts.items():
        hour = int(hour)
        frequency_vector[hour] = count
    return frequency_vector

# Dictionary to store frequency vectors for each student
frequency_vectors = {}

# Iterate over each unique student and calculate their frequency vector
for username in df["username"].unique():
    student_df = df[df["username"] == username]
    frequency_vectors[username] = calculate_frequency_vector(student_df)

In [6]:
frequency_df = pd.DataFrame.from_dict(frequency_vectors, orient='index', columns=[f'hour_{i}' for i in range(24)]).reset_index()
frequency_df.rename(columns={'index': "username"}, inplace=True)

In [7]:
df = df.merge(frequency_df, on="username")

In [8]:
# Thêm năm
fixed_year = 2024  # Chọn một năm bất kỳ
df['created_at'] = [f"{fixed_year}-{date}" for date in df['created_at']]

In [9]:
# Thay đổi thời gian theo định dạng mới
df['created_at'] = pd.to_datetime(df['created_at'])
def adjust_year(date):
    if date.month >= 9:
        return date.replace(year=date.year - 1)
    return date

# Áp dụng hàm này vào cột 'created_at'
df['created_at'] = df['created_at'].apply(adjust_year)

# Xử lý assignment

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
label_encoder = LabelEncoder()

# Fit và transform cột assignment_id
df['assignment_id_encoded'] = label_encoder.fit_transform(df['assignment_id'])

# Hiển thị kết quả
print(df[['username', 'assignment_id', 'assignment_id_encoded']])

                                        username  \
0       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
1       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
2       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
3       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
4       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
...                                          ...   
288858  232cce96362898f08e9150ba244adaf2d6583ab2   
288859  232cce96362898f08e9150ba244adaf2d6583ab2   
288860  232cce96362898f08e9150ba244adaf2d6583ab2   
288861  232cce96362898f08e9150ba244adaf2d6583ab2   
288862  232cce96362898f08e9150ba244adaf2d6583ab2   

                                   assignment_id  assignment_id_encoded  
0       90ce27571176d87961b565d5ef4b3de33ede04ac                    116  
1       90ce27571176d87961b565d5ef4b3de33ede04ac                    116  
2       90ce27571176d87961b565d5ef4b3de33ede04ac                    116  
3       90ce27571176d87961b565d5ef4b3de33ede04ac                    116  
4    

In [14]:
def calculate_status_assignment_vector(student_df):
    status_counts = student_df.groupby('assignment_id_encoded')['status'].value_counts()
    a = status_counts[status_counts.index.get_level_values(1) != 'SCORE']
    status_counts_vector = np.zeros(203)
    for i, count in a.items():
        status_counts_vector[i[0]] = count
    return status_counts_vector

In [15]:
def calculate_count_problem_vector(student_df):
    a = student_df.groupby('assignment_id_encoded')['problem_id'].nunique()
    problem_counts_vector = np.zeros(203)
    for i, count in a.items():
        problem_counts_vector[i] = count
    return problem_counts_vector

In [19]:
# Dictionary to store vectors for each student
status_counts_vector = {}
problem_counts_vector = {}

In [20]:
for username in df["username"].unique():
    student_df = df[df["username"] == username]
    status_counts_vector[username] = calculate_status_assignment_vector(student_df)
    problem_counts_vector[username] = calculate_count_problem_vector(student_df)

In [21]:
status_counts_df = pd.DataFrame.from_dict(status_counts_vector, orient='index', columns=[f'status_counts_vector{i}' for i in range(203)]).reset_index()
status_counts_df.rename(columns={'index': "username"}, inplace=True)

problem_counts_df = pd.DataFrame.from_dict(problem_counts_vector, orient='index', columns=[f'problem_counts_vector{i}' for i in range(203)]).reset_index()
problem_counts_df.rename(columns={'index': "username"}, inplace=True)

In [22]:
df = df.merge(status_counts_df, on="username")

df = df.merge(problem_counts_df, on="username")


# Xử lý problem 

In [23]:
label_encoder1 = LabelEncoder()

# Fit và transform cột assignment_id
df['problem_id_encoded'] = label_encoder1.fit_transform(df['problem_id'])

# Hiển thị kết quả
print(df[['username', 'assignment_id', 'problem_id_encoded']])

                                        username  \
0       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
1       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
2       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
3       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
4       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
...                                          ...   
288858  232cce96362898f08e9150ba244adaf2d6583ab2   
288859  232cce96362898f08e9150ba244adaf2d6583ab2   
288860  232cce96362898f08e9150ba244adaf2d6583ab2   
288861  232cce96362898f08e9150ba244adaf2d6583ab2   
288862  232cce96362898f08e9150ba244adaf2d6583ab2   

                                   assignment_id  problem_id_encoded  
0       90ce27571176d87961b565d5ef4b3de33ede04ac                 208  
1       90ce27571176d87961b565d5ef4b3de33ede04ac                 208  
2       90ce27571176d87961b565d5ef4b3de33ede04ac                 208  
3       90ce27571176d87961b565d5ef4b3de33ede04ac                 335  
4       90ce27571176

In [24]:
def calculate_problem_vector(student_df):
    a = student_df['problem_id_encoded'].unique()

    problem_vector = np.zeros(468)
    for value in a:
        problem_vector[value] = 1
    return problem_vector

In [25]:
def calculate_count_problem_vector(student_df):
    a = student_df.groupby('problem_id_encoded').size()
    count_problem_vector = np.zeros(468)
    for i, count in a.items():
        count_problem_vector[i] = count
    return count_problem_vector

In [26]:
def calculate_time_problem_vector(student_df):
    a = student_df.groupby('problem_id_encoded')['created_at'].agg(lambda x: (x.max() - x.min()).total_seconds() // 3600)
    time_problem_vector = np.zeros(468)
    for i, count in a.items():
        time_problem_vector[i] = count
    return time_problem_vector

In [27]:
def calculate_count_problem_0_vector(student_df):
    count_problem_0 = student_df.groupby('problem_id_encoded')['is_final'].value_counts()
    a = count_problem_0[count_problem_0.index.get_level_values(1) == 0]
    count_problem_0_vector = np.zeros(468)
    for i, count in a.items():
        count_problem_0_vector[i[0]] = count
    return count_problem_0_vector

In [28]:
def calculate_mean_prescrore_problem_vector(student_df):
    a = student_df.groupby('problem_id_encoded')['pre_score'].mean()
    mean_prescrore_problem_vector = np.zeros(468)
    for i, count in a.items():
        if count > 0:
            mean_prescrore_problem_vector[i] = np.log(count)
        else:
            mean_prescrore_problem_vector[i] = 0
    return mean_prescrore_problem_vector

In [29]:
problem_vector = {}
count_problem_vector = {}
time_problem_vector = {}
count_problem_0_vector = {}
mean_prescrore_problem_vector = {}

In [30]:
for username in df["username"].unique():
    student_df = df[df["username"] == username]
    problem_vector[username] = calculate_problem_vector(student_df)
    count_problem_vector[username] = calculate_count_problem_vector(student_df)
    time_problem_vector[username] = calculate_time_problem_vector(student_df)
    count_problem_0_vector[username] = calculate_count_problem_0_vector(student_df)
    mean_prescrore_problem_vector[username] = calculate_mean_prescrore_problem_vector(student_df)

In [31]:
problem_df = pd.DataFrame.from_dict(problem_vector, orient='index', columns=[f'problem_vector{i}' for i in range(468)]).reset_index()
problem_df.rename(columns={'index': "username"}, inplace=True)

count_problem_df = pd.DataFrame.from_dict(count_problem_vector, orient='index', columns=[f'count_problem_vector{i}' for i in range(468)]).reset_index()
count_problem_df.rename(columns={'index': "username"}, inplace=True)

time_problem_df = pd.DataFrame.from_dict(time_problem_vector, orient='index', columns=[f'time_problem_vector{i}' for i in range(468)]).reset_index()
time_problem_df.rename(columns={'index': "username"}, inplace=True)

count_problem_0_df = pd.DataFrame.from_dict(count_problem_0_vector, orient='index', columns=[f'count_problem_0_vector{i}' for i in range(468)]).reset_index()
count_problem_0_df.rename(columns={'index': "username"}, inplace=True)

mean_prescrore_problem_df = pd.DataFrame.from_dict(mean_prescrore_problem_vector, orient='index', columns=[f'mean_prescrore_problem_vector{i}' for i in range(468)]).reset_index()
mean_prescrore_problem_df.rename(columns={'index': "username"}, inplace=True)

In [32]:
df = df.merge(problem_df, on="username")

df = df.merge(count_problem_df, on="username")

df = df.merge(time_problem_df, on="username")

df = df.merge(count_problem_0_df, on="username")

df = df.merge(mean_prescrore_problem_df, on="username")

# Recude memory usage

In [33]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [34]:
df = reduce_mem_usage(df)

Memory usage after optimization is: 1541.59 MB
Decreased by 74.8%


In [35]:
!pip install lightgbm catboost



In [36]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb
import optuna
from sklearn.model_selection import cross_validate
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.ensemble import VotingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

In [37]:
train_term1 = df.drop(['assignment_id','problem_id','is_final','status','pre_score','coefficient','created_at','judgement'], axis = 1)

In [38]:
qt_train = pd.read_csv('/kaggle/input/competition3/public_it001/qt-public.csv')

In [39]:
qt_train.isnull().sum()
qt_train = qt_train.rename(columns={"hash":'username'})
qt_train.dropna(inplace=True)
train_term1 = train_term1.merge(qt_train, on='username', how='inner')
train_term1.drop_duplicates(subset='username', keep='first', inplace=True)
train_term1['diemqt'] = train_term1['diemqt'].str.replace('\xa0', ' ', regex=True)
train_term1['diemqt'] = train_term1['diemqt'].replace(' ', np.nan).astype(float)
train_term1.dropna(inplace=True)

In [40]:
X_train= train_term1.drop(columns=["diemqt", "username"])
y = train_term1['diemqt']
y = y.astype(float)

In [41]:
y = np.asarray(y)
X_pca = np.asarray(X_train)

In [42]:
def objective_lgb(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "n_estimators": 1000,
        "verbosity": -1,
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        'device': 'gpu',
    }
    cv = KFold(n_splits=5, shuffle=True,random_state = 42)
    r2_list = []

    for train_index, test_index in cv.split(X_pca,y):
        X_train_fold, X_test = X_pca[train_index], X_pca[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = lgb.LGBMRegressor(**params)

        model.fit(X_train_fold, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        r2_list.append(r2)

    return np.mean(r2_list)

In [43]:
def objective_cat(trial):
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 3, 8),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'eval_metric': 'RMSE',
        'random_seed': 42,
        'verbose': False,
        'loss_function':'RMSE',
        'task_type': 'GPU'
    }

    cv = KFold(n_splits=5, shuffle=True,random_state = 42)
    r2_list = []

    for train_index, test_index in cv.split(X_pca,y):
        X_train_fold, X_test = X_pca[train_index], X_pca[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = CatBoostRegressor(**params)
        model.fit(X_train_fold, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        r2_list.append(r2)

    return np.mean(r2_list)

In [44]:
study_lgb = optuna.create_study(direction="minimize")
study_lgb.optimize(objective_lgb, n_trials=30)
print('Best parameters for LightGBM:', study_lgb.best_params)

[I 2024-06-27 18:49:27,474] A new study created in memory with name: no-name-995e5feb-545c-40ff-b32d-bc43df9127e5
[I 2024-06-27 18:49:44,994] Trial 0 finished with value: 0.34540462780390746 and parameters: {'learning_rate': 0.006228763350003019, 'num_leaves': 549, 'subsample': 0.5494367576007241, 'colsample_bytree': 0.2264260129903331, 'min_data_in_leaf': 26}. Best is trial 0 with value: 0.34540462780390746.
[I 2024-06-27 18:49:47,874] Trial 1 finished with value: 0.270905226720764 and parameters: {'learning_rate': 0.022511079326359047, 'num_leaves': 367, 'subsample': 0.4788764556416591, 'colsample_bytree': 0.0578123488609055, 'min_data_in_leaf': 81}. Best is trial 1 with value: 0.270905226720764.
[I 2024-06-27 18:49:51,991] Trial 2 finished with value: 0.29803611133391217 and parameters: {'learning_rate': 0.00622066865707042, 'num_leaves': 526, 'subsample': 0.7424201330437578, 'colsample_bytree': 0.7272379393794938, 'min_data_in_leaf': 77}. Best is trial 1 with value: 0.2709052267207

Best parameters for LightGBM: {'learning_rate': 0.023150863954719984, 'num_leaves': 839, 'subsample': 0.23380126793992645, 'colsample_bytree': 0.9559755275835972, 'min_data_in_leaf': 92}


In [45]:
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(objective_cat, n_trials=30)
print('Best parameters for CatBoost:', study_cat.best_params)

[I 2024-06-27 18:51:56,500] A new study created in memory with name: no-name-20db82c8-477c-4719-9876-63082580a074
[I 2024-06-27 18:52:06,028] Trial 0 finished with value: 0.3014993554572426 and parameters: {'learning_rate': 0.044622680614353843, 'depth': 5, 'l2_leaf_reg': 0.003777639427760866, 'iterations': 147}. Best is trial 0 with value: 0.3014993554572426.
[I 2024-06-27 18:52:22,393] Trial 1 finished with value: 0.344350981393828 and parameters: {'learning_rate': 0.07063399079565157, 'depth': 8, 'l2_leaf_reg': 3.0511046018941705, 'iterations': 210}. Best is trial 0 with value: 0.3014993554572426.
[I 2024-06-27 18:52:37,314] Trial 2 finished with value: 0.3204579286108822 and parameters: {'learning_rate': 0.025404633492631605, 'depth': 7, 'l2_leaf_reg': 0.0054263126533591775, 'iterations': 387}. Best is trial 0 with value: 0.3014993554572426.
[I 2024-06-27 18:52:53,339] Trial 3 finished with value: 0.2989892478779722 and parameters: {'learning_rate': 0.0386373662785382, 'depth': 7, 

Best parameters for CatBoost: {'learning_rate': 0.010068123038950086, 'depth': 3, 'l2_leaf_reg': 0.018807915546575443, 'iterations': 913}


In [46]:
lgb_best_params = study_lgb.best_params
cat_best_params = study_cat.best_params

kf = KFold(n_splits=5, shuffle=True, random_state=42)
voting_r2 = []

for train_index, test_index in kf.split(X_pca):
    X_train_fold, X_test = X_pca[train_index], X_pca[test_index]
    y_train, y_test = y[train_index], y[test_index]

    lgb_model = lgb.LGBMRegressor(**lgb_best_params)
    cat_model = CatBoostRegressor(**cat_best_params)

    voting_model = VotingRegressor(estimators=[
        ('lgb', lgb_model),
        ('cat', cat_model)
    ])
    cat_model.fit(X_train_fold, y_train)
    lgb_model.fit(X_train_fold, y_train)
    voting_model.fit(X_train_fold, y_train)
    y_pred = voting_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    voting_r2.append(r2)

print(f'Voting Regressor R^2: {np.mean(voting_r2):.4f} ± {np.std(voting_r2):.4f}')


0:	learn: 1.7688813	total: 19.1ms	remaining: 17.4s
1:	learn: 1.7660052	total: 27.7ms	remaining: 12.6s
2:	learn: 1.7628779	total: 36.6ms	remaining: 11.1s
3:	learn: 1.7601227	total: 45.1ms	remaining: 10.2s
4:	learn: 1.7573586	total: 53.5ms	remaining: 9.71s
5:	learn: 1.7548579	total: 62.2ms	remaining: 9.41s
6:	learn: 1.7515377	total: 71.5ms	remaining: 9.25s
7:	learn: 1.7476583	total: 80.2ms	remaining: 9.07s
8:	learn: 1.7455079	total: 87.9ms	remaining: 8.83s
9:	learn: 1.7428656	total: 95ms	remaining: 8.58s
10:	learn: 1.7410325	total: 102ms	remaining: 8.36s
11:	learn: 1.7380205	total: 109ms	remaining: 8.15s
12:	learn: 1.7352677	total: 117ms	remaining: 8.11s
13:	learn: 1.7335420	total: 129ms	remaining: 8.26s
14:	learn: 1.7310112	total: 140ms	remaining: 8.36s
15:	learn: 1.7281223	total: 149ms	remaining: 8.38s
16:	learn: 1.7262003	total: 159ms	remaining: 8.39s
17:	learn: 1.7237674	total: 172ms	remaining: 8.56s
18:	learn: 1.7204672	total: 182ms	remaining: 8.54s
19:	learn: 1.7171896	total: 195ms

In [47]:
import joblib

In [48]:
joblib.dump(voting_model, 'voting_regressor.joblib')
joblib.dump(lgb_model, 'LGBMRegressor.joblib')
joblib.dump(cat_model, 'CatBoostRegressor.joblib')

['CatBoostRegressor.joblib']