In [1]:
import sys  # System-specific parameters and functions
import numpy as np  # Fundamental package for scientific computing with Python
import pandas as pd  # Powerful data structures for data manipulation and analysis
from datetime import datetime  # Basic date and time types
import warnings  # Warning control
warnings.filterwarnings('ignore')  # Ignore warnings

In [2]:
df = pd.read_csv('/kaggle/input/competition3/data-tbtl/annonimized.csv')
# Discard rows that are simultaneously non-final (is_final == 0) and carry
# pre_score == 10000; every other row is kept.
is_excluded = (df['is_final'] == 0) & (df['pre_score'] == 10000)
df = df[~is_excluded]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 288863 entries, 0 to 295197
Data columns (total 11 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   concat('it001',`assignment_id`)  288863 non-null  object
 1   concat('it001',`problem_id`)     288863 non-null  object
 2   concat('it001', username)        288863 non-null  object
 3   is_final                         288863 non-null  int64 
 4   status                           288863 non-null  object
 5   pre_score                        288863 non-null  int64 
 6   coefficient                      288863 non-null  int64 
 7   concat('it001',`language_id`)    288863 non-null  object
 8   created_at                       288863 non-null  object
 9   updated_at                       288863 non-null  object
 10  judgement                        288863 non-null  object
dtypes: int64(3), object(8)
memory usage: 26.4+ MB


In [3]:
# Replace the SQL-expression column headers with plain identifiers.
df = df.rename(columns={
    "concat('it001',`assignment_id`)": 'assignment_id',
    "concat('it001',`problem_id`)": 'problem_id',
    "concat('it001', username)": 'username',
})

In [4]:
# The language id and the update timestamp are not used by any later cell shown here.
df = df.drop(columns=["concat('it001',`language_id`)", 'updated_at'])

In [5]:
def calculate_frequency_vector(student_df):
    """Count one student's submissions per hour of day.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student. ``created_at`` holds either
        ``'MM-DD HH:MM:SS'`` strings (no year) or already-parsed datetimes.

    Returns
    -------
    numpy.ndarray
        Float array of length 24; element ``h`` is the number of rows whose
        timestamp falls in hour ``h``. Rows whose timestamp cannot be parsed
        are ignored.
    """
    created = student_df['created_at']
    if pd.api.types.is_datetime64_any_dtype(created):
        # Column was already converted to datetimes; no string parsing needed.
        hours = created.dt.hour
    else:
        # The strings carry no year. Parsing them bare defaults to 1900, which
        # is not a leap year, so every valid "02-29 ..." row was coerced to NaT
        # and silently dropped. Anchor on a leap year instead.
        anchored = '2024-' + created.astype(str)
        hours = pd.to_datetime(anchored, format='%Y-%m-%d %H:%M:%S', errors='coerce').dt.hour

    # Rows that failed to parse are NaT -> NaN hour; leave them out of the counts.
    hours = hours.dropna().astype(int).to_numpy()
    # Hours are 0..23, so minlength=24 yields exactly one slot per hour.
    return np.bincount(hours, minlength=24).astype(float)

# Hour-of-day histogram for every student, keyed by username.
frequency_vectors = {}

# groupby splits the frame once instead of re-scanning every row with a fresh
# boolean mask per student; sort=False keeps first-appearance order, matching
# the order df["username"].unique() would give.
for username, student_df in df.groupby("username", sort=False):
    frequency_vectors[username] = calculate_frequency_vector(student_df)

In [6]:
# One row per student: the username plus 24 hourly count columns.
hour_columns = [f'hour_{i}' for i in range(24)]
frequency_df = (
    pd.DataFrame.from_dict(frequency_vectors, orient='index', columns=hour_columns)
    .reset_index()
    .rename(columns={'index': "username"})
)

In [7]:
# Broadcast each student's hourly counts onto all of that student's rows.
df = pd.merge(df, frequency_df, on="username")

In [8]:
# Placeholder year prepended to the year-less timestamps. 2024 is a leap year,
# so a "02-29" entry still forms a valid date.
# NOTE: running this cell twice prefixes the year twice.
fixed_year = 2024
df['created_at'] = df['created_at'].map(lambda date: f"{fixed_year}-{date}")

In [9]:
df['created_at'] = pd.to_datetime(df['created_at'])


def adjust_year(date):
    """Return `date` moved back one year when its month is September or later.

    Dates in January through August are returned unchanged.
    """
    return date.replace(year=date.year - 1) if date.month >= 9 else date


# Apply the adjustment to every value of the 'created_at' column
df['created_at'] = df['created_at'].apply(adjust_year)

# Xử lý assignment

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
label_encoder = LabelEncoder()

# Learn the assignment_id -> integer code mapping, then encode the column with it
label_encoder.fit(df['assignment_id'])
df['assignment_id_encoded'] = label_encoder.transform(df['assignment_id'])

# Show the result
print(df[['username', 'assignment_id', 'assignment_id_encoded']])

                                        username  \
0       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
1       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
2       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
3       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
4       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
...                                          ...   
288858  232cce96362898f08e9150ba244adaf2d6583ab2   
288859  232cce96362898f08e9150ba244adaf2d6583ab2   
288860  232cce96362898f08e9150ba244adaf2d6583ab2   
288861  232cce96362898f08e9150ba244adaf2d6583ab2   
288862  232cce96362898f08e9150ba244adaf2d6583ab2   

                                   assignment_id  assignment_id_encoded  
0       90ce27571176d87961b565d5ef4b3de33ede04ac                    116  
1       90ce27571176d87961b565d5ef4b3de33ede04ac                    116  
2       90ce27571176d87961b565d5ef4b3de33ede04ac                    116  
3       90ce27571176d87961b565d5ef4b3de33ede04ac                    116  
4    

In [12]:
def calculate_assignment_vector(student_df):
    """Flag which encoded assignments appear in a student's rows.

    Returns a float array of length 203 holding 1 at every index found in
    ``student_df['assignment_id_encoded']`` and 0 elsewhere.
    """
    # 203 is hard-coded; presumably the number of distinct assignment codes
    # produced by the LabelEncoder -- confirm against the data.
    seen_codes = student_df['assignment_id_encoded'].unique()
    indicator = np.zeros(203)
    indicator[np.asarray(seen_codes, dtype=np.intp)] = 1
    return indicator

In [13]:
def calculate_count_assignment_vector(student_df):
    """Count a student's rows per encoded assignment.

    Returns a float array of length 203; index ``k`` holds the number of rows
    whose ``assignment_id_encoded`` equals ``k`` (0 when there are none).
    """
    rows_per_code = student_df.groupby('assignment_id_encoded').size()
    totals = np.zeros(203)
    # Group keys are unique, so each slot is written at most once.
    totals[np.asarray(rows_per_code.index, dtype=np.intp)] = rows_per_code.to_numpy()
    return totals

In [14]:
def calculate_status_assignment_vector(student_df):
    """Record, per encoded assignment, a count of rows whose status is not 'SCORE'.

    Returns a float array of length 203. For every (assignment, status) pair
    with status != 'SCORE', the pair's row count is written to the
    assignment's slot.
    """
    per_status = student_df.groupby('assignment_id_encoded')['status'].value_counts()
    is_non_score = per_status.index.get_level_values(1) != 'SCORE'

    # NOTE(review): when one assignment has several distinct non-'SCORE'
    # statuses, later pairs overwrite earlier ones (the counts are not summed).
    # Kept as-is because the downstream models consume this feature; confirm
    # whether a sum was intended.
    last_count = {pair[0]: n for pair, n in per_status[is_non_score].items()}

    status_vector = np.zeros(203)
    for code, n in last_count.items():
        status_vector[code] = n
    return status_vector

In [15]:
def calculate_count_problem_vector(student_df):
    """Count distinct problems a student touched within each encoded assignment.

    Returns a float array of length 203; index ``k`` holds the number of
    unique ``problem_id`` values among rows with ``assignment_id_encoded == k``.
    """
    # NOTE(review): a later cell defines another function with this exact name
    # (per-problem submission counts, length 468). After that cell runs, this
    # definition is shadowed, so re-running the assignment loop would call the
    # wrong function. Consider giving the two functions distinct names.
    distinct_problems = student_df.groupby('assignment_id_encoded')['problem_id'].nunique()
    per_assignment = np.zeros(203)
    per_assignment[np.asarray(distinct_problems.index, dtype=np.intp)] = distinct_problems.to_numpy()
    return per_assignment

In [16]:
def calculate_time_assignment_vector(student_df):
    """Measure, per encoded assignment, the whole hours between first and last row.

    Returns a float array of length 203; index ``k`` holds
    ``(max(created_at) - min(created_at))`` in seconds, floor-divided by 3600,
    over the rows with ``assignment_id_encoded == k``.
    Expects ``created_at`` to already be a datetime column.
    """
    stamps = student_df.groupby('assignment_id_encoded')['created_at']
    elapsed = stamps.max() - stamps.min()
    whole_hours = elapsed.dt.total_seconds() // 3600

    span_vector = np.zeros(203)
    span_vector[np.asarray(whole_hours.index, dtype=np.intp)] = whole_hours.to_numpy()
    return span_vector

In [17]:
def calculate_count_0_vector(student_df):
    """Count, per encoded assignment, the rows whose ``is_final`` equals 0.

    Returns a float array of length 203; assignments without such rows stay 0.
    """
    non_final_rows = student_df[student_df['is_final'] == 0]
    tally = non_final_rows.groupby('assignment_id_encoded').size()

    zero_counts = np.zeros(203)
    zero_counts[np.asarray(tally.index, dtype=np.intp)] = tally.to_numpy()
    return zero_counts

In [18]:
def calculate_mean_prescrore_vector(student_df):
    """Store the natural log of the mean ``pre_score`` per encoded assignment.

    Returns a float array of length 203. Slots whose mean is not strictly
    positive (or that have no rows) are left at 0.
    """
    averages = student_df.groupby('assignment_id_encoded')['pre_score'].mean()
    log_means = np.zeros(203)
    # Only strictly positive means have a log taken; the rest keep the 0 default.
    for code, average in averages[averages > 0].items():
        log_means[code] = np.log(average)
    return log_means

In [19]:
# One dictionary per assignment-level feature, each mapping username -> vector
assignment_vector = dict()
count_assignment_vector = dict()
status_counts_vector = dict()
problem_counts_vector = dict()
time_assignment_vector = dict()
counts_0_vector = dict()
mean_prescrore_vector = dict()

In [20]:
# Build the enabled assignment-level vectors for every student.
# groupby splits the frame once instead of masking the whole frame per user;
# sort=False keeps first-appearance order, as df["username"].unique() did.
for username, student_df in df.groupby("username", sort=False):
    #assignment_vector[username] = calculate_assignment_vector(student_df)
    #count_assignment_vector[username] = calculate_count_assignment_vector(student_df)
    status_counts_vector[username] = calculate_status_assignment_vector(student_df)
    problem_counts_vector[username] = calculate_count_problem_vector(student_df)
    #time_assignment_vector[username] = calculate_time_assignment_vector(student_df)
    #counts_0_vector[username] = calculate_count_0_vector(student_df)
    #mean_prescrore_vector[username] = calculate_mean_prescrore_vector(student_df)

In [21]:
# assignment_df = pd.DataFrame.from_dict(assignment_vector, orient='index', columns=[f'assignment_vector{i}' for i in range(203)]).reset_index()
# assignment_df.rename(columns={'index': "username"}, inplace=True)

# count_assignment_df = pd.DataFrame.from_dict(count_assignment_vector, orient='index', columns=[f'count_assignment_vector{i}' for i in range(203)]).reset_index()
# count_assignment_df.rename(columns={'index': "username"}, inplace=True)

# Turn each {username: vector} dict into a frame with a 'username' key column.
status_counts_df = (
    pd.DataFrame.from_dict(
        status_counts_vector,
        orient='index',
        columns=[f'status_counts_vector{i}' for i in range(203)],
    )
    .reset_index()
    .rename(columns={'index': "username"})
)

problem_counts_df = (
    pd.DataFrame.from_dict(
        problem_counts_vector,
        orient='index',
        columns=[f'problem_counts_vector{i}' for i in range(203)],
    )
    .reset_index()
    .rename(columns={'index': "username"})
)

# time_assignment_df = pd.DataFrame.from_dict(time_assignment_vector, orient='index', columns=[f'time_assignment_vector{i}' for i in range(203)]).reset_index()
# time_assignment_df.rename(columns={'index': "username"}, inplace=True)

# counts_0_df = pd.DataFrame.from_dict(counts_0_vector, orient='index', columns=[f'counts_0_vector{i}' for i in range(203)]).reset_index()
# counts_0_df.rename(columns={'index': "username"}, inplace=True)

# mean_prescrore_df = pd.DataFrame.from_dict(mean_prescrore_vector, orient='index', columns=[f'mean_prescrore_vector{i}' for i in range(203)]).reset_index()
# mean_prescrore_df.rename(columns={'index': "username"}, inplace=True)

In [22]:
# df = df.merge(assignment_df, on="username")

# df = df.merge(count_assignment_df, on="username")

# Attach the enabled assignment-level features to every row of the student.
df = pd.merge(df, status_counts_df, on="username")

df = pd.merge(df, problem_counts_df, on="username")

# df = df.merge(time_assignment_df, on="username")

# df = df.merge(counts_0_df, on="username")

# df = df.merge(mean_prescrore_df, on="username")

# Xử lý problem 

In [23]:
label_encoder1 = LabelEncoder()

# Learn the problem_id -> integer code mapping, then encode the column with it
label_encoder1.fit(df['problem_id'])
df['problem_id_encoded'] = label_encoder1.transform(df['problem_id'])

# Show the result (note: the assignment_id column is printed, not problem_id)
print(df[['username', 'assignment_id', 'problem_id_encoded']])

                                        username  \
0       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
1       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
2       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
3       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
4       ed9eaeb6a707f50154024b24d7efcb874a9795dd   
...                                          ...   
288858  232cce96362898f08e9150ba244adaf2d6583ab2   
288859  232cce96362898f08e9150ba244adaf2d6583ab2   
288860  232cce96362898f08e9150ba244adaf2d6583ab2   
288861  232cce96362898f08e9150ba244adaf2d6583ab2   
288862  232cce96362898f08e9150ba244adaf2d6583ab2   

                                   assignment_id  problem_id_encoded  
0       90ce27571176d87961b565d5ef4b3de33ede04ac                 208  
1       90ce27571176d87961b565d5ef4b3de33ede04ac                 208  
2       90ce27571176d87961b565d5ef4b3de33ede04ac                 208  
3       90ce27571176d87961b565d5ef4b3de33ede04ac                 335  
4       90ce27571176

In [24]:
def calculate_problem_vector(student_df):
    """Flag which encoded problems appear in a student's rows.

    Returns a float array of length 468 holding 1 at every index found in
    ``student_df['problem_id_encoded']`` and 0 elsewhere.
    """
    # 468 is hard-coded; presumably the number of distinct problem codes
    # produced by the LabelEncoder -- confirm against the data.
    attempted = student_df['problem_id_encoded'].unique()
    indicator = np.zeros(468)
    indicator[np.asarray(attempted, dtype=np.intp)] = 1
    return indicator

In [25]:
def calculate_count_problem_vector(student_df):
    """Count a student's rows per encoded problem.

    Returns a float array of length 468; index ``k`` holds the number of rows
    whose ``problem_id_encoded`` equals ``k`` (0 when there are none).
    """
    # NOTE(review): this reuses the name of an earlier function that counts
    # distinct problems per assignment (length 203) and shadows it from here
    # on. Consider giving the two functions distinct names.
    rows_per_problem = student_df.groupby('problem_id_encoded').size()
    totals = np.zeros(468)
    totals[np.asarray(rows_per_problem.index, dtype=np.intp)] = rows_per_problem.to_numpy()
    return totals

In [26]:
def calculate_time_problem_vector(student_df):
    """Measure, per encoded problem, the whole hours between first and last row.

    Returns a float array of length 468; index ``k`` holds
    ``(max(created_at) - min(created_at))`` in seconds, floor-divided by 3600,
    over the rows with ``problem_id_encoded == k``.
    Expects ``created_at`` to already be a datetime column.
    """
    stamps = student_df.groupby('problem_id_encoded')['created_at']
    elapsed = stamps.max() - stamps.min()
    whole_hours = elapsed.dt.total_seconds() // 3600

    span_vector = np.zeros(468)
    span_vector[np.asarray(whole_hours.index, dtype=np.intp)] = whole_hours.to_numpy()
    return span_vector

In [27]:
def calculate_count_problem_0_vector(student_df):
    """Count, per encoded problem, the rows whose ``is_final`` equals 0.

    Returns a float array of length 468; problems without such rows stay 0.
    """
    non_final_rows = student_df[student_df['is_final'] == 0]
    tally = non_final_rows.groupby('problem_id_encoded').size()

    zero_counts = np.zeros(468)
    zero_counts[np.asarray(tally.index, dtype=np.intp)] = tally.to_numpy()
    return zero_counts

In [28]:
def calculate_mean_prescrore_problem_vector(student_df):
    """Store the natural log of the mean ``pre_score`` per encoded problem.

    Returns a float array of length 468. Slots whose mean is not strictly
    positive (or that have no rows) are left at 0.
    """
    averages = student_df.groupby('problem_id_encoded')['pre_score'].mean()
    log_means = np.zeros(468)
    # Only strictly positive means have a log taken; the rest keep the 0 default.
    for code, average in averages[averages > 0].items():
        log_means[code] = np.log(average)
    return log_means

In [29]:
# One dictionary per problem-level feature, each mapping username -> vector
problem_vector = dict()
count_problem_vector = dict()
time_problem_vector = dict()
count_problem_0_vector = dict()
mean_prescrore_problem_vector = dict()

In [30]:
# Build every problem-level vector for every student.
# groupby splits the (by now very wide) frame once instead of masking the
# whole frame per user; sort=False keeps first-appearance order, as
# df["username"].unique() did.
for username, student_df in df.groupby("username", sort=False):
    problem_vector[username] = calculate_problem_vector(student_df)
    count_problem_vector[username] = calculate_count_problem_vector(student_df)
    time_problem_vector[username] = calculate_time_problem_vector(student_df)
    count_problem_0_vector[username] = calculate_count_problem_0_vector(student_df)
    mean_prescrore_problem_vector[username] = calculate_mean_prescrore_problem_vector(student_df)

In [31]:
def _vectors_to_frame(vectors, prefix):
    """Convert a {username: length-468 vector} dict into a frame.

    The result has a 'username' column followed by 468 columns named
    ``<prefix>0`` .. ``<prefix>467``.
    """
    column_names = [f'{prefix}{i}' for i in range(468)]
    frame = pd.DataFrame.from_dict(vectors, orient='index', columns=column_names)
    return frame.reset_index().rename(columns={'index': "username"})


problem_df = _vectors_to_frame(problem_vector, 'problem_vector')

count_problem_df = _vectors_to_frame(count_problem_vector, 'count_problem_vector')

time_problem_df = _vectors_to_frame(time_problem_vector, 'time_problem_vector')

count_problem_0_df = _vectors_to_frame(count_problem_0_vector, 'count_problem_0_vector')

mean_prescrore_problem_df = _vectors_to_frame(mean_prescrore_problem_vector, 'mean_prescrore_problem_vector')

In [32]:
# Attach the problem-level features, in this order, to every row of the student.
for feature_frame in (
    problem_df,
    count_problem_df,
    time_problem_df,
    count_problem_0_df,
    mean_prescrore_problem_df,
):
    df = df.merge(feature_frame, on="username")

In [33]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that covers their range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the memory usage after optimization and the
        percentage saved. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered. Integer downcasts keep values exactly. Float downcasts are
    chosen by range only, so converting to float16/float32 can round values.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme sits exactly on a
            # dtype limit (e.g. 127 or -128) still fits that dtype.
            # If nothing narrower fits, the column keeps its current dtype.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # An all-NaN (or empty) column has NaN min/max; every comparison
            # is then False and the column falls through to float64.
            if np.finfo(np.float16).min <= c_min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard against a frame that reports zero bytes before optimization.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(decrease))

    return df

In [34]:
# Shrink the wide feature table before modelling; the summary is printed.
df = reduce_mem_usage(df, verbose=True)

Memory usage after optimization is: 1541.59 MB
Decreased by 74.8%


In [35]:
# %pip installs into the environment of the running kernel; `!pip` runs
# whichever pip is first on the shell PATH, which may be a different one.
%pip install lightgbm catboost



In [36]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb
import optuna
from sklearn.model_selection import cross_validate
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.ensemble import VotingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

In [37]:
# Keep only the username and the engineered feature columns.
train_term1 = df.drop(
    columns=['assignment_id', 'problem_id', 'is_final', 'status', 'pre_score', 'coefficient', 'created_at', 'judgement']
)

In [38]:
# Same column selection as the training frame: username plus engineered features.
test_term1 = df.drop(
    columns=['assignment_id', 'problem_id', 'is_final', 'status', 'pre_score', 'coefficient', 'created_at', 'judgement']
)

In [39]:
# The features are per student, so one row per username is enough.
test_term1 = test_term1.drop_duplicates(subset='username', keep='first')

In [40]:
# Public score file; a later cell renames its `hash` column to `username`.
qt_train = pd.read_csv(filepath_or_buffer='/kaggle/input/competition3/public_it001/qt-public.csv')

In [41]:
# Missing-value counts per column (computed but neither displayed nor stored)
qt_train.isnull().sum()

# Align the key column name with the feature table and drop incomplete rows
qt_train = qt_train.rename(columns={"hash": 'username'}).dropna()

# Attach the score to the features and keep one row per student
train_term1 = (
    train_term1
    .merge(qt_train, on='username', how='inner')
    .drop_duplicates(subset='username', keep='first')
)

# Non-breaking spaces become plain spaces; a value that is exactly one space
# is treated as missing. Everything else must convert to float.
diemqt_text = train_term1['diemqt'].str.replace('\xa0', ' ', regex=True)
train_term1 = train_term1.assign(diemqt=diemqt_text.replace(' ', np.nan).astype(float))
train_term1 = train_term1.dropna()

In [42]:
# Columns present in both frames
common_cols = train_term1.columns.intersection(test_term1.columns)

# New frames restricted to the shared columns
train_term1_common = train_term1.loc[:, common_cols]
df_test_common = test_term1.loc[:, common_cols]

# Rows that occur in only one of the two frames: keep=False drops every row
# that has a duplicate, so rows shared by both frames disappear.
different_rows = (
    pd.concat([train_term1_common, df_test_common])
    .drop_duplicates(keep=False)
    .reset_index(drop=True)
)
different_rows.head(5)

Unnamed: 0,username,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,...,mean_prescrore_problem_vector458,mean_prescrore_problem_vector459,mean_prescrore_problem_vector460,mean_prescrore_problem_vector461,mean_prescrore_problem_vector462,mean_prescrore_problem_vector463,mean_prescrore_problem_vector464,mean_prescrore_problem_vector465,mean_prescrore_problem_vector466,mean_prescrore_problem_vector467
0,ed9eaeb6a707f50154024b24d7efcb874a9795dd,6.0,6.0,1.0,1.0,6.0,8.0,5.0,11.0,17.0,...,0.0,0.0,0.0,0.0,9.03125,0.0,0.0,8.054688,0.0,0.0
1,ba12c0a2cb367af0467e479c03507c71a805d291,1.0,3.0,7.0,10.0,25.0,5.0,16.0,28.0,36.0,...,9.210938,7.417969,0.0,0.0,0.0,0.0,8.875,9.210938,0.0,0.0
2,b7298b0fe50443a623af9b56792b330c2d052845,0.0,24.0,28.0,39.0,22.0,2.0,2.0,4.0,7.0,...,9.210938,9.210938,0.0,0.0,0.0,0.0,9.15625,9.210938,0.0,0.0
3,c60be70309789b39355dc612f36e37090ccad5dc,0.0,4.0,20.0,13.0,9.0,4.0,5.0,10.0,11.0,...,9.210938,9.210938,0.0,0.0,0.0,0.0,8.851562,9.125,0.0,0.0
4,a22a58c5be8aa2c2700619e37f2b7a6e4efa7e6b,0.0,6.0,0.0,0.0,1.0,1.0,19.0,38.0,55.0,...,9.210938,9.210938,0.0,0.0,0.0,0.0,9.101562,9.210938,0.0,0.0


In [43]:
# Model inputs: every column except the identifier.
X_test = different_rows.drop("username", axis=1)

In [44]:
# Plain NumPy matrix of the features. Despite the name, no PCA is applied here.
X_pca = X_test.to_numpy()

In [45]:
import joblib

In [46]:
# joblib.load unpickles the file, which can execute arbitrary code: only load
# artifacts from a trusted source.
lgb_model_path = '/kaggle/input/proccesing-data/LGBMRegressor.joblib'
lgb_model = joblib.load(lgb_model_path)
y_pre1 = lgb_model.predict(X_pca)



In [47]:
# joblib.load unpickles the file, which can execute arbitrary code: only load
# artifacts from a trusted source.
cat_model_path = '/kaggle/input/proccesing-data/CatBoostRegressor.joblib'
cat_model = joblib.load(cat_model_path)
y_pre2 = cat_model.predict(X_pca)

In [48]:
# joblib.load unpickles the file, which can execute arbitrary code: only load
# artifacts from a trusted source.
votting_model_path = '/kaggle/input/proccesing-data/voting_regressor.joblib'
votting_model = joblib.load(votting_model_path)
y_pre3 = votting_model.predict(X_pca)



In [49]:
# Pair every username with its prediction from `lgb_model` and write the
# pairs to CSV without header or index.
# The result gets its own name: rebinding `df` here replaced the feature
# table, so no earlier cell could be re-run afterwards.
username = different_rows['username'].to_list()
results1 = list(zip(username, y_pre1))
submission_1 = pd.DataFrame(results1, columns=['file_name', 'label'])
submission_1.to_csv('output_1.csv', index=False, header=False)

In [50]:
# Pair every username with its prediction from `cat_model` and write the
# pairs to CSV without header or index.
# Usernames are read straight from `different_rows`, so this cell does not
# rely on a variable left behind by another cell, and the result gets its own
# name instead of rebinding `df` (the feature table).
results2 = list(zip(different_rows['username'], y_pre2))
submission_2 = pd.DataFrame(results2, columns=['file_name', 'label'])
submission_2.to_csv('output_2.csv', index=False, header=False)

In [51]:
# Pair every username with its prediction from `votting_model` and write the
# pairs to CSV without header or index.
# Usernames are read straight from `different_rows`, so this cell does not
# rely on a variable left behind by another cell, and the result gets its own
# name instead of rebinding `df` (the feature table).
results3 = list(zip(different_rows['username'], y_pre3))
submission_3 = pd.DataFrame(results3, columns=['file_name', 'label'])
submission_3.to_csv('output_3.csv', index=False, header=False)