# **KASDD F1 Lap time - Biasa Aja**

# Import Dataset

### **Sumber**
> https://www.datacamp.com/tutorial/random-forests-classifier-python <br>
> https://www.freecodecamp.org/news/how-to-use-the-tree-based-algorithm-for-machine-learning/ <br>
> https://forecastegy.com/posts/does-random-forest-need-feature-scaling-or-normalization/#:~:text=If%20you%20are%20using%20Random,does%20not%20require%20feature%20scaling. <br>
> https://medium.com/@jackiee.jecksom/clustering-and-principal-component-analysis-pca-from-sklearn-c8ea5fed6648 <br>
> https://365datascience.com/tutorials/python-tutorials/pca-k-means/

In [274]:
# import library dan data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy.stats as scp
import scipy.cluster.hierarchy as shc

from xgboost import XGBClassifier
from numpy.polynomial.polynomial import polyfit
from sklearn.preprocessing import LabelEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif, SelectKBest, mutual_info_regression, SelectPercentile, mutual_info_regression, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import RepeatedStratifiedKFold

train = pd.read_csv("f1_train.csv")
train = train.drop(axis=1, columns=["ID"])

f1_classif = pd.read_csv("f1_test_classif.csv")
id_classif = f1_classif['ID']
f1_classif = f1_classif.drop(axis=1, columns=["ID"])

f1_reg = pd.read_csv("f1_test_reg.csv")
id_reg = f1_reg['ID']
f1_reg = f1_reg.drop(axis=1, columns=["ID"])

# Preprocessing

In [275]:
def cek_duplicates(df):
    if df.duplicated().sum() > 0:
        print("Terdapat", df.duplicated().sum(), "pasang data yang redundan")
        display(df[df.duplicated()])
    else:
        print("Tidak ada data yang redundan")

def cek_null(df):
    col_na = df.isnull().sum().sort_values(ascending=True)
    percent = col_na*100 / len(df)

    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])

    if (missing_data[missing_data['Total'] > 0].shape[0] == 0):
        print("Tidak ditemukan missing value pada dataset")

    else:
        print(missing_data[missing_data['Total'] > 0])

def cek_outlier(df):
    df_numerical = df.select_dtypes(include=['float64', 'int64']) 
    Q1 = df_numerical.quantile(0.25, numeric_only=True)
    Q3 = df_numerical.quantile(0.75, numeric_only=True)

    # Menghitung RUB dan RLB.
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    # Menampilkan banyaknya outlier pada atribut.
    outliers = (df_numerical < lower_limit) | (df_numerical > upper_limit)

    # Menghitung dan menampilkan persentase outlier pada tiap atribut.
    percentage_outliers = (outliers.sum() / len(df)) * 100
    print("Persentase Outlier pada tiap atribut:")
    print(percentage_outliers)

### Handling Missing Value

In [276]:
cek_null(f1_classif)

                    Total    Percent
IsPersonalBest          5   0.212857
Sector2SessionTime      5   0.212857
Sector2Time             5   0.212857
SpeedI2                 5   0.212857
Sector3SessionTime     13   0.553427
Sector3Time            13   0.553427
LapTime                41   1.745424
Sector1Time            52   2.213708
Sector1SessionTime     58   2.469136
SpeedFL                95   4.044274
SpeedST               239  10.174542
SpeedI1               359  15.283099
PitOutTime           2230  94.934014
PitInTime            2259  96.168582
DeletedReason        2312  98.424862


In [277]:
cek_null(f1_reg)

                    Total    Percent
IsPersonalBest          3   0.127768
Sector2SessionTime      3   0.127768
Sector2Time             3   0.127768
SpeedI2                 3   0.127768
Sector3SessionTime      8   0.340716
Sector3Time             8   0.340716
LapTime                32   1.362862
Sector1Time            62   2.640545
Sector1SessionTime     65   2.768313
SpeedFL                91   3.875639
SpeedST               225   9.582624
SpeedI1               364  15.502555
PitOutTime           2215  94.335605
PitInTime            2261  96.294719
DeletedReason        2314  98.551959


In [278]:
cek_null(train)

                    Total    Percent
IsPersonalBest         30   0.159676
SpeedI2                39   0.207579
Sector2SessionTime     39   0.207579
Sector2Time            39   0.207579
Sector3SessionTime     92   0.489674
Sector3Time            92   0.489674
LapTime               290   1.543538
Sector1Time           407   2.166276
Sector1SessionTime    444   2.363211
SpeedFL               775   4.124973
SpeedST              1692   9.005748
SpeedI1              2917  15.525868
PitOutTime          17749  94.469874
PitInTime           18053  96.087928
DeletedReason       18477  98.344688


In [279]:
f1_classif = f1_classif.drop(axis=1, columns=['PitOutTime', 'PitInTime', 'DeletedReason'])
f1_reg = f1_reg.drop(axis=1, columns=['PitOutTime', 'PitInTime', 'DeletedReason'])
train = train.drop(axis=1, columns=['PitOutTime', 'PitInTime', 'DeletedReason'])

In [280]:
def show_data_type_and_distribution(df):
    print("Menampilkan informasi dataset dan tipe data")
    print('#'*50)
    df.info()
    print('#'*50)
    print("Menampilkan distribusi data numerik")
    numerics = ['SpeedI2', 'SpeedFL', 'SpeedST', 'SpeedI1']
    for col in numerics:
        df_feature = f1_classif[col]
        plt.figure(figsize=(10,5))
        plt.title(f'Distribusi data {col}')
        sns.kdeplot(df_feature, fill=True)
        plt.show()
    print("Menampilkan Modus dari data kategorikal")
    categoricals = ["IsPersonalBest", "Sector2SessionTime", "Sector2Time","Sector3SessionTime", "Sector3Time", "LapTime", "Sector1Time", "Sector1SessionTime"]
    for col in categoricals:
        print(f"Mode for {col} = {f1_classif[col].mode()[0]}")

In [281]:
def fill_null_mean(df):
    mean = df.mean()
    filled_df = df.fillna(mean)
    return filled_df
def fill_null_median(df):
    median = df.median()
    filled_df = df.fillna(median)
    return filled_df
def fill_null_mode(df):
    mode = df.mode()[0]
    filled_df = df.fillna(mode)
    return filled_df

In [282]:
f1_reg['SpeedI2'] = fill_null_median(f1_reg['SpeedI2'])
f1_reg['SpeedFL'] = fill_null_median(f1_reg['SpeedFL'])
f1_reg['SpeedST'] = fill_null_mean(f1_reg['SpeedST'])
f1_reg['SpeedI1'] = fill_null_median(f1_reg['SpeedI1'])
f1_reg['IsPersonalBest'] = fill_null_mode(f1_reg['IsPersonalBest'])
f1_reg['Sector2SessionTime'] = fill_null_mode(f1_reg['Sector2SessionTime'])
f1_reg['Sector2Time'] = fill_null_mode(f1_reg['Sector2Time'])
f1_reg['Sector3SessionTime'] = fill_null_mode(f1_reg['Sector3SessionTime'])
f1_reg['Sector3Time'] = fill_null_mode(f1_reg['Sector3Time'])
f1_reg['LapTime'] = fill_null_mode(f1_reg['LapTime'])
f1_reg['Sector1Time'] = fill_null_mode(f1_reg['Sector1Time'])
f1_reg['Sector1SessionTime'] = fill_null_mode(f1_reg['Sector1SessionTime'])

In [283]:
f1_classif['SpeedI2'] = fill_null_median(f1_classif['SpeedI2'])
f1_classif['SpeedFL'] = fill_null_median(f1_classif['SpeedFL'])
f1_classif['SpeedST'] = fill_null_mean(f1_classif['SpeedST'])
f1_classif['SpeedI1'] = fill_null_median(f1_classif['SpeedI1'])
f1_classif['IsPersonalBest'] = fill_null_mode(f1_classif['IsPersonalBest'])
f1_classif['Sector2SessionTime'] = fill_null_mode(f1_classif['Sector2SessionTime'])
f1_classif['Sector2Time'] = fill_null_mode(f1_classif['Sector2Time'])
f1_classif['Sector3SessionTime'] = fill_null_mode(f1_classif['Sector3SessionTime'])
f1_classif['Sector3Time'] = fill_null_mode(f1_classif['Sector3Time'])
f1_classif['LapTime'] = fill_null_mode(f1_classif['LapTime'])
f1_classif['Sector1Time'] = fill_null_mode(f1_classif['Sector1Time'])
f1_classif['Sector1SessionTime'] = fill_null_mode(f1_classif['Sector1SessionTime'])

In [284]:
train['SpeedI2'] = fill_null_median(train['SpeedI2'])
train['SpeedFL'] = fill_null_median(train['SpeedFL'])
train['SpeedST'] = fill_null_mean(train['SpeedST'])
train['SpeedI1'] = fill_null_median(train['SpeedI1'])
train['IsPersonalBest'] = fill_null_mode(train['IsPersonalBest'])
train['Sector2SessionTime'] = fill_null_mode(train['Sector2SessionTime'])
train['Sector2Time'] = fill_null_mode(train['Sector2Time'])
train['Sector3SessionTime'] = fill_null_mode(train['Sector3SessionTime'])
train['Sector3Time'] = fill_null_mode(train['Sector3Time'])
train['LapTime'] = fill_null_mode(train['LapTime'])
train['Sector1Time'] = fill_null_mode(train['Sector1Time'])
train['Sector1SessionTime'] = fill_null_mode(train['Sector1SessionTime'])

In [285]:
f1_classif.drop_duplicates()
f1_reg.drop_duplicates()
train.drop_duplicates()

Unnamed: 0,Time,LapTime,Stint,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,...,LapStartTime,Deleted,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed,Pos_cat
0,0 days 02:26:25.496000,0 days 00:01:40.943000,6.0,0 days 00:00:46.163000,0 days 00:00:28.979000,0 days 00:00:25.801000,0 days 02:25:36.926000,0 days 02:26:05.905000,0 days 02:26:31.706000,283.0,...,0 days 02:24:44.553000,False,17.1,68.0,1008.0,False,24.8,157,3.0,Participant
1,0 days 01:13:51.732000,0 days 00:01:48.067000,1.0,0 days 00:00:39.129000,0 days 00:00:43.601000,0 days 00:00:25.337000,0 days 01:12:42.794000,0 days 01:13:26.395000,0 days 01:13:51.732000,199.0,...,0 days 01:12:03.665000,False,25.0,49.0,1008.5,False,43.3,275,1.0,Podium
2,0 days 02:22:35.139000,0 days 00:01:22.881000,2.0,0 days 00:00:28.890000,0 days 00:00:18.554000,0 days 00:00:35.437000,0 days 02:21:41.150000,0 days 02:21:59.704000,0 days 02:22:35.141000,275.0,...,0 days 02:21:12.258000,False,17.6,57.0,1018.2,False,29.9,0,0.6,Participant
3,0 days 01:03:36.822000,0 days 00:01:31.585000,1.0,0 days 00:00:28.315000,0 days 00:00:28.929000,0 days 00:00:30.478000,0 days 01:04:58.279000,0 days 01:03:06.412000,0 days 01:03:36.927000,266.0,...,0 days 01:02:04.960000,False,17.6,67.0,1007.7,False,29.5,135,1.8,Podium
4,0 days 01:45:40.072000,0 days 00:01:34.742000,2.0,0 days 00:00:34.895000,0 days 00:00:29.874000,0 days 00:00:29.973000,0 days 01:44:40.174000,0 days 01:45:10.048000,0 days 01:45:40.021000,274.0,...,0 days 01:44:05.330000,False,26.1,55.0,1011.0,False,31.0,271,1.3,Point
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18783,0 days 01:27:00.913000,0 days 00:01:25.235000,2.0,0 days 00:00:30.620000,0 days 00:00:30.668000,0 days 00:00:23.947000,0 days 01:26:06.294000,0 days 01:26:36.962000,0 days 01:27:00.909000,276.0,...,0 days 01:25:35.678000,False,29.3,32.0,988.4,False,49.9,252,1.2,Participant
18784,0 days 01:21:32.309000,0 days 00:01:40.363000,1.0,0 days 00:00:29.488000,0 days 00:00:42.838000,0 days 00:00:28.037000,0 days 01:20:21.469000,0 days 01:21:04.307000,0 days 01:21:32.344000,297.0,...,0 days 01:19:51.946000,False,30.0,72.0,1009.0,False,36.8,145,1.1,Participant
18785,0 days 02:00:31.678000,0 days 00:01:52.495000,3.0,0 days 00:00:32.659000,0 days 00:00:49.567000,0 days 00:00:30.269000,0 days 01:59:11.871000,0 days 02:00:01.438000,0 days 02:00:31.707000,309.0,...,0 days 01:58:39.183000,False,17.6,66.0,966.7,False,30.3,146,1.1,Participant
18786,0 days 01:07:59.947000,0 days 00:01:25.938000,1.0,0 days 00:00:30.564000,0 days 00:00:31.370000,0 days 00:00:24.004000,0 days 01:07:04.556000,0 days 01:07:35.926000,0 days 01:07:59.930000,256.0,...,0 days 01:06:34.009000,False,28.8,33.0,988.5,False,50.1,134,1.5,Participant


### Handling Outliers

In [286]:
print(cek_outlier(f1_classif))

Persentase Outlier pada tiap atribut:
Stint             0.255428
SpeedI1           2.852278
SpeedI2           0.510856
SpeedFL           1.277139
SpeedST           8.173691
TyreLife          1.575138
AirTemp           0.000000
Humidity          0.000000
Pressure         16.560238
TrackTemp         0.000000
WindDirection     0.000000
WindSpeed         7.747978
dtype: float64
None


In [287]:
print(cek_outlier(f1_reg))

Persentase Outlier pada tiap atribut:
Stint             0.340716
SpeedI1           3.066440
SpeedI2           0.255537
SpeedFL           0.979557
SpeedST           7.708688
AirTemp           0.000000
Humidity          0.000000
Pressure         16.269165
TrackTemp         0.000000
WindDirection     0.000000
WindSpeed         6.090290
dtype: float64
None


In [288]:
print(cek_outlier(train))

Persentase Outlier pada tiap atribut:
Stint             0.298063
SpeedI1           2.746434
SpeedI2           0.548222
SpeedFL           1.319991
SpeedST           7.259953
TyreLife          1.948052
AirTemp           0.000000
Humidity          0.000000
Pressure         15.829253
TrackTemp         0.000000
WindDirection     0.000000
WindSpeed         8.026400
dtype: float64
None


> Atribut `Presure` digunakan untuk memprediksi `TyreLife`, sehingga tidak kami drop .

### Encoding Categorical Attributes

In [289]:
for col in f1_classif.select_dtypes(include=object).columns:
    print(col, f": {len(f1_classif[col].unique())}", f1_classif[col].unique())
    print("\n")

Time : 2349 ['0 days 01:19:09.993000' '0 days 01:46:56.673000'
 '0 days 01:58:13.959000' ... '0 days 01:16:20.505000'
 '0 days 02:38:25.861000' '0 days 01:46:48.433000']


LapTime : 2243 ['0 days 00:01:25.001000' '0 days 00:01:32.378000'
 '0 days 00:01:37.974000' ... '0 days 00:01:40.339000'
 '0 days 00:01:40.794000' '0 days 00:01:33.528000']


Sector1Time : 2135 ['0 days 00:00:30.631000' '0 days 00:00:31.596000'
 '0 days 00:00:30.942000' ... '0 days 00:00:29.114000'
 '0 days 00:00:26.874000' '0 days 00:00:31.922000']


Sector2Time : 2189 ['0 days 00:00:30.430000' '0 days 00:00:34.849000'
 '0 days 00:00:42.828000' ... '0 days 00:00:42.991000'
 '0 days 00:00:40.382000' '0 days 00:00:35.533000']


Sector3Time : 2135 ['0 days 00:00:23.940000' '0 days 00:00:25.933000'
 '0 days 00:00:24.204000' ... '0 days 00:00:28.234000'
 '0 days 00:00:33.538000' '0 days 00:00:26.073000']


Sector1SessionTime : 2290 ['0 days 01:18:15.630000' '0 days 01:45:55.886000'
 '0 days 01:57:06.877000' ... '0 days 0

In [290]:
f1_classif['Compound'] = f1_classif['Compound'].map({'INTERMEDIATE':1, 'MEDIUM':4, 'HARD':2, 'SOFT':3, 'WET':0})

f1_reg['Compound'] = f1_reg['Compound'].map({'INTERMEDIATE':1, 'MEDIUM':4, 'HARD':2, 'SOFT':3, 'WET':0})
f1_reg['Pos_cat'] = f1_reg['Pos_cat'].map({'Participant':0, 'Podium':2, 'Point':1})

train['Compound'] = train['Compound'].map({'INTERMEDIATE':1, 'MEDIUM':4, 'HARD':2, 'SOFT':3, 'WET':0})
train['Pos_cat'] = train['Pos_cat'].map({'Participant':0, 'Podium':2, 'Point':1})

In [291]:
def zero_days_remover (duration):
    return duration.replace('0 days ', '')

In [292]:
time_att = []
for col in f1_classif.select_dtypes(include=object).columns:
    time_att.append(col)

In [293]:
time_att

['Time',
 'LapTime',
 'Sector1Time',
 'Sector2Time',
 'Sector3Time',
 'Sector1SessionTime',
 'Sector2SessionTime',
 'Sector3SessionTime',
 'LapStartTime']

In [294]:
for time in time_att:
    f1_classif[time] = f1_classif[time].apply(zero_days_remover)
    f1_reg[time] = f1_reg[time].apply(zero_days_remover)
    train[time] = train[time].apply(zero_days_remover)

Untuk memudahkan encoding waktu, kami menghilangkan 0 days karena seluruh data memiliki 0 days dan hal tersebut tidak membantu kami dalam melakukan encoding waktu

In [295]:
import datetime
import time


In [296]:
def time_converter (time_str):
    if '.' in time_str:
        time_str, milliseconds = time_str.split('.')
        milliseconds = int(milliseconds)
    else:
        milliseconds = 0

    x_time = datetime.datetime.strptime(time_str, '%H:%M:%S')

    total_seconds = datetime.timedelta(
        hours=x_time.hour,
        minutes=x_time.minute,
        seconds=x_time.second,
        microseconds=milliseconds / 1000  # Convert milliseconds to microseconds
    ).total_seconds()
    
    return total_seconds

In [297]:
for time in time_att:
    f1_classif[time] = f1_classif[time].apply(time_converter)
    f1_reg[time] = f1_reg[time].apply(time_converter)
    train[time] = train[time].apply(time_converter)

In [298]:
boolean_columns = ['IsPersonalBest', 'FreshTyre', 'Deleted', 'Rainfall']
f1_classif = pd.get_dummies(f1_classif, columns = boolean_columns, drop_first=True) 
f1_reg = pd.get_dummies(f1_reg, columns = boolean_columns, drop_first=True) 
train = pd.get_dummies(train, columns = boolean_columns, drop_first=True) 

# **KLASIFIKASI**

In [299]:
# Split Dataset Klasifikasi
X_class_train = train.drop(columns=['Pos_cat'], axis=1)
y_class_train = train['Pos_cat']

X_class_test = f1_classif

In [300]:
from sklearn.preprocessing import PolynomialFeatures

scaler = StandardScaler()
scaler.fit(X_class_train)

X_class_train = scaler.transform(X_class_train)
X_class_test = scaler.transform(X_class_test)

poly = PolynomialFeatures(degree=2)

X_class_train = poly.fit_transform(X_class_train)
X_class_test = poly.transform(X_class_test)

## Random Forest

In [301]:
# Random Forest Hyperparameter
from sklearn.model_selection import RandomizedSearchCV
param_grid = {'criterion': ['entropy', 'gini'],
               'min_samples_split': [5, 10, 15, 20, 25],
               'min_samples_leaf': [1, 2, 4],
               'max_depth' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 
               }

rf_f1_hp = RandomForestClassifier()
clf_rfc_f1 = RandomizedSearchCV(rf_f1_hp, param_distributions=param_grid, cv=3, n_iter=100, n_jobs=-1, verbose=2, random_state=42)
clf_rfc_f1.fit(X_class_train, y_class_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
rf_f1_hp_applied = RandomForestClassifier(**clf_rfc_f1.best_params_)
rf_f1_hp_applied.fit(X_class_train, y_class_train)
# Memprediksi data testing
predicted = rf_f1_hp_applied.predict(X_class_test)

In [None]:
classif_df = pd.DataFrame({
    'ID' : id_classif,
    'Pos_cat' : predicted
})

classif_df['Pos_cat'] = classif_df['Pos_cat'].map({0: 'Participant', 2: 'Podium', 1: 'Point'})

classif_df.to_csv('f1_classif_result.csv', index=False)

## **REGRESI**

In [None]:
# Split dan seleksi fitur Dataset Klasifikasi
X_regress_train = train.drop(columns=['TyreLife'], axis=1)
y_regress_train = train['TyreLife']
X_regress_test = f1_reg

#Standarisasi
scaler = MinMaxScaler()

X_regress_train = scaler.fit_transform(X_regress_train)
X_regress_test = scaler.transform(X_regress_test)

In [None]:
ridge = Ridge(alpha=0.01)
f1ridge = ridge.fit(X_regress_train, y_regress_train)

In [None]:
predicted = f1ridge.predict(X_regress_test)

In [None]:
regress_df = pd.DataFrame({
    'ID' : id_reg,
    'TyreLife' : predicted
})

regress_df.to_csv('f1_reg_result.csv', index=False)