# **KASDD F1 Lap time - Biasa Aja**

# Import Dataset

In [None]:
# import library dan data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy.stats as scp
import scipy.cluster.hierarchy as shc

from decimal import Decimal
from numpy.polynomial.polynomial import polyfit
from sklearn.preprocessing import LabelEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif, SelectKBest, mutual_info_regression, SelectPercentile, mutual_info_regression, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import RepeatedStratifiedKFold


f1_data = pd.read_csv("f1_train.csv")
f1_data = f1_data.drop(axis=1, columns=["ID"])
f1_data

In [None]:
f1_data.info()

# Preprocessing

In [None]:
def cek_duplicates(df):
    if df.duplicated().sum() > 0:
        print("Terdapat", df.duplicated().sum(), "pasang data yang redundan")
        display(df[df.duplicated()])
    else:
        print("Tidak ada data yang redundan")

def cek_null(df):
    col_na = df.isnull().sum().sort_values(ascending=True)
    percent = col_na*100 / len(df)

    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])

    if (missing_data[missing_data['Total'] > 0].shape[0] == 0):
        print("Tidak ditemukan missing value pada dataset")

    else:
        print(missing_data[missing_data['Total'] > 0])

def cek_outlier(df):
    df_numerical = df.select_dtypes(include=['float64', 'int64']) 
    Q1 = df_numerical.quantile(0.25, numeric_only=True)
    Q3 = df_numerical.quantile(0.75, numeric_only=True)

    # Menghitung RUB dan RLB.
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    # Menampilkan banyaknya outlier pada atribut.
    outliers = (df_numerical < lower_limit) | (df_numerical > upper_limit)

    # Menghitung dan menampilkan persentase outlier pada tiap atribut.
    percentage_outliers = (outliers.sum() / len(df)) * 100
    print("Persentase Outlier pada tiap atribut:")
    print(percentage_outliers)

### Handling Missing Value

In [None]:
cek_null(f1_data)

> Atribut `PitOutTime`, `PitInTime`, dan `DeletedReason` memiliki persentase jumlah missing value mencapai lebih dari 90%. Oleh karena itu, atribut-atribut tersebut perlu di drop.

In [None]:
f1_data = f1_data.drop(axis=1, columns=['PitOutTime', 'PitInTime', 'DeletedReason'])

In [None]:
def show_data_type_and_distribution(df):
    print("Menampilkan informasi dataset dan tipe data")
    print('#'*50)
    df.info()
    print('#'*50)
    print("Menampilkan distribusi data numerik")
    numerics = ['SpeedI2', 'SpeedFL', 'SpeedST', 'SpeedI1']
    for col in numerics:
        df_feature = f1_data[col]
        plt.figure(figsize=(10,5))
        plt.title(f'Distribusi data {col}')
        sns.kdeplot(df_feature, fill=True)
        plt.show()
    print("Menampilkan Modus dari data kategorikal")
    categoricals = ["IsPersonalBest", "Sector2SessionTime", "Sector2Time","Sector3SessionTime", "Sector3Time", "LapTime", "Sector1Time", "Sector1SessionTime"]
    for col in categoricals:
        print(f"Mode for {col} = {f1_data[col].mode()[0]}")

In [None]:
show_data_type_and_distribution(f1_data)

In [None]:
def fill_null_mean(df):
    mean = df.mean()
    filled_df = df.fillna(mean)
    return filled_df
def fill_null_median(df):
    median = df.median()
    filled_df = df.fillna(median)
    return filled_df
def fill_null_mode(df):
    mode = df.mode()[0]
    filled_df = df.fillna(mode)
    return filled_df

In [None]:
f1_data['SpeedI2'] = fill_null_median(f1_data['SpeedI2'])
f1_data['SpeedFL'] = fill_null_median(f1_data['SpeedFL'])
f1_data['SpeedST'] = fill_null_mean(f1_data['SpeedST'])
f1_data['SpeedI1'] = fill_null_median(f1_data['SpeedI1'])
f1_data['IsPersonalBest'] = fill_null_mode(f1_data['IsPersonalBest'])
f1_data['Sector2SessionTime'] = fill_null_mode(f1_data['Sector2SessionTime'])
f1_data['Sector2Time'] = fill_null_mode(f1_data['Sector2Time'])
f1_data['Sector3SessionTime'] = fill_null_mode(f1_data['Sector3SessionTime'])
f1_data['Sector3Time'] = fill_null_mode(f1_data['Sector3Time'])
f1_data['LapTime'] = fill_null_mode(f1_data['LapTime'])
f1_data['Sector1Time'] = fill_null_mode(f1_data['Sector1Time'])
f1_data['Sector1SessionTime'] = fill_null_mode(f1_data['Sector1SessionTime'])

In [None]:
cek_null(f1_data)

### Handling Duplicate Value

In [None]:
print("Jumlah duplikasi data : " + str(f1_data.duplicated().sum()))

In [None]:
f1_data = f1_data.drop_duplicates()

### Handling Outliers

In [None]:
f1_data.boxplot(vert=False,figsize=(20,20))
plt.show()

In [None]:
laptime_numerical = f1_data.select_dtypes(include=['float64', 'int64']) 
Q1 = laptime_numerical.quantile(0.25)
Q3 = laptime_numerical.quantile(0.75)
IQR = Q3 - Q1

In [None]:
print(cek_outlier(f1_data))

> Atribut `Presure` digunakan untuk memprediksi `TyreLife`, sehingga tidak kami drop .

### Encoding Categorical Attributes

In [None]:
print("Menampilkan nilai unique yang terdapat pada setiap kolom kategorikal")
print('#'*70)
print()
for col in f1_data.select_dtypes(include=object).columns:
    print(col, f": {len(f1_data[col].unique())}", f1_data[col].unique())
    print("\n")

In [None]:
f1_data['Compound'] = f1_data['Compound'].map({'INTERMEDIATE':1, 'MEDIUM':4, 'HARD':2, 'SOFT':3, 'WET':0})
f1_data['Pos_cat'] = f1_data['Pos_cat'].map({'Participant':0, 'Podium':2, 'Point':1})

In [None]:
def zero_days_remover (duration):
    return duration.replace('0 days ', '')

In [None]:
time_att = []
for col in f1_data.select_dtypes(include=object).columns:
    time_att.append(col)

In [None]:
time_att

In [None]:
for time in time_att:
    f1_data[time] = f1_data[time].apply(zero_days_remover)

Untuk memudahkan encoding waktu, kami menghilangkan 0 days karena seluruh data memiliki 0 days dan hal tersebut tidak membantu kami dalam melakukan encoding waktu

In [None]:
import datetime
import time


In [None]:
def time_converter (time_str):
    if '.' in time_str:
        time_str, milliseconds = time_str.split('.')
        milliseconds = int(milliseconds)
    else:
        milliseconds = 0

    x_time = datetime.datetime.strptime(time_str, '%H:%M:%S')

    total_seconds = datetime.timedelta(
        hours=x_time.hour,
        minutes=x_time.minute,
        seconds=x_time.second,
        microseconds=milliseconds / 1000  # Convert milliseconds to microseconds
    ).total_seconds()
    
    return total_seconds

In [None]:
for time in time_att:
    f1_data[time] = f1_data[time].apply(time_converter)

In [None]:
f1_data

In [None]:
def convert_time_period(seconds):
    if 0 <= seconds < 6 * 3600:
        return 3   # From 00:00 to 05:59
    elif 6 * 3600 <= seconds < 12 * 3600:
        return 0  # From 06:00 to 11:59
    elif 12 * 3600 <= seconds < 17 * 3600:
        return 1  # From 12:00 to 16:59
    elif 17 * 3600 <= seconds < 21 * 3600:
        return 2  # From 17:00 to 20:59
    else:
        return 3  # From 21:00 to 23:59
f1_data['Time'] = f1_data['Time'].apply(convert_time_period)
f1_data['LapStartTime'] = f1_data['LapStartTime'].apply(convert_time_period)

f1_data

# **EKSPLORASI** 

Apakah penggunaan ‘Compound’ yang berbeda berpengaruh terhadap performa? 


Performa dari suatu mobil F1 dapat ditentukan melalui waktu yang ditempuh mobil tersebut selama satu lap, yaitu atribut LapTime. Atribut ini juga kami pilih sebagai representasi performa yang dianalisis pengaruhnya oleh Compound karena dalam balapan F1, tipe compound yang berbeda tidak mungkin dipakai dalam satu lap yang sama. Oleh karena itu pada eksplorasi ini kami hanya mengambil atribut Compound dan LapTime

In [None]:
compount_influence = f1_data.copy()

In [None]:
compount_influence = compount_influence[['Compound', 'LapTime']]
compount_influence

In [None]:
compount_influence['Compound'] = compount_influence['Compound'].map({1: 'INTERMEDIATE', 4: 'MEDIUM', 2: 'HARD', 3: 'SOFT', 0: 'WET'})
compount_influence

In [None]:
compounds = compount_influence['Compound'].unique()
for compound in compounds:
    plt.figure(figsize=(8, 6))
    subset = compount_influence[compount_influence['Compound'] == compound]['LapTime']
    sns.kdeplot(subset, fill=True)
    plt.title(f'KDE of Lap Times for {compound} Compound')
    plt.xlabel('Lap Time')
    plt.ylabel('Density')
    plt.grid(True)
    plt.show()

In [None]:
median_laptimes = compount_influence.groupby('Compound')['LapTime'].median()
median_laptimes = median_laptimes.sort_values()
median_laptimes.plot(kind='bar', figsize=(10, 6), color='skyblue')
plt.title('Median Lap Times by Compound')
plt.xlabel('Compound Type')
plt.ylabel('Median Lap Time')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

> Grafik diatas adalah grafik yang menampilkan median LapTime dari setiap Compound dan diurutkan dari LapTime paling cepat hingga LapTime paling lambat

> Berdasarkan visualisasi diatas, dapat dianalisis bahwa setiap tipe Compound dapat mempengaruhi performa mobil F1 karena menghasilkan LapTime yang berbeda-beda. Performa terbaik didapatkan ketika mobil-mobil F1 menggunakan Compound bertipe SOFT. Lalu, diikuti dengan tipe Compound HARD, MEDIUM, & INTERMEDIATE untuk performa terbaik kedua, ketiga, dan keempat. Untuk performa terburuk, didapatkan ketika mobil F1 menggunakan tipe Compound WET.

Bagaimana ciri-ciri driver dengan kategori posisi ‘Pos_cat’ Podium dibandingkan dengan kategori posisi lainnya?


Analisis perbedaan ketika sirkuit hujan ‘Rainfall’ atau tidak


In [138]:
df_rain = f1_data.copy()
#plt.figure(figsize=(40,40))
#sns.heatmap(df_rain.corr(),annot=True)
#plt.show()
mean_attributes = ['LapTime', 'TyreLife', 'AirTemp', 'Humidity', 'TrackTemp','WindSpeed']
mode_attributes = ['Compound']

reverse_compound_mapping = {1: 'INTERMEDIATE', 4: 'MEDIUM', 2: 'HARD', 3: 'SOFT', 0: 'WET'}
df_rain['Compound'] = df_rain['Compound'].map(reverse_compound_mapping)

In [133]:
rainfall_data = df_rain[df_rain['Rainfall'] == True]
rainfall_mean = rainfall_data[mean_attributes].median()
rainfall_mode = rainfall_data[mode_attributes].mode().iloc[0]

rainfall_combined = pd.concat([rainfall_mean, rainfall_mode])
rainfall_combined


LapTime         100.00078
TyreLife              7.0
AirTemp              17.5
Humidity             66.0
TrackTemp            27.3
WindSpeed             1.7
Compound     INTERMEDIATE
dtype: object

In [134]:
no_rainfall_data = df_rain[df_rain['Rainfall'] == False]
no_rainfall_mean = no_rainfall_data[mean_attributes].median()
no_rainfall_mode = no_rainfall_data[mode_attributes].mode().iloc[0]

no_rainfall_combined = pd.concat([no_rainfall_mean, no_rainfall_mode])
no_rainfall_combined

LapTime      88.0006
TyreLife        12.0
AirTemp         25.6
Humidity        54.0
TrackTemp       34.3
WindSpeed        1.5
Compound        HARD
dtype: object

Di sini, kami akan melakukan perbandingan kondisi balapan ketika sirkuit 'Rainfall' dan tidak dengan asumsi bahwa data ini adalah kumpulan dari balapan-balapan satu musim di lokasi dan waktu yg berbeda. Dari data yang telah diolah, kami menemukan bahwa terdapat beberapa aspek yang berbeda dari sirkuit 'Rainfall' dan tidak.  LapTime dari dua keadaan sirkuit ini berbeda cukup signifikan, di mana dalam keadaan 'Rainfall' diperlukan lebih banyak waktu untuk melakukan 'LapTime' jika dibandingkan dengan track tidak 'Rainfall'. Dari segi 'AirTemp', sirkuit 'Rainfall' memiliki temperatur udara yang lebih rendah. Selain itu, sirkuit 'Rainfall' memiliki temperatur track yang lebih rendah, serta memiliki 'humidity' lebih tinggi jika dibanding track yang tidak 'Rainfall'. Ban Intermediate menjadi ban yang sering dipakai pada sirkuit 'Rainfall'. Di sirkuit yang tidak 'Rainfall', temperatur track lebih tinggi dibanding track 'Rainfall' dan memiliki 'WindDirection' lebih tinggi dibanding track 'Rainfall'. Adapun 'Compound' ban yang sering dipakai untuk sirkuit tidak 'Rainfall' adalah ban berjenis Hard. Kedua tipe sirkuit ini tidak berbeda terlalu jauh jika kita tinjau dari segi 'Pressure' atau tekanan udara.

Adakah rentang umur ban ‘TyreLife’ dengan performa terbaik dibandingkan rentang umur ban lainnya?


# **REGRESI**

In [None]:
f1_data

In [None]:
# Decision Tree  

# **KLASIFIKASI**

Klasifikasi digunakan untuk membuat model dengan target `Pos_cat`

In [None]:
# Split Dataset Klasifikasi
X_classification_temp = f1_data.drop(columns=['Pos_cat'], axis=1)
y_classification = f1_data['Pos_cat']

mi = mutual_info_classif(X_classification_temp, y_classification)
mi = pd.Series(mi)
mi.index = X_classification_temp.columns
mi.sort_values(ascending=False)

KBest = math.ceil(0.2 * len(mi.index))

selector = SelectKBest(f_classif, k=KBest) 
X_classification = selector.fit_transform(X_classification_temp, y_classification)
input_features = selector.feature_names_in_
selector.get_feature_names_out(input_features=input_features)

In [None]:
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(X_classification, y_classification, test_size=0.3, random_state=42)

In [None]:
#Standarisasi
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_class_train)
X_test_scaled = scaler.transform(X_class_test)

In [None]:
# Decision Tree  

In [None]:
# Random Forest

In [None]:
# Decision Tree Hyperparameter

In [None]:
# Random Forest Hyperparameter