In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load dataset
import chardet

# Sniff the CSV's text encoding from its raw bytes before handing it to pandas.
with open("2023-2024 NBA Player Stats - Regular.csv", "rb") as f:
    # Only a leading sample of the file is read; that sample is what gets analysed.
    sample_bytes = f.read(100000)

# chardet reports its best guess for the encoding together with a confidence score.
result = chardet.detect(sample_bytes)
print(result)



{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [None]:
# Load the season stats. The file is semicolon-delimited and is read with the
# Windows-1252 encoding that the detection step reported.
df = pd.read_csv("2023-2024 NBA Player Stats - Regular.csv", encoding="Windows-1252", delimiter=";")

# Show the first 5 rows
print(df.head())

# Dataset summary. DataFrame.info() writes its report to stdout itself and
# returns None, so it is called bare: wrapping it in print() would append a
# stray "None" line to the output.
df.info()

# Number of rows per position (adjust the column name if the dataset differs)
print(df["Pos"].value_counts())

   Rk            Player   Pos  Age   Tm   G  GS    MP   FG   FGA  ...    FT%  \
0   1  Precious Achiuwa  PF-C   24  TOT  74  18  21.9  3.2   6.3  ...  0.616   
1   1  Precious Achiuwa     C   24  TOR  25   0  17.5  3.1   6.8  ...  0.571   
2   1  Precious Achiuwa    PF   24  NYK  49  18  24.2  3.2   6.1  ...  0.643   
3   2       Bam Adebayo     C   26  MIA  71  71  34.0  7.5  14.3  ...  0.755   
4   3      Ochai Agbaji    SG   23  TOT  78  28  21.0  2.3   5.6  ...  0.661   

   ORB  DRB   TRB  AST  STL  BLK  TOV   PF   PTS  
0  2.6  4.0   6.6  1.3  0.6  0.9  1.1  1.9   7.6  
1  2.0  3.4   5.4  1.8  0.6  0.5  1.2  1.6   7.7  
2  2.9  4.3   7.2  1.1  0.6  1.1  1.1  2.1   7.6  
3  2.2  8.1  10.4  3.9  1.1  0.9  2.3  2.2  19.3  
4  0.9  1.8   2.8  1.1  0.6  0.6  0.8  1.5   5.8  

[5 rows x 30 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk    

In [None]:
# Encode the string identifier columns of df as integers.
# Each column gets its own encoder so that both label mappings stay available
# (re-fitting a single encoder would discard the "Tm" mapping).
tm_encoder = LabelEncoder()
player_encoder = LabelEncoder()
df["Tm"] = tm_encoder.fit_transform(df["Tm"])
df["Player"] = player_encoder.fit_transform(df["Player"])
le = player_encoder  # backward compatibility: `le` used to end up fitted on "Player"

# Feature matrix: drop the target ("Pos") and the identifier columns, matching
# the column set the later train/test cell uses. Previously X was built before
# the encoding above and still held the raw "Player"/"Tm" strings; coercion
# turned both into all-NaN columns whose mean is NaN, so they could not be
# imputed and StandardScaler emitted RuntimeWarnings.
X = df.drop(columns=["Player", "Tm", "Pos"])
y = df["Pos"]

# Coerce every remaining column to numeric; values that fail to parse become NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Impute NaN with the column mean (assignment instead of inplace so the cell
# can be re-run without surprises)
X = X.fillna(X.mean())

# Standardize the numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Check the dtypes to see whether any non-numeric column is left in df
print(df.dtypes)

Rk          int64
Player      int32
Pos        object
Age         int64
Tm          int32
G           int64
GS          int64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:


# Re-inspect the dtype of every column in df.
column_dtypes = df.dtypes
print(column_dtypes)


Rk          int64
Player      int32
Pos        object
Age         int64
Tm          int32
G           int64
GS          int64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object


In [None]:
# Class balance of the target: number of rows for each position label.
position_counts = y.value_counts()
print(position_counts)


Pos
SF       155
SG       154
PF       147
PG       147
C        119
PG-SG      4
C-PF       3
SF-PF      2
PF-C       1
PF-SF      1
SF-SG      1
SG-PG      1
Name: count, dtype: int64


In [None]:
# Drop classes that have fewer than 2 rows: a stratified split needs at least
# two members of every class.
counts = y.value_counts()
valid_classes = counts[counts >= 2].index
df_filtered = df[df["Pos"].isin(valid_classes)]

# Build features and target from the filtered dataset
# NOTE(review): "Rk" stays in the features; it looks like a row rank rather
# than a playing statistic -- confirm whether it should be dropped as well.
X = df_filtered.drop(columns=["Player", "Tm", "Pos"])
y = df_filtered["Pos"]

# Split BEFORE scaling so that the scaler's mean/std are learned from the
# training rows only. Fitting the scaler on the full dataset leaks test-set
# statistics into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize: fit on the training rows, then apply the same transform to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Fully scaled matrix, kept under its original name for any later cell that reads it
X_scaled = scaler.transform(X)


In [None]:
# Build the random forest and train it in a single expression
# (fit() returns the fitted estimator itself).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted position labels for the held-out test rows
y_pred = model.predict(X_test)


In [None]:
# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi Model: {accuracy:.2f}")

# Per-class precision / recall / F1. zero_division=0 reports 0 for classes that
# were never predicted instead of raising an UndefinedMetricWarning per metric.
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix. The label list comes from the fitted model (the name
# `label_encoder` was never defined, which raised a NameError here). Passing
# the same list as `labels=` fixes the matrix's row/column order and size, so
# it always lines up with the tick labels even when a class is absent from the
# test split.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


Akurasi Model: 0.48
              precision    recall  f1-score   support

           C       0.59      0.79      0.68        24
        C-PF       0.00      0.00      0.00         1
          PF       0.36      0.27      0.31        30
          PG       0.71      0.69      0.70        29
       PG-SG       0.00      0.00      0.00         1
          SF       0.39      0.45      0.42        31
          SG       0.34      0.32      0.33        31

    accuracy                           0.48       147
   macro avg       0.34      0.36      0.35       147
weighted avg       0.47      0.48      0.47       147



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NameError: name 'label_encoder' is not defined

<Figure size 600x400 with 0 Axes>

In [None]:
# Compare the number of feature columns in X with the number of non-target
# columns in df.
n_features_in_X = X.shape[1]
n_features_in_df = df.drop(columns=["Pos"]).shape[1]
print("Jumlah fitur dalam X:", n_features_in_X)
print("Jumlah fitur dalam df:", n_features_in_df)


Jumlah fitur dalam X: 27
Jumlah fitur dalam df: 29


In [None]:
# Show which features influence the prediction most.
# The importances must be indexed by the columns the model was trained on (X).
# Indexing by df's columns fails because df still contains "Player" and "Tm",
# which X does not, so the lengths differ (29 vs 27).
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False).plot(kind="bar", figsize=(10,5), title="Feature Importance")
plt.show()


ValueError: Length of values (27) does not match length of index (29)