In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
# Load the dataset
df = pd.read_csv("Pokedex_Cleaned.csv", encoding="ISO-8859-1")

# Display dataset information
df.info()

# Check the shape of the dataset
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1173 entries, 0 to 1172
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   #               1173 non-null   int64 
 1   Name            1173 non-null   object
 2   Primary Type    1173 non-null   object
 3   Secondary Type  633 non-null    object
 4   Total           1173 non-null   int64 
 5   HP              1173 non-null   int64 
 6   Attack          1173 non-null   int64 
 7   Defense         1173 non-null   int64 
 8   Sp.Atk          1173 non-null   int64 
 9   Sp.Def          1173 non-null   int64 
 10  Speed           1173 non-null   int64 
 11  Variant         190 non-null    object
dtypes: int64(8), object(4)
memory usage: 110.1+ KB


(1173, 12)

In [5]:
# Removing duplicates
df.drop_duplicates(inplace=True)

# Trim whitespace and replace spaces with underscores in column names
df.columns = df.columns.str.strip().str.replace(" ", "_")

# Fill missing numerical values with the median
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing categorical values with "Unknown"
cat_cols = df.select_dtypes(include=["object"]).columns
df[cat_cols] = df[cat_cols].fillna("Unknown")

# Check for remaining missing values
df.isnull().sum()

#                 0
Name              0
Primary_Type      0
Secondary_Type    0
Total             0
HP                0
Attack            0
Defense           0
Sp.Atk            0
Sp.Def            0
Speed             0
Variant           0
Combat_Class      0
dtype: int64

In [23]:
# Create bins for the 'Total' column to classify combat strength
bins = [0, 400, 500, 600, 680, df['Total'].max()]
labels = ['Weak', 'Average', 'Strong', 'Pseudo-Legend', 'Legendary']

# Create Combat_Class column
df['Combat_Class'] = pd.cut(df['Total'], bins=bins, labels=labels, right=False)

# Print some examples to verify
print(df[['Name', 'Total', 'Combat_Class']].sort_values('Total'))

# Encode the Combat_Class
le = LabelEncoder()
df['Combat_Class'] = le.fit_transform(df['Combat_Class'])

# Save the LabelEncoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# Drop unnecessary columns and encode categorical variables
df_class = df.drop(columns=["#", "Name", "Total"])
df_encoded = pd.get_dummies(df_class, columns=["Primary_Type", "Secondary_Type", "Variant"], drop_first=True)

# Define features (X) and target (y)
X = df_encoded.drop(columns=["Combat_Class"])
y = df_encoded["Combat_Class"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            Name  Total Combat_Class
890   Wishiwashi    175         Weak
236      Sunkern    180         Weak
973         Blip    180         Weak
1022     SnomIce    185         Weak
359      Azurill    190         Weak
...          ...    ...          ...
463      Groudon    770    Legendary
465     Rayquaza    780    Legendary
192       Mewtwo    780    Legendary
193       Mewtwo    780    Legendary
1043   Eternatus   1125          NaN

[1172 rows x 3 columns]


In [24]:
X.head()

Unnamed: 0,HP,Attack,Defense,Sp.Atk,Sp.Def,Speed,Primary_Type_Dark,Primary_Type_Dragon,Primary_Type_Electric,Primary_Type_Fairy,...,Variant_Two-Segment Form,Variant_Ultra,Variant_Unbound,Variant_Unknown,Variant_Wash,Variant_White,Variant_White Plumage,Variant_White-Striped Form,Variant_Yellow Plumage,Variant_Zero Form
0,45,49,49,65,65,45,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,60,62,63,80,80,60,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,80,82,83,100,100,80,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
3,80,100,123,122,120,80,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
4,39,52,43,60,50,65,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [25]:
# Train SVC model
svc = SVC()
svc.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred_svc = svc.predict(X_test)
print("SVC Accuracy:", accuracy_score(y_test, y_pred_svc))
print("SVC Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))

SVC Accuracy: 0.8765957446808511
SVC Confusion Matrix:
 [[78  0  0  2  0]
 [ 0  8  1  0  0]
 [ 0  0  2 11  0]
 [10  0  0 36  0]
 [ 5  0  0  0 82]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90        80
           1       1.00      0.89      0.94         9
           2       0.67      0.15      0.25        13
           3       0.73      0.78      0.76        46
           4       1.00      0.94      0.97        87

    accuracy                           0.88       235
   macro avg       0.85      0.75      0.76       235
weighted avg       0.87      0.88      0.86       235



In [26]:
# Train Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred_dtc = dtc.predict(X_test)
print("DTC Accuracy:", accuracy_score(y_test, y_pred_dtc))
print("DTC Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dtc))
print(classification_report(y_test, y_pred_dtc))

DTC Accuracy: 0.7659574468085106
DTC Confusion Matrix:
 [[60  0  1 14  5]
 [ 0  5  3  1  0]
 [ 0  0  8  5  0]
 [13  0  6 26  1]
 [ 6  0  0  0 81]]
              precision    recall  f1-score   support

           0       0.76      0.75      0.75        80
           1       1.00      0.56      0.71         9
           2       0.44      0.62      0.52        13
           3       0.57      0.57      0.57        46
           4       0.93      0.93      0.93        87

    accuracy                           0.77       235
   macro avg       0.74      0.68      0.70       235
weighted avg       0.78      0.77      0.77       235



In [27]:
# Train Random Forest Classifier
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred_rfc = rfc.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rfc))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rfc))
print(classification_report(y_test, y_pred_rfc))

Random Forest Accuracy: 0.825531914893617
Random Forest Confusion Matrix:
 [[69  0  0  6  5]
 [ 0  5  4  0  0]
 [ 0  1  3  9  0]
 [13  0  0 33  0]
 [ 3  0  0  0 84]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.84        80
           1       0.83      0.56      0.67         9
           2       0.43      0.23      0.30        13
           3       0.69      0.72      0.70        46
           4       0.94      0.97      0.95        87

    accuracy                           0.83       235
   macro avg       0.74      0.67      0.69       235
weighted avg       0.82      0.83      0.82       235



In [28]:
# Evaluate models using accuracy and F1-score
train_accuracy_svc = svc.score(X_train, y_train)
test_accuracy_svc = svc.score(X_test, y_test)
train_accuracy_dtc = dtc.score(X_train, y_train)
test_accuracy_dtc = dtc.score(X_test, y_test)
train_accuracy_rfc = rfc.score(X_train, y_train)
test_accuracy_rfc = rfc.score(X_test, y_test)

print(f"SVC Training Accuracy: {train_accuracy_svc:.4f}")
print(f"SVC Test Accuracy: {test_accuracy_svc:.4f}")
print(f"DTC Training Accuracy: {train_accuracy_dtc:.4f}")
print(f"DTC Test Accuracy: {test_accuracy_dtc:.4f}")
print(f"Random Forest Training Accuracy: {train_accuracy_rfc:.4f}")
print(f"Random Forest Test Accuracy: {test_accuracy_rfc:.4f}")

SVC Training Accuracy: 0.9125
SVC Test Accuracy: 0.8766
DTC Training Accuracy: 1.0000
DTC Test Accuracy: 0.7660
Random Forest Training Accuracy: 1.0000
Random Forest Test Accuracy: 0.8255


In [29]:
# Calculate weighted F1-scores
f1_scores = {
    "SVC": f1_score(y_test, y_pred_svc, average='weighted'),
    "Decision Tree": f1_score(y_test, y_pred_dtc, average='weighted'),
    "Random Forest": f1_score(y_test, y_pred_rfc, average='weighted')
}

# Print F1-scores for all models
print("\nWeighted F1-scores for models:")
for model_name, score in f1_scores.items():
    print(f"{model_name}: {score:.4f}")

# Identify the best model based on F1-score
best_model_name = max(f1_scores, key=f1_scores.get)
print(f"\nBest model based on weighted F1-score: {best_model_name}")


Weighted F1-scores for models:
SVC: 0.8645
Decision Tree: 0.7682
Random Forest: 0.8177

Best model based on weighted F1-score: SVC


In [30]:
# Map model name to actual object
model_map = {
    "SVC": svc,
    "Decision Tree": dtc,
    "Random Forest": rfc
}
best_model = model_map[best_model_name]
# Encode the 'Combat_Class' column


print(df[['Name', 'Total', 'Combat_Class']])

              Name  Total  Combat_Class
0        Bulbasaur    318             4
1          Ivysaur    405             0
2         Venusaur    525             3
3         Venusaur    625             2
4       Charmander    309             4
...            ...    ...           ...
1168        Chi-Yu    570             3
1169  Roaring Moon    590             3
1170  Iron Valiant    590             3
1171      Koraidon    670             2
1172      Miraidon    670             2

[1172 rows x 3 columns]


In [31]:
# Save all models
with open('svc_model.pkl', 'wb') as f:
    pickle.dump(svc, f)

with open('dtc_model.pkl', 'wb') as f:
    pickle.dump(dtc, f)

with open('rfc_model.pkl', 'wb') as f:
    pickle.dump(rfc, f)

# Save the best model
with open("best_pokemon_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("\nAll models and the best model have been saved successfully.")


All models and the best model have been saved successfully.
