<a href="https://www.kaggle.com/code/manpreetaiml/supervisedpredictionmodel?scriptVersionId=249840965" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Loading Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.utils import resample

import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

# Loading data

In [None]:
data = pd.read_csv('/kaggle/input/jojo-stand-processed-dataset/labeled-encoded-jojo-stands.csv', encoding='latin1')

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
# Preview up to three rows holding the largest and the smallest 'Rank' value
print('Max: ', data.loc[data['Rank'] == data['Rank'].max()].head(3))
print('Min: ', data.loc[data['Rank'] == data['Rank'].min()].head(3))

# Data Balancing

In [None]:
# imbalanced
data['Rank'].value_counts().reset_index()

In [None]:
# One sub-frame per distinct 'Rank' value, in ascending rank order
dfs = [data.loc[data['Rank'] == rank] for rank in sorted(data['Rank'].unique())]

# Row count of the largest class; every class is grown to this size
max_count = max(map(len, dfs))

# Sample each class with replacement until it has max_count rows
upsampled = [
    resample(class_df, replace=True, n_samples=max_count, random_state=42)
    for class_df in dfs
]

# Stack the resampled classes into a single balanced frame
balanced_data = pd.concat(upsampled)

# Exploratory Data Analysis

In [None]:
# Defining plots

labels = ['Weak','Average','Strong','God']

def classDist(data):
  """Show the distribution of ``data['Rank']`` as a bar chart and a pie chart.

  The counts are sorted by rank value before plotting so that the i-th count
  lines up with ``labels[i]``. ``value_counts()`` on its own orders by
  frequency, which attaches the wrong names to the pie slices whenever the
  classes are imbalanced.

  Assumes the distinct 'Rank' values, in ascending order, correspond one to
  one with the module-level ``labels`` list.
  """
  # Use the Series index/values directly instead of reset_index(): the name of
  # the count column produced by reset_index() differs between pandas versions.
  counts = data['Rank'].value_counts().sort_index()

  plt.figure(figsize=(6,3))
  plt.bar(x=counts.index, height=counts.values, color='Green')
  plt.xticks(ticks=range(len(labels)), labels=labels)
  plt.title('Class Distribution')
  plt.show()

  plt.figure(figsize=(6,5))
  plt.pie(x=counts.values, labels=labels, autopct='%1.1f%%')
  plt.title('Class distribution')
  plt.show()

def statDist(data):
  """Draw one 15-bin histogram per stat column of ``data`` in a single figure."""
  stat_columns = ['PWR', 'SPD', 'RNG', 'PER', 'PRC', 'DEV']
  stats = data[stat_columns]
  stats.hist(bins=15, figsize=(10,5))
  plt.suptitle("Distribution of Stats")
  plt.tight_layout()
  plt.show()

def boxPlot(data):
  """Draw side-by-side box plots of the six stat columns of ``data``."""
  stat_columns = ['PWR', 'SPD', 'RNG', 'PER', 'PRC', 'DEV']
  stats = data[stat_columns]
  stats.plot.box(figsize=(10, 6))
  plt.title("Boxplot of All Stats")
  plt.show()

## Before Balancing

In [None]:
# Announce and render each diagnostic plot for the current `data` frame:
# class distribution, per-stat histograms, then box plots for outliers
for _heading, _render in (
    ('Class Distribution: ', classDist),
    ('Stat Distribution: ', statDist),
    ('Boxplot: ', boxPlot),
):
  print(_heading)
  _render(data)

## After Balancing

In [None]:
# Balance Data
data = balanced_data

In [None]:
# Announce and render each diagnostic plot for the current `data` frame:
# class distribution, per-stat histograms, then box plots for outliers
for _heading, _render in (
    ('Class Distribution: ', classDist),
    ('Stat Distribution: ', statDist),
    ('Boxplot: ', boxPlot),
):
  print(_heading)
  _render(data)

In [None]:
# Pairwise correlations of every column except 'Stand', drawn as an annotated heatmap
corr_matrix = data.drop(columns='Stand').corr()
sns.heatmap(corr_matrix, annot=True, cmap='Blues')
plt.title('Correlation of features and label')
plt.show()

# Model creation

In [None]:
x = data.drop(['Stand','Rank'], axis=1)
y = data['Rank']

In [None]:
# Hold out 20% of the rows for evaluation. stratify=y keeps the per-class
# proportions identical in the train and test partitions.
# NOTE(review): `data` was upsampled with replacement before this split, so
# copies of the same original row can land in both partitions and inflate the
# test scores. Consider splitting first and upsampling only the training part.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

## Logistic Regression

In [None]:
lr_model = LogisticRegression(max_iter=500)

In [None]:
lr_model.fit(x_train,y_train)

## Random Forest

In [None]:
rf_model = RandomForestClassifier(n_estimators=110,random_state=42)

In [None]:
rf_model.fit(x_train,y_train)

## SVC

In [None]:
# Linear-kernel SVM. `gamma` is only a coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels, so it has no effect here and is not passed.
svc_model = SVC(kernel='linear', C=1.0)

In [None]:
svc_model.fit(x_train,y_train)

## KNN

In [None]:
knn_model = KNeighborsClassifier(weights='distance', n_neighbors=2)

In [None]:
knn_model.fit(x_train,y_train)

## GBC

In [None]:
gbc_model = GradientBoostingClassifier(n_estimators=100,random_state=42, learning_rate=0.01, max_depth=10)

In [None]:
gbc_model.fit(x_train,y_train)

# Performance evaluation

In [None]:
# Per-model summary metrics, keyed by model title; filled by performance_evaluation()
results = {}

def performance_evaluation(title,model):
  """Evaluate ``model`` on the module-level ``x_test`` / ``y_test`` split.

  Prints the model title, the test accuracy and the classification report,
  shows the confusion matrix as a heatmap, and stores the accuracy together
  with the macro-averaged precision, recall and F1 score in ``results[title]``.

  Args:
    title: Display name of the model; also the key written into ``results``.
    model: A fitted estimator exposing ``predict``.
  """
  print(f'Model Name: {title}')
  y_pred = model.predict(x_test)

  # test Accuracy Score
  test_accuracy = accuracy_score(y_test,y_pred)
  print(f'Model Accuracy Score: {test_accuracy}')

  # Confusion Matrix (rows are the true classes, columns the predicted ones)
  cm = confusion_matrix(y_test,y_pred)
  # fmt='d' prints the cell counts as plain integers; seaborn's default
  # '.2g' format switches to scientific notation for counts of 100 or more.
  sns.heatmap(cm,cmap='Blues',annot=True,fmt='d')
  plt.title('Confusion Matrix: ')
  plt.xlabel('Predicted')
  plt.ylabel('Actual')
  plt.show()

  # Classification report: dict form for the summary, text form for display
  class_report = classification_report(y_test,y_pred, output_dict=True)
  print(classification_report(y_test,y_pred))

  macro_avg = class_report['macro avg']
  results[title] = {
      'accuracy': test_accuracy,
      'precision': macro_avg['precision'],
      'recall': macro_avg['recall'],
      'f1-score': macro_avg['f1-score']
  }

## Logistic Regression

In [None]:
performance_evaluation('Logistic Regression Model', lr_model)

## Random Forest

In [None]:
performance_evaluation('Random Forest Classifier', rf_model)

## SVC

In [None]:
performance_evaluation('Support Vector Classifier', svc_model)

## KNN

In [None]:
performance_evaluation('K Nearest Neighbours', knn_model)

## GBC

In [None]:
performance_evaluation('Gradient Boosting Classifier', gbc_model)

## All Model Comparison

In [None]:
# Convert result dict to pandas dataframe and transpose so models are rows
df_results = pd.DataFrame(results).T

# Plot bar graph to compare models based on each metric.
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(figsize=...) would only leave an empty extra figure.
df_results.plot(kind='bar', figsize=(10, 6))
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Unlabeled data prediction

In [None]:
# Letter grade -> numeric value used for the model features
encoded_map = {'F': 0, 'E': 2, 'D': 4, 'C': 6, 'B': 8, 'A': 10, 'Infi': 20}
# Predicted class code -> human-readable rank name
encoded_class = {0: 'Weak', 1: 'Average', 2: 'Strong', 3: 'God'}

def detect_and_display(pwr,spd,rng,per,prc,dev, model):
  """Encode six letter grades, predict their rank with ``model`` and print it.

  Each grade must be a key of ``encoded_map``; an unknown grade raises
  ``KeyError``. The encoded values are passed to ``model.predict`` as a
  single-row 2-D array in the order PWR, SPD, RNG, PER, PRC, DEV.
  Prints the raw prediction and its name from ``encoded_class``; returns
  nothing.
  """
  grades = (pwr, spd, rng, per, prc, dev)
  stats = np.array([[encoded_map[grade] for grade in grades]])

  prediction = model.predict(stats)
  rank_name = encoded_class[prediction[0]]

  print(f'Prediction: {prediction}, Rank: {rank_name}')


In [None]:
# Read one letter grade per stat. Surrounding whitespace is stripped before
# capitalising so that inputs such as ' a' or 'infi ' still normalise to the
# 'A' / 'Infi' spellings used as lookup keys when encoding the grades.
pwr = input('PWR: ').strip().capitalize()
spd = input('SPD: ').strip().capitalize()
rng = input('RNG: ').strip().capitalize()
per = input('PER: ').strip().capitalize()
prc = input('PRC: ').strip().capitalize()
dev = input('DEV: ').strip().capitalize()

## LR Model

In [None]:
detect_and_display(pwr,spd,rng,per,prc,dev, lr_model)

## Random Forest Classifier

In [None]:
detect_and_display(pwr,spd,rng,per,prc,dev, rf_model)

## SVC Model

In [None]:
detect_and_display(pwr,spd,rng,per,prc,dev, svc_model)

## KNN Model

In [None]:
detect_and_display(pwr,spd,rng,per,prc,dev, knn_model)