## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression

## Function and Class

In [2]:
def train_test_split(X, y, test_size=0.25, random_state=None):
    """
    Split data into random train and test subsets.
    """

    # Using seed() to get the same outcome every time
    if random_state is not None:
        np.random.seed(random_state)
    
    # Get the indices for all data in set
    indices = np.arange(len(X))

    # Shuffle the data indices
    np.random.shuffle(indices)
    
    # Determine the split index
    split_index = int(len(X) * (1 - test_size))
    
    # Split into training and testing indices
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]
    
    # Assign the training and testing sets
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    
    return X_train, X_test, y_train, y_test

In [3]:
class LogisticRegression:
    # Class constructor
    def __init__(self, alpha=1, iteration=1500):
        self.w = None
        # Define the learning rate
        self.alpha = alpha
        # Define the iteration number
        self.iteration = iteration

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    # Class function to fit the data (find the appropiate value of w)
    def fit(self, X, y):
        # Number of samples & number of features (dimensions)
        n, d = X.shape

        # Initialize w (d + 1 to absorb bias into w)
        self.w = np.zeros(d + 1)
        # Apply transformation to X
        X = np.append(X, np.ones((n, 1)), axis=1)

        # Gradient Descent
        for _ in range(self.iteration):
            linear_model = np.dot(X, self.w)
            y_predicted = self.sigmoid(linear_model)
            
            # Compute gradients
            dw = (1 / n) * np.dot(X.T, (y_predicted - y))
            
            # Update weights
            self.w -= self.alpha * dw

    def predict(self, X):
        n = X.shape[0]
        
        # Add bias term by appending a column of ones to the feature matrix
        X = np.append(X, np.ones((n, 1)), axis=1)
        
        linear_model = np.dot(X, self.w)
        y_predicted = self.sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        accuracy = np.mean(y_pred == y)
        return accuracy


## Data Loading and Pre-processing

### Import dataset

In [4]:
data = 'Pokemon.csv'

df = pd.read_csv('Pokemon.csv')

## Exploratory data analysis

In [5]:
print(f"Found {df.shape[0]} samples with {df.shape[1]} features")

Found 800 samples with 13 features


#### Let's preview the dataset

In [6]:
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


#### Now let's view the columns

In [7]:
col_names = df.columns

col_names

Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')

#### Check distribution of target_class column

In [8]:
df['Legendary'].value_counts()

Legendary
False    735
True      65
Name: count, dtype: int64

#### View summary of the dataset

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   Total       800 non-null    int64 
 5   HP          800 non-null    int64 
 6   Attack      800 non-null    int64 
 7   Defense     800 non-null    int64 
 8   Sp. Atk     800 non-null    int64 
 9   Sp. Def     800 non-null    int64 
 10  Speed       800 non-null    int64 
 11  Generation  800 non-null    int64 
 12  Legendary   800 non-null    bool  
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


#### Check of missing value

In [10]:
df.isnull().sum()

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

Type 2 missing is normal, not all pokemon has secondary type

### Remove unused features

In [11]:
# df.drop(columns=['#', 'Name', 'Type 1', 'Type 2'],inplace=True)
df.drop(columns=['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation'],inplace=True)

df

Unnamed: 0,Total,Legendary
0,318,False
1,405,False
2,525,False
3,625,False
4,309,False
...,...,...
795,600,True
796,700,True
797,600,True
798,680,True


### Declare feature vector and target variable

In [12]:
X = np.array(df.drop(['Legendary'], axis=1))
y = np.array(df['Legendary'])

### Split data into separate training and test set

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Training

### Prediction

In [14]:
h = LogisticRegression(iteration=1500)
h.fit(X_train,y_train)
y_pred = h.predict(X_test)

  return 1 / (1 + np.exp(-z))


### Evaluation

In [15]:
score = round(h.score(X_test, y_test)*100, 2)
print("The accuracy score we have achieved using Logistic Regression is: "+str(score)+" %")

The accuracy score we have achieved using Logistic Regression is: 91.25 %


  return 1 / (1 + np.exp(-z))
