In [27]:
import numpy as np
import networkx as nx
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

class alien:
    def __init__(self):
        pass
    def _read_data(self):
        df = pd.read_csv("Alien.csv") 
        df = df[df["Diameter"]<100]
        df = df[df["Sex"]!="I"] 	
        return  df
        
    
    def train(self,clf=SVC(kernel='linear'),ratio=0.1):
        df=self._read_data()
        X=df.drop(columns=['Sex'],axis=1)
        y = df['Sex']

        num_features = X.select_dtypes(exclude="object").columns
        cat_features = X.select_dtypes(include="object").columns

        num_pipeline= Pipeline(
            steps=[
            ("imputer",SimpleImputer(strategy="median")),
            ("scaler",StandardScaler())
            ]
        )
        cat_pipeline=Pipeline(
            steps=[
            ("imputer",SimpleImputer(strategy="most_frequent")),
            ("one_hot_encoder",OneHotEncoder()),
            ("scaler",StandardScaler(with_mean=False))
            ]
        )
        preprocessor=ColumnTransformer(
            [
            ("num_pipeline",num_pipeline,num_features),
            ("cat_pipelines",cat_pipeline,cat_features)
            ]
        )

        X = preprocessor.fit_transform(X)
        y = y.map({"M": 0, "F": 1, "I": 2 })

        X_sample, _, y_sample, _ = train_test_split(X, y, test_size=ratio, random_state=32)
        X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=10) 
        
        clf.fit(X_train, y_train)
        return clf, X_test, y_test,X_train, y_train

    def evaluate(self,clf=SVC(kernel='linear')):
        size_range=np.linspace(500, 3800, 5).astype(int)
        ratio_range=1-size_range/3900

        accuracy_test=[]
        accuracy_train=[]
        for ratio in ratio_range:
            clf, X_test, y_test, X_train,y_train = self.train(clf, ratio)
            y_pred = clf.predict(X_test)
            accuracy_test.append(accuracy_score(y_test, y_pred))
            y_train_pred = clf.predict(X_train)
            accuracy_train.append(accuracy_score(y_train, y_train_pred))

        df = {
        'data_size': size_range,
        'Training Accuracy': accuracy_train,
        'Testing Accuracy': accuracy_test
        }

        return pd.DataFrame(df)


obj=alien()
obj.evaluate(SVC(kernel='linear'))

obj.evaluate(KNeighborsClassifier(n_neighbors=3))
# obj.evaluate(MLPClassifier(hidden_layer_sizes=(20,20,10), activation='relu', solver='adam', max_iter=1000, random_state=19))


Unnamed: 0,data_size,Training Accuracy,Testing Accuracy
0,500,0.727941,0.485294
1,1325,0.743767,0.519337
2,2150,0.729522,0.525597
3,2975,0.762022,0.509852
4,3800,0.761583,0.519305
