<a href="https://colab.research.google.com/github/DeepaliSaini4/Machine-Learning/blob/main/TrainingAndTestingSplit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# Import libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the Iris bunch and wrap it in pandas objects:
# X holds the feature matrix (columns labelled with the bunch's feature names),
# y holds the integer class targets as a Series named 'species'.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(data=iris.target, name='species')

# Summarise the full dataset before splitting: row count and per-class counts.
print(
    "Original Dataset:",
    f"Total samples: {len(X)}",
    f"Class distribution:\n{y.value_counts()}",
    sep="\n",
)

# Split data (80% train, 20% test).
# random_state pins the shuffle so the same split is produced on every run;
# stratify=y asks for the class proportions of y to be kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Preserve class ratios
)

# Sanity check: the two partitions together account for every row, and
# features and labels stay aligned in size.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# Verify split
print("\nAfter Train-Test Split:")
print(f"Training samples: {X_train.shape[0]} ({len(y_train)/len(y):.0%})")
print(f"Testing samples: {X_test.shape[0]} ({len(y_test)/len(y):.0%})")
# Each message is a single f-string: passing the Series as a second print()
# argument makes print() insert its separator space right after the "\n",
# which shows up as a stray leading space on the first line of the table.
# sort_index() lists the classes in label order; value_counts() alone orders
# equal counts arbitrarily.
print(f"\nTraining class distribution:\n{y_train.value_counts().sort_index()}")
print(f"Testing class distribution:\n{y_test.value_counts().sort_index()}")

# Fit a decision tree on the training split (fit() returns the estimator,
# so construction and training are chained).
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on both splits, then score each set of predictions against its labels.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

# Report both accuracies as percentages with two decimals.
print(
    f"\nModel Training Accuracy: {train_acc:.2%}",
    f"Model Testing Accuracy: {test_acc:.2%}",
    sep="\n",
)



Original Dataset:
Total samples: 150
Class distribution:
species
0    50
1    50
2    50
Name: count, dtype: int64

After Train-Test Split:
Training samples: 120 (80%)
Testing samples: 30 (20%)

Training class distribution:
 species
0    40
2    40
1    40
Name: count, dtype: int64
Testing class distribution:
 species
0    10
2    10
1    10
Name: count, dtype: int64

Model Training Accuracy: 100.00%
Model Testing Accuracy: 93.33%
