# Preprocessing Function

In [24]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def preprocess_data(df, target):
    """
    Clean and preprocess the dataset without modifying the caller's frame.

    Steps, in order:
    - Fill missing numeric values with the column median
    - Fill missing non-numeric values with the placeholder 'Unknown'
    - Split off the target column
    - One-hot encode the remaining non-numeric features (first level dropped)
    - Standardize the numeric feature columns with StandardScaler

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data. It is copied first, so the object passed in is left
        exactly as it was.
    target : str
        Label of the column to predict.

    Returns
    -------
    X : pandas.DataFrame
        Encoded and scaled feature matrix.
    y : pandas.Series
        The target column (after missing-value filling).

    Raises
    ------
    KeyError
        If `target` is not a column of `df`.
    """
    # Work on a private copy: the fills below assign back into the frame and
    # would otherwise overwrite the caller's raw data.
    data = df.copy()

    # Handle missing values
    numeric_cols = data.select_dtypes(include=['number']).columns
    categorical_cols = data.select_dtypes(exclude=['number']).columns

    # NOTE: filling happens before the target is split off, so missing target
    # values are imputed as well (kept for backward compatibility).
    if len(numeric_cols) > 0:
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())
    if len(categorical_cols) > 0:
        data[categorical_cols] = data[categorical_cols].fillna('Unknown')  # Fill non-numeric with a placeholder

    # Separate target variable
    y = data[target]
    X = data.drop(target, axis=1)

    # Encode categorical columns
    X = pd.get_dummies(X, drop_first=True)

    # Scale numerical features. The scaler is fitted on every row passed in;
    # skip the step when no numeric feature exists, because fitting a scaler
    # on a zero-column selection raises.
    numerical_cols = X.select_dtypes(include='number').columns
    if len(numerical_cols) > 0:
        scaler = StandardScaler()
        X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    return X, y


# Model Training—Turning Data into Predictions

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

def train_model(X, y, test_size=0.2, random_state=42):
    """
    Train a Random Forest classifier on a train/test split of the data.

    NOTE: a later cell in this notebook defines another function that is also
    named `train_model` (a regression variant). Whichever cell ran most
    recently is the one that calls to `train_model` will use.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Class labels aligned with the rows of `X`.
    test_size : float, default 0.2
        Share of the rows held out for testing.
    random_state : int, default 42
        Seed used for both the split and the forest, so repeated runs give
        the same model.

    Returns
    -------
    model : RandomForestClassifier
        The fitted classifier.
    X_test, y_test
        The held-out features and labels.
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train a Random Forest Classifier. The forest is seeded as well:
    # bootstrap sampling and feature selection are random, so seeding only
    # the split does not make the fitted model reproducible.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    return model, X_test, y_test


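# Model Evaluation—Grading the Model’s Performance

The target used below, `Growth Rate`, is a continuous value, so `train_model` is first redefined here with a Random Forest *regressor* in place of the classifier above.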

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

def train_model(X, y):
    """
    Fit a Random Forest regressor on a seeded 80/20 split of the data.

    Returns the fitted model together with the held-out features and
    targets, in that order.
    """
    # Hold out 20% of the rows; the fixed seed keeps the split repeatable.
    partitions = train_test_split(X, y, test_size=0.2, random_state=42)
    features_fit, features_holdout, target_fit, target_holdout = partitions

    # Seeded forest, trained on the 80% portion only.
    regressor = RandomForestRegressor(random_state=42)
    regressor.fit(features_fit, target_fit)

    return regressor, features_holdout, target_holdout


In [31]:
# End-to-end run: load -> preprocess -> train -> evaluate.

# Load dataset
# NOTE(review): absolute path — the file must exist at exactly this location
# for the cell to run.
df = pd.read_csv('/content/population.csv')

# Preprocess data
# 'Growth Rate' is the column to predict; every other column becomes a feature.
X, y = preprocess_data(df, target='Growth Rate')

# Train model
# `train_model` is defined in more than one cell of this notebook; the call
# resolves to whichever definition was executed most recently.
model, X_test, y_test = train_model(X, y)

# Evaluate model
# NOTE(review): `evaluate_model` is not defined in the cells visible here —
# confirm the cell that defines it has been run before this one.
evaluate_model(model, X_test, y_test)


Mean Squared Error: 4.91478258385093e-05
R-squared: 0.6895863170915072
