In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Build the input data.
# NOTE: this is a synthetic placeholder for the NYC temperature dataset;
# replace it with the real data-loading step.
#
# A seeded generator is used so the sampled data, and any metric computed
# from it, is identical from run to run.
rng = np.random.default_rng(42)
data = {
    'temperature': rng.uniform(low=30, high=90, size=1000),
    # Sampled independently of 'temperature': there is no real relationship
    # for a regression to learn, so its error will sit near the spread of
    # the target itself (std of a uniform on [30, 90) is about 17.3).
    'target': rng.uniform(low=30, high=90, size=1000),
}
df = pd.DataFrame(data)

# Single-column feature frame (kept 2-D via the list selector) and the
# 1-D target series.
X = df[['temperature']]
y = df['target']

# Train/test partitions to compare: label -> fraction of rows held out for
# testing (passed to train_test_split as test_size).
splits = {'50:50': 0.5, '80:20': 0.2, '90:10': 0.1, '75:25': 0.25}

# RMSE measured on the held-out rows, keyed by split label.
rmse_values = {}

for ratio, test_size in splits.items():
    # The fixed random_state keeps each partition the same across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=42,
    )

    # fit() returns the fitted estimator, so build and train in one step.
    model = LinearRegression().fit(X_train, y_train)

    # Score on the held-out portion; RMSE is the square root of the MSE.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    rmse_values[ratio] = rmse

# Report one line per split, in the order the splits were defined.
for ratio, rmse in rmse_values.items():
    print(f'RMSE for split {ratio}: {rmse:.4f}')



RMSE for split 50:50: 17.1529
RMSE for split 80:20: 17.1703
RMSE for split 90:10: 17.4394
RMSE for split 75:25: 16.9691
