In [3]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split

# Linear regression functions
def linear_regression(x, y):
    """Fit a line ``y = m * x + b`` to paired samples by ordinary least squares.

    Uses the closed-form sums-based solution for simple linear regression.

    Args:
        x: Sized, re-iterable collection of numeric inputs.
        y: Sized, re-iterable collection of numeric targets, one per element
            of ``x``.

    Returns:
        Tuple ``(m, b)`` holding the fitted slope and intercept.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length, if they are empty, or
            if the x values have no spread (the least-squares denominator is
            zero), in which case the slope is undefined.
    """
    n = len(x)
    # zip() would silently truncate to the shorter input while the plain sums
    # still used every element, producing a wrong fit instead of an error.
    if n != len(y):
        raise ValueError(
            f"x and y must have the same length (got {n} and {len(y)})"
        )
    if n == 0:
        raise ValueError("cannot fit a regression line to empty data")

    sum_x = sum(x)
    sum_y = sum(y)
    sum_xy = sum(x_i * y_i for x_i, y_i in zip(x, y))
    sum_x2 = sum(x_i**2 for x_i in x)

    # n * sum(x^2) - (sum x)^2 is n^2 times the variance of x; zero means every
    # x is identical and no unique slope exists.
    denominator = n * sum_x2 - sum_x**2
    if denominator == 0:
        raise ValueError("x values have zero variance; slope is undefined")

    # Slope (m) and Intercept (b)
    m = (n * sum_xy - sum_x * sum_y) / denominator
    b = (sum_y - m * sum_x) / n
    return m, b

def predict(x, m, b):
    """Evaluate the line ``m * x + b`` at each input value.

    Args:
        x: Iterable of numeric inputs.
        m: Slope of the line.
        b: Intercept of the line.

    Returns:
        A new list with one predicted value per element of ``x``, in order.
    """
    outputs = []
    for value in x:
        outputs.append(m * value + b)
    return outputs

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions.

    Args:
        y_true: Sized collection of observed values.
        y_pred: Sized collection of predicted values, one per element of
            ``y_true``.

    Returns:
        The average of ``(y_true[i] - y_pred[i]) ** 2`` over all pairs.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    n = len(y_true)
    # zip() would drop the unmatched tail while the divisor stayed len(y_true),
    # which yields a silently wrong score rather than an error.
    if n != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length (got {n} and {len(y_pred)})"
        )
    if n == 0:
        raise ValueError("cannot compute the mean squared error of empty data")
    return sum((y_t - y_p)**2 for y_t, y_p in zip(y_true, y_pred)) / n

# Load real dataset from CSV
def load_dataset(filepath):
    """Read the first two columns of a CSV file as two lists of floats.

    The first row is always discarded as a header, whether or not it actually
    is one. Rows with no fields (blank lines) are skipped.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        Tuple ``(x, y)`` where ``x`` holds column 0 and ``y`` holds column 1 of
        every remaining row, converted with ``float``. Both lists are empty if
        the file has no data rows.

    Raises:
        ValueError: If a value in the first two columns is not a valid float.
        IndexError: If a non-blank row has fewer than two columns.
    """
    x, y = [], []
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(filepath, 'r', newline='') as file:
        reader = csv.reader(file)
        # Discard the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; nothing to parse.
                continue
            x.append(float(row[0]))
            y.append(float(row[1]))
    return x, y

# Split the data into train and test sets
def split_data(x, y, test_size=0.2, random_state=42):
    """Split paired samples into train and test subsets.

    Thin wrapper around scikit-learn's ``train_test_split``.

    Args:
        x: Input samples.
        y: Targets, aligned with ``x``.
        test_size: Forwarded to ``train_test_split``; a float is the fraction
            of samples placed in the test subset.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 42,
            the value this function previously hard-coded, so existing calls
            produce the same split.

    Returns:
        The result of ``train_test_split``, which unpacks as
        ``x_train, x_test, y_train, y_test``.
    """
    return train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

# Main execution
dataset_path = "homeprices.csv"
x, y = load_dataset(dataset_path)

# Hold out a test_size=0.4 share of the samples for evaluation
x_train, x_test, y_train, y_test = split_data(x, y, test_size=0.4)

# Fit the line on the training portion only and report its parameters
m, b = linear_regression(x_train, y_train)
print(f"Slope (m): {m}", f"Intercept (b): {b}", sep="\n")

# Apply the fitted line to the held-out inputs
predictions = predict(x_test, m, b)

# Line up each held-out input with its actual label and its prediction
df = pd.DataFrame(
    {
        "Test Set (x)": x_test,
        "Actual Labels (y)": y_test,
        "Predictions": predictions,
    }
)

print("\nTest Set, Predictions, and Actual Labels:", df, sep="\n")

# Score the predictions against the held-out labels
mse = mean_squared_error(y_test, predictions)
print(f"\nMean Squared Error on Test Set: {mse}")

Slope (m): 127.63157894736842
Intercept (b): 213421.05263157896

Test Set, Predictions, and Actual Labels:
   Test Set (x)  Actual Labels (y)    Predictions
0        3000.0           565000.0  596315.789474
1        4000.0           725000.0  723947.368421

Mean Squared Error on Test Set: 490893351.8005559


In [5]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split

# Linear regression functions
def linear_regression(x, y):
    """Fit a line ``y = m * x + b`` to paired samples by ordinary least squares.

    Uses the closed-form sums-based solution for simple linear regression.

    Args:
        x: Sized, re-iterable collection of numeric inputs.
        y: Sized, re-iterable collection of numeric targets, one per element
            of ``x``.

    Returns:
        Tuple ``(m, b)`` holding the fitted slope and intercept.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length, if they are empty, or
            if the x values have no spread (the least-squares denominator is
            zero), in which case the slope is undefined.
    """
    n = len(x)
    # zip() would silently truncate to the shorter input while the plain sums
    # still used every element, producing a wrong fit instead of an error.
    if n != len(y):
        raise ValueError(
            f"x and y must have the same length (got {n} and {len(y)})"
        )
    if n == 0:
        raise ValueError("cannot fit a regression line to empty data")

    sum_x = sum(x)
    sum_y = sum(y)
    sum_xy = sum(x_i * y_i for x_i, y_i in zip(x, y))
    sum_x2 = sum(x_i**2 for x_i in x)

    # n * sum(x^2) - (sum x)^2 is n^2 times the variance of x; zero means every
    # x is identical and no unique slope exists.
    denominator = n * sum_x2 - sum_x**2
    if denominator == 0:
        raise ValueError("x values have zero variance; slope is undefined")

    # Slope (m) and Intercept (b)
    m = (n * sum_xy - sum_x * sum_y) / denominator
    b = (sum_y - m * sum_x) / n
    return m, b

def predict(x, m, b):
    """Evaluate the line ``m * x + b`` at each input value.

    Args:
        x: Iterable of numeric inputs.
        m: Slope of the line.
        b: Intercept of the line.

    Returns:
        A new list with one predicted value per element of ``x``, in order.
    """
    outputs = []
    for value in x:
        outputs.append(m * value + b)
    return outputs

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions.

    Args:
        y_true: Sized collection of observed values.
        y_pred: Sized collection of predicted values, one per element of
            ``y_true``.

    Returns:
        The average of ``(y_true[i] - y_pred[i]) ** 2`` over all pairs.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    n = len(y_true)
    # zip() would drop the unmatched tail while the divisor stayed len(y_true),
    # which yields a silently wrong score rather than an error.
    if n != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length (got {n} and {len(y_pred)})"
        )
    if n == 0:
        raise ValueError("cannot compute the mean squared error of empty data")
    return sum((y_t - y_p)**2 for y_t, y_p in zip(y_true, y_pred)) / n

# Load real dataset from CSV
def load_dataset(filepath):
    """Read the first two columns of a CSV file as two lists of floats.

    The first row is always discarded as a header, whether or not it actually
    is one. Rows with no fields (blank lines) are skipped.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        Tuple ``(x, y)`` where ``x`` holds column 0 and ``y`` holds column 1 of
        every remaining row, converted with ``float``. Both lists are empty if
        the file has no data rows.

    Raises:
        ValueError: If a value in the first two columns is not a valid float.
        IndexError: If a non-blank row has fewer than two columns.
    """
    x, y = [], []
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(filepath, 'r', newline='') as file:
        reader = csv.reader(file)
        # Discard the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; nothing to parse.
                continue
            x.append(float(row[0]))
            y.append(float(row[1]))
    return x, y

# Split the data into train and test sets
def split_data(x, y, test_size=0.2, random_state=42):
    """Split paired samples into train and test subsets.

    Thin wrapper around scikit-learn's ``train_test_split``.

    Args:
        x: Input samples.
        y: Targets, aligned with ``x``.
        test_size: Forwarded to ``train_test_split``; a float is the fraction
            of samples placed in the test subset.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 42,
            the value this function previously hard-coded, so existing calls
            produce the same split.

    Returns:
        The result of ``train_test_split``, which unpacks as
        ``x_train, x_test, y_train, y_test``.
    """
    return train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

# Main execution
dataset_path = "test.csv"
x, y = load_dataset(dataset_path)

# Hold out a test_size=0.4 share of the samples for evaluation
x_train, x_test, y_train, y_test = split_data(x, y, test_size=0.4)

# Fit the line on the training portion only and report its parameters
m, b = linear_regression(x_train, y_train)
print(f"Slope (m): {m}", f"Intercept (b): {b}", sep="\n")

# Apply the fitted line to the held-out inputs
predictions = predict(x_test, m, b)

# Line up each held-out input with its actual label and its prediction
df = pd.DataFrame(
    {
        "Test Set (x)": x_test,
        "Actual Labels (y)": y_test,
        "Predictions": predictions,
    }
)

print("\nTest Set, Predictions, and Actual Labels:", df, sep="\n")

# Score the predictions against the held-out labels
mse = mean_squared_error(y_test, predictions)
print(f"\nMean Squared Error on Test Set: {mse}")

Slope (m): 0.9999872296376983
Intercept (b): 0.004833788440252344

Test Set, Predictions, and Actual Labels:
        Test Set (x)  Actual Labels (y)  Predictions
0               86.0          86.799992    86.003736
1               48.0          46.395593    48.004221
2               81.0          78.213789    81.003799
3               45.0          47.025158    45.004259
4               65.0          60.183043    65.004004
...              ...                ...          ...
399995          82.0          77.898839    82.003787
399996          22.0          18.834193    22.004553
399997          42.0          42.454765    42.004297
399998          10.0          18.820687    10.004706
399999          39.0          38.586835    39.004336

[400000 rows x 3 columns]

Mean Squared Error on Test Set: 8.989691603587925
