In [1]:
import numpy as np
import pandas as pd

In [2]:
#Read in the csv into a dataframe and then yoink the columns

df = pd.read_csv('ETData.csv')
years = df['Years of Experience']
salary = df['Salary']

In [3]:
def clean_data(df, test_size=0.25):
    # Drop rows with NaN values
    df_cleaned = df.dropna()

    # Shuffle the DataFrame
    df_shuffled = df_cleaned.sample(frac=1).reset_index(drop=True)

    # Split the data into training and testing sets
    split_index = int(len(df_shuffled)*(1 - test_size))
    train_df = df_shuffled[:split_index]
    test_df = df_shuffled[split_index:]

    years_train = train_df['Years of Experience']
    salary_train = train_df['Salary']
    years_test = test_df['Years of Experience']
    salary_test = test_df['Salary']

    return years_train, years_test, salary_train, salary_test

In [4]:
def calculate_initial_slope(years, salary):
    print("Starting Training")

    # Get the maximum and minimum values and their indices
    yearsMax = years.max()
    yearsMin = years.min()
    idxMax = years.idxmax()
    idxMin = years.idxmin()

    # Calculate the initial slope
    initialSlope = (salary[idxMax] - salary[idxMin]) / (yearsMax - yearsMin)

    return initialSlope

In [5]:
def train(years, salary, iterations, learning_rate):
    # Handling missing values
    df = pd.DataFrame({'Years': years, 'Salary': salary})
    df.dropna(inplace=True)
    years = df['Years']
    salary = df['Salary']

    # Using calculated initial slope
    slope = calculate_initial_slope(years, salary)
    intercept = 0

    for iteration in range(iterations):
        slope_grad = 0
        intercept_grad = 0
        N = len(years)

        for x, y in zip(years, salary):
            if x == 0:  # Skip this iteration if years of experience is 0
                continue

            y_pred = slope * x + intercept
            error = y - y_pred

            slope_grad += -2/N * x * error
            intercept_grad += -2/N * error

        slope -= learning_rate * slope_grad
        intercept -= learning_rate * intercept_grad

    return slope, intercept
print(train(years, salary, 10, 0.00001))

Starting Training
(8613.480341247212, 2.7963355116748025)


In [27]:
def test_model(years_train, salary_train, years_test, salary_test, iterations, learning_rate):
    slope, intercept = train(years_train, salary_train, iterations, learning_rate)

    if pd.isna(slope) or pd.isna(intercept):
        print("Training resulted in NaN values for slope or intercept.")
        return None

    # Predict the salaries for the testing set
    salary_pred = slope * years_test + intercept

    # Calculate the Mean Squared Error
    mse = np.mean((salary_test - salary_pred) ** 2)
    rmse = np.sqrt(mse)

    # Calculate Mean Absolute Percentage Error (MAPE)
    mape = np.mean(np.abs((salary_test - salary_pred) / salary_test)) * 100

    return {
        "slope": slope,
        "intercept": intercept,
        "mse": mse,
        "rmse": rmse,
        "mape": mape
    }

years_train, years_test, salary_train, salary_test = clean_data(df)
results = test_model(years_train, salary_train, years_test, salary_test, 10000, 0.0011)

if results:
    print("Slope:", results["slope"])
    print("Intercept:", results["intercept"])
    print("Mean Squared Error:", results["mse"])
    print("Root Mean Squared Error:", results["rmse"])
    print("Mean Absolute Percentage Error:", results["mape"], "%")
else:
    print("Model training was unsuccessful.")


Starting Training
Slope: 6807.146764459766
Intercept: 31692.440353390168
Mean Squared Error: 325638565.2083961
Root Mean Squared Error: 18045.458298652215
Mean Absolute Percentage Error: 12.35310407577707 %


In [7]:
def find_best_learning_rate(years_train, salary_train, years_test, salary_test, iterations, min_lr, max_lr, lr_step):
    best_mape = float('inf')
    best_lr = None

    current_lr = min_lr
    while current_lr <= max_lr:
        # Train the model with the current learning rate
        slope, intercept = train(years_train, salary_train, iterations, current_lr)

        if not pd.isna(slope) and not pd.isna(intercept):
            # Predict the salaries for the testing set
            salary_pred = slope * years_test + intercept

            # Calculate Mean Absolute Percentage Error (MAPE)
            mape = np.mean(np.abs((salary_test - salary_pred) / salary_test)) * 100

            if mape < best_mape:
                best_mape = mape
                best_lr = current_lr

        # Increment the learning rate
        current_lr += lr_step

    return best_lr, best_mape

# Example usage
years_train, years_test, salary_train, salary_test = clean_data(df)
best_lr, best_mape = find_best_learning_rate(years_train, salary_train, years_test, salary_test, 10000, 0.0001, 0.009, 0.001)

print("Best Learning Rate:", best_lr)
print("Best MAPE:", best_mape, "%")


Starting Training
Starting Training
Starting Training
Starting Training
Starting Training
Starting Training
Starting Training
Starting Training
Starting Training


  slope_grad += -2/N * x * error
  slope -= learning_rate * slope_grad


Best Learning Rate: 0.0011
Best MAPE: 11.67473201097884 %
