In [2]:
# import libraries and data

import pandas as pd
import numpy as np
import plotly.express as px
# import seaborn as sns
# import matplotlib.pyplot as plt

# Load the salary dataset from the repository's datasets folder and preview it.
csv_path = '../datasets/Salary_dataset.csv'
data = pd.read_csv(csv_path)
data.head(50)

Unnamed: 0.1,Unnamed: 0,YearsExperience,Salary
0,0,1.2,39344.0
1,1,1.4,46206.0
2,2,1.6,37732.0
3,3,2.1,43526.0
4,4,2.3,39892.0
5,5,3.0,56643.0
6,6,3.1,60151.0
7,7,3.3,54446.0
8,8,3.3,64446.0
9,9,3.8,57190.0


In [3]:
# Show the column labels of the loaded frame.
data.columns

Index(['Unnamed: 0', 'YearsExperience', 'Salary'], dtype='object')

In [4]:
# 'Unnamed: 0' appears to be a row counter saved alongside the data (its values
# mirror the index in the preview), so it is removed before modelling.
#
# The drop is done by reassignment with errors='ignore' instead of
# rename + inplace drop: the cell can then be re-run safely, because dropping
# a column that is already gone is a no-op rather than a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.head(10)

Unnamed: 0,YearsExperience,Salary
0,1.2,39344.0
1,1.4,46206.0
2,1.6,37732.0
3,2.1,43526.0
4,2.3,39892.0
5,3.0,56643.0
6,3.1,60151.0
7,3.3,54446.0
8,3.3,64446.0
9,3.8,57190.0


In [5]:
from sklearn.model_selection import train_test_split

# Target is the Salary column; every other column is used as a feature.
y = data['Salary']
X = data.drop(columns=['Salary'])

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [6]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and fitting can be chained; the bare `model` on the
# last line keeps the estimator shown as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [7]:
# Predict salaries for the held-out test features.
y_pred = model.predict(X_test)

In [8]:
# Put the true test salaries next to the model's predictions. y_test keeps its
# original row labels, and the predictions are aligned to it by position.
comparassion_table = dict(Actual=y_test, Predicted=y_pred)

df_compare = pd.DataFrame(comparassion_table)
df_compare.head(30)

Unnamed: 0,Actual,Predicted
20,91739.0,91101.582558
24,109432.0,109298.208882
7,54446.0,56623.764259
18,81364.0,82482.127983
2,37732.0,40342.572284
27,112636.0,117917.663457
26,116970.0,116959.946282
16,66030.0,74820.390583
25,105583.0,112171.360407


In [9]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Evaluate the predictions on the held-out test set.
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One print call, one line per figure (sep='\n').
print(
    f'INTERCEPT: {model.intercept_:.2f}',  # value of y when X = 0
    f'COEF: {model.coef_[0]:.2f}',  # increase in y for each unit of X
    f'R2 Score: {r2}',  # goodness of fit; 1.0 means a perfect fit
    f'MAE: {MAE}',  # mean absolute error of the predictions
    f'MSE: {MSE}',  # mean squared error of the predictions
    sep='\n',
)

INTERCEPT: 25019.10
COEF: 9577.17
R2 Score: 0.9746363787521536
MAE: 3038.6823612518833
MSE: 17978409.497344162


In [10]:
# Pairwise correlation between the columns of the frame.
data.corr()

Unnamed: 0,YearsExperience,Salary
YearsExperience,1.0,0.978242
Salary,0.978242,1.0


In [11]:
import numpy as np

# Years-of-experience values to score, laid out as an (n, 1) column so each
# row is one query.
# NOTE(review): 4 and 5 are separate entries (originally typed as `4,5`);
# confirm that a single value of 4.5 was not intended.
value_to_predict = np.array([1, 4, 5, 10, 10.5, 11.2, 12.4, 14])[:, np.newaxis]

def predicted_salary(x):
  """Predict salaries with the fitted global ``model``.

  Args:
    x: Years-of-experience values. May be a scalar, a 1-D sequence (list,
      tuple, array) or a 2-D array with one row per sample.

  Returns:
    list: One prediction per input row, in input order. Empty input yields
    an empty list.
  """
  features = np.asarray(x)
  if features.ndim < 2:
    # Scalars and flat sequences become a single-feature column.
    features = features.reshape(-1, 1)

  if features.shape[0] == 0:
    # predict() rejects zero-sample input; keep the "nothing in, nothing out"
    # behaviour of the original per-row loop.
    return []

  # The model was fitted on a DataFrame, so hand the column names back to it;
  # otherwise scikit-learn warns that X has no valid feature names.
  names = getattr(model, 'feature_names_in_', None)
  if names is not None and len(names) == features.shape[1]:
    features = pd.DataFrame(features, columns=names)

  # One vectorised predict() call instead of one call per row.
  return list(model.predict(features))

# Score every query value, then report each one next to its predicted salary.
predictions = predicted_salary(value_to_predict)

for years, salary in zip(value_to_predict[:, 0], predictions):
    print(f'Predicted Salary for {years} years of experience: {salary:,.0f}')

Predicted Salary for 1.0 years of experience: 34,596
Predicted Salary for 4.0 years of experience: 63,328
Predicted Salary for 5.0 years of experience: 72,905
Predicted Salary for 10.0 years of experience: 120,791
Predicted Salary for 10.5 years of experience: 125,579
Predicted Salary for 11.2 years of experience: 132,283
Predicted Salary for 12.4 years of experience: 143,776
Predicted Salary for 14.0 years of experience: 159,100




In [None]:
# Tabulate the queried experience values next to their predicted salaries.
df_predict = pd.DataFrame(
    {
        'YearsExperience': value_to_predict.flatten(),
        'PredictedSalary': predictions,
    }
)

# Base figure: one marker per prediction.
fig = px.scatter(
    df_predict,
    x='YearsExperience',
    y='PredictedSalary',
    title='Predicted Salary vs Years of Experience',
    labels=dict(
        YearsExperience='Years of Experience',
        PredictedSalary='Predicted Salary (R$)',
    ),
)

# Marker styling and custom hover text. This runs before the line trace is
# added, so it only affects the scatter markers.
fig.update_traces(
    marker={'size': 8, 'color': 'green'},
    hovertemplate=(
        '<b>Experience:</b> %{x:.1f} anos<br>'
        '<b>Predicted Salary:</b> R$ %{y:,.0f}<extra></extra>'
    ),
)

# Overlay a line through the same points by borrowing the trace of a px.line
# figure built from the same frame.
fig.add_trace(
    px.line(df_predict, x='YearsExperience', y='PredictedSalary').data[0]
)

# Optional layout tweaks.
fig.update_layout(
    title_font_size=20,
    xaxis_title='Years of Experience',
    yaxis_title='Predicted Salary (R$)',
    hoverlabel={'bgcolor': 'white', 'font_size': 12, 'font_family': 'Arial'},
)

fig.show()