# NACHO DOCHAO & MAX VILARASAU #

# Import, Read & Display #

In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error

# Load the dataset from the Excel workbook (path is relative to the working directory).
df = pd.read_excel(io='Beauty.xlsx')

# Bare last expression so Jupyter renders the DataFrame as the cell output.
df

Unnamed: 0,wage,exper,looks,union,goodhlth,black,female,married,south,bigcity,smllcity,service,educ
0,5.73,30.0,4.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,14.0
1,4.28,28.0,3.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,12.0
2,7.96,35.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,10.0
3,11.57,38.0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,16.0
4,11.42,27.0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,1.61,25.0,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,12.0
1256,1.68,4.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,12.0
1257,3.29,35.0,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,12.0
1258,2.31,15.0,3.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,10.0


# Train-Test Partition #

In [2]:
# Fix the global NumPy seed so the random partition is the same on every run.
np.random.seed(42)

# One Bernoulli(0.7) draw per row of `df`: True -> train, False -> test.
# The split is random per row, so the train share is about 70%, not exactly 70%.
n = df.shape[0]
index = np.random.binomial(n=1, p=0.7, size=n).astype(bool)

# Boolean-mask selection keeps the original row labels of `df` in both partitions.
train = df.loc[index]
test = df.loc[~index]

# Model Zero #

In [3]:
# Model Zero: OLS of log(wage) on all the listed regressors, fitted on the training partition.
modelpred0 = smf.ols('np.log(wage) ~ exper + looks + union + goodhlth + black + female + married + south + bigcity + smllcity + service + educ', data=train).fit()

# Predictions for the test partition. The response in the formula is np.log(wage),
# so these values are on the log-wage scale, not in wage units.
# Stored in a variable: a bare call in the middle of a cell is neither displayed nor
# kept, so the original computed the predictions and threw them away.
log_wage_pred0 = modelpred0.predict(test)

# Row label (original `df` index) of the 5th row of the test set.
# Read straight from the index instead of materialising the whole row as a Series.
i_five = test.index[4]

print(f'El individuo en la 5ª fila del test set es el individuo nº {i_five}')

El individuo en la 5ª fila del test set es el individuo nº 11


# Model One #

In [4]:
# Model One: simple OLS of wage on educ, fitted on the training partition.
modpred1 = smf.ols('wage ~ educ', data=train).fit()

# `train` and `test` were obtained by slicing `df`. `assign` returns a new DataFrame
# with the extra column, so the predictions are stored on independent copies rather
# than written into a slice (the original author copied the frames "to avoid an
# error" -- presumably pandas' SettingWithCopyWarning).
train = train.assign(wage_pred=modpred1.predict(train))
test = test.assign(wage_pred=modpred1.predict(test))

print('Test Set with Predictions:')
# First rows of the test set: observed wage, regressor and Model One prediction.
test.loc[:, ['wage', 'educ', 'wage_pred']].head()

Test Set with Predictions:


Unnamed: 0,wage,educ,wage_pred
1,4.28,12.0,6.111651
2,7.96,10.0,5.377926
7,7.69,16.0,7.5791
9,3.89,12.0,6.111651
11,4.03,16.0,7.5791


In [5]:
# Look the individual up by `i_five` (label of the 5th test row, computed in the
# Model Zero cell) instead of repeating the literal 11, which only matches the
# 5th test row for the current seed and split.
real_value = test.loc[i_five, 'wage']
predicted_value = test.loc[i_five, 'wage_pred']

# Error convention: observed minus predicted, so a negative error means the model
# predicted more than the observed wage.
prediction_error = real_value - predicted_value

print(f'Wage Real: {real_value}')
print(f'Wage Predicho: {predicted_value}')
print(f'Error: {real_value} - {predicted_value} = {prediction_error}')

# Report whether Model One under- or over-predicts for this individual.
# An exact tie would fall into the `else` branch.
if real_value > predicted_value:
    print(f'\nDado que {real_value} es mayor que {predicted_value}, la predicción es menor\nque el valor real y, por lo tanto, el modelo está infraprediciendo.')
else:
    print(f'\nDado que {real_value} es menor que {predicted_value}, la predicción es mayor\nque el valor real y, por lo tanto, el modelo está sobreprediciendo.')

Wage Real: 4.03
Wage Predicho: 7.579100101885763
Error: 4.03 - 7.579100101885763 = -3.549100101885763

Dado que 4.03 es menor que 7.579100101885763, la predicción es mayor
que el valor real y, por lo tanto, el modelo está sobreprediciendo.


In [6]:
# Model One error on each partition: mean squared error first ...
mse_value_test = mean_squared_error(y_true=test['wage'], y_pred=test['wage_pred'])
mse_value_train = mean_squared_error(y_true=train['wage'], y_pred=train['wage_pred'])

# ... then its square root (RMSE), which is in the same units as `wage`.
rmse_value_test = np.sqrt(mse_value_test)
rmse_value_train = np.sqrt(mse_value_train)

print(f'TEST -> Root Mean Squared Error: {rmse_value_test:.3f} (to the third decimal point)')
print(f'TRAIN -> Root Mean Squared Error: {rmse_value_train:.3f} (to the third decimal point)')

# A model usually fits the data it was trained on better than unseen data, so a
# larger test error is flagged as the expected case; anything else (ties included)
# is flagged as unexpected.
test_error_is_larger = rmse_value_test > rmse_value_train
print(
    '\nEl Error en TEST es mayor que en TRAIN; esto es lo esperable'
    if test_error_is_larger
    else '\nEl Error en TEST es menor que en TRAIN; esto NO es lo esperable'
)

TEST -> Root Mean Squared Error: 4.109 (to the third decimal point)
TRAIN -> Root Mean Squared Error: 4.731 (to the third decimal point)

El Error en TEST es menor que en TRAIN; esto NO es lo esperable


# Model Two #

In [7]:
# Model Two: simple OLS of wage on looks, fitted on the training partition.
modpred2 = smf.ols('wage ~ looks', data=train).fit()

# Work on fresh copies of both partitions before adding the new prediction column.
train = train.copy()
test = test.copy()

# Store the Model Two predictions next to the observed wage in each partition.
train['wage_pred_2'] = modpred2.predict(train)
test['wage_pred_2'] = modpred2.predict(test)

print('Test Set with Predictions:')
# First rows of the test set: observed wage, regressor and Model Two prediction.
test.loc[:, ['wage', 'looks', 'wage_pred_2']].head()

Test Set with Predictions:


Unnamed: 0,wage,looks,wage_pred_2
1,4.28,3.0,6.247255
2,7.96,4.0,6.621805
7,7.69,4.0,6.621805
9,3.89,3.0,6.247255
11,4.03,4.0,6.621805


In [8]:
def _mse_and_rmse(frame, pred_col):
    """Return ``(mse, rmse)`` of column ``pred_col`` against the ``wage`` column of ``frame``."""
    mse = mean_squared_error(frame['wage'], frame[pred_col])
    return mse, np.sqrt(mse)


# Model One (wage ~ educ): predictions live in the `wage_pred` column.
educ_mse_value_test, educ_rmse_value_test = _mse_and_rmse(test, 'wage_pred')
educ_mse_value_train, educ_rmse_value_train = _mse_and_rmse(train, 'wage_pred')

print(f'TEST EDUC -> Root Mean Squared Error: {educ_rmse_value_test:.3f} (to the third decimal point)')
print(f'TRAIN EDUC -> Root Mean Squared Error: {educ_rmse_value_train:.3f} (to the third decimal point)')

# Model Two (wage ~ looks): predictions live in the `wage_pred_2` column.
looks_mse_value_test, looks_rmse_value_test = _mse_and_rmse(test, 'wage_pred_2')
looks_mse_value_train, looks_rmse_value_train = _mse_and_rmse(train, 'wage_pred_2')

print(f'TEST LOOKS -> Root Mean Squared Error: {looks_rmse_value_test:.3f} (to the third decimal point)')
print(f'TRAIN LOOKS -> Root Mean Squared Error: {looks_rmse_value_train:.3f} (to the third decimal point)')

# Compare the two models on the test partition only; the lower RMSE wins.
# A tie is reported in favour of Model One (the `else` branch).
if looks_rmse_value_test < educ_rmse_value_test:
    print(f'\nEl RMSE del modelo 2 que contempla la variable looks ({looks_rmse_value_test}) tiene un\nRMSE menor que el modelo 1 que contempla la variable educ ({educ_rmse_value_test}).')
else:
    print(f'\nEl RMSE del modelo 1 que contempla la variable educ ({educ_rmse_value_test}) tiene un\nRMSE menor que el modelo 2 que contempla la variable looks ({looks_rmse_value_test}).')

TEST EDUC -> Root Mean Squared Error: 4.109 (to the third decimal point)
TRAIN EDUC -> Root Mean Squared Error: 4.731 (to the third decimal point)
TEST LOOKS -> Root Mean Squared Error: 4.240 (to the third decimal point)
TRAIN LOOKS -> Root Mean Squared Error: 4.818 (to the third decimal point)

El RMSE del modelo 1 que contempla la variable educ (4.109053329782886) tiene un
RMSE menor que el modelo 2 que contempla la variable looks (4.239846271969317).
