# USA HOUSING CRISIS


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Ver / Descrever dados

In [None]:
# Load the dataset from the CSV file and preview its first rows.
df=pd.read_csv("../Datasets/USA_Housing.csv")
df.head()

In [None]:
# Summarize the DataFrame's columns (dtypes and non-null counts).
df.info()

In [None]:
# Show descriptive statistics for the columns.
df.describe()

In [None]:
# List the column names before they are renamed.
df.columns

In [None]:
# Replace the verbose dataset headers with short names used by the later cells.
short_names = {
    'Avg. Area Income': "Income",
    'Avg. Area House Age': "Age",
    'Avg. Area Number of Rooms': "Rooms",
    'Avg. Area Number of Bedrooms': "Bedrooms",
    'Area Population': "Population",
}
df.rename(columns=short_names, inplace=True)

In [None]:
# Annotated correlation heatmap; the Address column is left out of the matrix.
plt.figure(figsize=(6, 4))
correlations = df.drop(columns=['Address']).corr()
sns.heatmap(correlations, annot=True)

In [None]:
# Plot the pairwise relationships between the columns.
sns.pairplot(df)

In [None]:
# Check the column names after the rename.
df.columns

In [None]:
# Standardize the feature columns with StandardScaler (Price is left unscaled).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed in a later cell, so test rows influence the scaling
# statistics — consider fitting on the training split only.
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
columns_to_scale = ['Income', 'Age', 'Rooms', 'Bedrooms', 'Population']
df[columns_to_scale] = standardScaler.fit_transform(df[columns_to_scale])
df

In [None]:
# Target is the house price; every remaining column except Address is a predictor.
y = df['Price']
X = df.drop(columns=['Price', 'Address'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split.
lr =LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows.
lr_pred= lr.predict(X_test)

In [None]:
# Actual (x) versus predicted (y) prices on the test split.
plt.scatter(y_test, lr_pred, s=1)

In [None]:
# Distribution of the residuals (actual minus predicted price).
residuals = y_test - lr_pred
sns.displot(residuals)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_squared_error as mse

In [None]:
# Test-set metrics for the linear model; the MSE is computed once and reused for the RMSE.
lr_mse = mse(y_test, lr_pred)
print("R2: ", r2_score(y_test, lr_pred))
print("MAE: ", mae(y_test, lr_pred))
print("MSE: ", lr_mse)
print("RMSE: ", np.sqrt(lr_mse))

# 26/04/24

## Criar Funções para apresentar avaliação de performance

In [None]:
def evaluate(reais, previstos):
    """Compute regression metrics for a set of predictions.

    Args:
        reais: Observed target values.
        previstos: Predicted values, aligned with ``reais``.

    Returns:
        Tuple ``(mae, mse, rmse, r2)`` with every value rounded to two
        decimal places.
    """
    mean_sq_err = mse(reais, previstos)
    scores = (
        mae(reais, previstos),
        mean_sq_err,
        np.sqrt(mean_sq_err),  # RMSE is derived from the MSE
        r2_score(reais, previstos),
    )
    return tuple(round(score, 2) for score in scores)

def print_evaluate(model, reais, previstos):
    """Print the metrics of one model and append them to ``results_df``.

    Args:
        model: Label that identifies the model in the results table.
        reais: Observed target values.
        previstos: Predicted values, aligned with ``reais``.

    Side effects:
        Prints MAE, RMSE and R2, and adds one row to the module-level
        ``results_df`` DataFrame (row labels start at 1).
    """
    # Trailing underscores keep these locals from shadowing the imported
    # ``mae`` / ``mse`` metric functions.
    mae_, mse_, rmse_, r2_ = evaluate(reais, previstos)
    print("------------------------------------------")
    print("MAE:", mae_)
    print("RMSE:", rmse_)
    print("R2 score:", r2_)
    print("------------------------------------------")

    # Use a plain list: wrapping the row in np.array would coerce every metric
    # to a string (because of the text label), leaving results_df non-numeric.
    results_df.loc[len(results_df) + 1] = [model, mae_, mse_, rmse_, r2_]

# Results table: print_evaluate appends one row of metrics per evaluated model.
results_df = pd.DataFrame(columns=['Model', 'MAE','MSE','RMSE','R2'])

In [None]:
from time import time
from sklearn.model_selection import GridSearchCV

def grid_search(model, params):
    """Run a 5-fold grid search on the training split.

    Uses the module-level ``X_train`` / ``y_train`` and prints the elapsed
    time and the winning parameter combination.

    Args:
        model: Estimator to tune.
        params: Parameter grid handed to ``GridSearchCV``.

    Returns:
        The ``best_params_`` dict of the fitted search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5,
        n_jobs=-1,
        verbose=2,
    )

    started_at = time()
    search.fit(X_train, y_train)
    elapsed = time() - started_at

    print("Grid time: ", elapsed)
    winner = search.best_params_
    print(winner)
    return winner

In [None]:
# Record the Linear Regression results in the results table.
print_evaluate("Linear Regression", y_test,lr_pred)

# Random Forest

In [None]:
from sklearn.ensemble import  RandomForestRegressor

In [None]:
# Baseline random forest with default hyperparameters (seeded for reproducibility).
rf_reg = RandomForestRegressor(random_state=24).fit(X_train, y_train)
rf_pred = rf_reg.predict(X_test)

In [None]:
# Evaluate the baseline random forest. The row label was "Linear Regression",
# which mislabelled this model in results_df.
print("Random Forest Regressor:")
print_evaluate("Random Forest", y_test, rf_pred)

In [None]:
# List the hyperparameters of the baseline forest.
rf_reg.get_params()

# Optimize RF

In [None]:
# Grid search over the random forest hyperparameters.
model = RandomForestRegressor(random_state=24)
# `None` was dropped from max_features: for RandomForestRegressor it means the
# same as 1.0 (use all features), so it only refitted identical models.
params = {
    'n_estimators': [100, 500, 1000],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    'criterion': ['squared_error', 'poisson'],
}

best_params = grid_search(model, params)
best_params

In [None]:
# Refit a random forest with the best hyperparameters found by the grid search.
rf_reg_2 = RandomForestRegressor(
    random_state=24,
    n_estimators=best_params['n_estimators'],
    max_features=best_params['max_features'],
    max_depth=best_params['max_depth'],
    criterion=best_params['criterion'],
)
rf_reg_2.fit(X_train, y_train)
rf_pred_2 = rf_reg_2.predict(X_test)
print("Random Forest Regressor:")
# The row label was "Linear Regression"; give the tuned forest its own label.
print_evaluate("Random Forest Optimized", y_test, rf_pred_2)

# SVM

In [None]:
from sklearn.svm import SVR

In [None]:
# Baseline support vector regressor with default hyperparameters.
svr = SVR()

svr.fit(X_train, y_train)

svr_pred = svr.predict(X_test)
# The header used to read "Random Forest Regressor:", a copy-paste leftover.
print("Support Vector Regressor:")
print_evaluate("SVR", y_test, svr_pred)

In [None]:
# List the hyperparameters of the baseline SVR.
svr.get_params()

In [None]:
# Grid search over the SVR hyperparameters.
model=SVR()
params={
    'C':[1.0,0.1,10],
    'kernel':['rbf', 'poly'],
    # NOTE(review): degree only affects the 'poly' kernel, so every 'rbf'
    # candidate is fitted once per degree value with identical results.
    'degree':[3,5],
    'gamma':['scale',1]
}
best_params=grid_search(model,params)
best_params

In [None]:
# Refit an SVR with the best hyperparameters found by the grid search.
svr_2 = SVR(
    C=best_params['C'],
    kernel=best_params['kernel'],
    degree=best_params['degree'],
    gamma=best_params['gamma'],
)
svr_2.fit(X_train, y_train)

svr_2_pred = svr_2.predict(X_test)

# The header used to read "Random Forest Regressor:", a copy-paste leftover.
print("Support Vector Regressor (optimized):")
print_evaluate("SVR Optimized", y_test, svr_2_pred)

In [None]:
# Display the metrics recorded so far for each evaluated model.
results_df

# Aula 30/04/24 - XGBoost

In [None]:
from xgboost.sklearn import XGBRegressor

In [None]:
# Baseline XGBoost regressor with default hyperparameters, trained on the training split.
xgb = XGBRegressor().fit(X_train, y_train)

# Score the held-out test rows.
xgb_pred = xgb.predict(X_test)

print("XGBoost Regressor")
print_evaluate("XGBoost", y_test, xgb_pred)

In [None]:
# List the hyperparameters of the baseline XGBoost model.
xgb.get_params()

In [None]:
# Grid search over the XGBoost hyperparameters.
model = XGBRegressor()
params = {
    'booster': ['gbtree', 'gblinear'],
    # Was [0.01, 0.01]: the duplicated value fitted every candidate twice and
    # explored a single learning rate.
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 500],
    'objective': ['reg:squarederror'],
}

best_params = grid_search(model, params)
best_params

In [None]:
# Create an XGBoost regressor with the best hyperparameters from the grid search
xgb_2 = XGBRegressor(
    booster=best_params['booster'],
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    objective=best_params['objective'],
)

# Train it
xgb_2.fit(X_train, y_train)

# Make predictions on the test dataset
xgb_pred_2 = xgb_2.predict(X_test)

print("XGBoost Regressor")
# Use a distinct label so the tuned model's row in results_df can be told apart
# from the baseline "XGBoost" row (matches the "SVR Optimized" naming).
print_evaluate("XGBoost Optimized", y_test, xgb_pred_2)