In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

# Algoritmos de Regresión
from sklearn.linear_model import LinearRegression, Lasso, Lars, Ridge, BayesianRidge, ElasticNet
# Métricas
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.impute import SimpleImputer
import itertools
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

data_path = 'C:/Users/aldol/Programming/Diplo/DataExam_Students/'
os.listdir(data_path)


abnb_v = pd.read_csv(data_path + 'airbnb_test.csv')
abnb_t = pd.read_csv(data_path + 'airbnb_train.csv')

pd.set_option('display.max_columns', None)


# Imputación

In [3]:
cont=['bathrooms','bedrooms','beds']

imputador_media = SimpleImputer(strategy='median')
abnb_t[cont]=imputador_media.fit_transform(abnb_t[cont])

## Rankear categóricas por su mediana de precio

In [4]:
var_categoricas=['property_type','room_type','bed_type','cancellation_policy','cleaning_fee','city','host_identity_verified','instant_bookable','neighbourhood','host_response_rate','review_scores_rating']


def ranking_con_otros(df, columnas):
    diccionarios = {}  # Usamos un diccionario en lugar de una lista de diccionarios

    for col in columnas:
        df[col] = df[col].fillna('Otros')
        
        medianas = df.groupby(col)['price'].median().reset_index()
 
        frecuencias = df[col].value_counts()
        umbral = len(df) * 0.02
        categorias_otras = frecuencias[frecuencias < umbral].index
        medianas.loc[medianas[col].isin(categorias_otras), col] = 'Otros'

        min_value = medianas['price'].min()
        max_value = medianas['price'].max()
        medianas['indice'] = (medianas['price'] - min_value) / (max_value - min_value)
        diccionario = dict(zip(medianas[col], medianas['indice']))
        diccionarios[col] = diccionario

    return diccionarios

#ranking_con_otros(abnb_t, var_categoricas)

# Años siendo host, first review, last review

In [5]:
fechas=['host_since','first_review','last_review']

for fchs in fechas:
    abnb_t[fchs] = pd.to_datetime(abnb_t[fchs], errors='coerce')
    abnb_v[fchs] = pd.to_datetime(abnb_v[fchs], errors='coerce')

    hoy = datetime.now()  
    abnb_t[f'years_{fchs}'] = (hoy - abnb_t[fchs]).dt.days/365
    abnb_v[f'years_{fchs}'] = (hoy - abnb_v[fchs]).dt.days/365

## Ver si tienen ciertas amenidades

In [6]:
def has_amenities(df, amenities):
    for a in amenities:

        df[f'has_{a}'] = 0

        for index, row in df.iterrows():
            if a in row['amenities']:
                df.at[index, f'has_{a}'] = 1
    return df


In [7]:
amenities = ['Internet', 'Pets', 'Kitchen', 'Gym', 'parking', 'Pool', 'Air conditioning', 'Hot tub', 'Cable TV', 'Washer','Wheelchair','Gym','Elevator','Heating','friendly','Essentials','Iron','dryer']

resultado = has_amenities(abnb_t, amenities)
resultado_v=has_amenities(abnb_v, amenities)

In [8]:
abnb_t=pd.DataFrame(resultado)
abnb_v=pd.DataFrame(resultado_v)

# Relación accommodates con beds, bathrooms. bedrooms

In [9]:
bbb=['bathrooms', 'bedrooms', 'beds']

for b in bbb:
    abnb_t[f'accommodates_{b}']=abnb_t[b]/abnb_t['accommodates']
    abnb_v[f'accommodates_{b}']=abnb_v[b]/abnb_v['accommodates']

# Cuántas amenidades tienen y qué promedio

In [10]:
amenidades=['has_Internet',	'has_Pets',	'has_Kitchen',	'has_Gym',	'has_parking',	'has_Pool',	'has_Air conditioning',	'has_Hot tub',	'has_Cable TV',	'has_Washer','has_Wheelchair','has_Elevator','has_Heating','has_friendly','has_Essentials','has_Iron','has_dryer']


abnb_t['total_amenities'] = abnb_t[amenidades].sum(axis=1)
abnb_v['total_amenities'] = abnb_v[amenidades].sum(axis=1)

abnb_t['mean_amenities'] = abnb_t[amenidades].mean(axis=1)
abnb_v['mean_amenities'] = abnb_v[amenidades].mean(axis=1)

# Pasar a categorías numéricas

In [11]:
abnb_t['host_has_profile_pic'] = abnb_t['host_has_profile_pic'].map({'f': 0,np.nan:0,'t': 1}).astype(int)
abnb_v['host_has_profile_pic'] = abnb_v['host_has_profile_pic'].map({'f': 0,np.nan:0,'t': 1}).astype(int)

# Combinaciones de amenidades

In [12]:
caracteristicas = ['has_Internet', 'has_Pets', 'has_Kitchen', 'has_Gym',
                   'has_parking', 'has_Pool', 'has_Air conditioning', 'has_Hot tub',
                   'has_Cable TV', 'has_Washer','has_Wheelchair','has_Elevator',
                   'has_Heating','has_friendly','has_Essentials','has_Iron','has_dryer']

def generar_combinaciones(df):
    max_tamano_combinacion = 4
    
    for r in range(2, min(max_tamano_combinacion + 1, len(caracteristicas) + 1)):
        for combo in itertools.combinations(caracteristicas, r):
            nombre_columna = '_'.join(combo)
            df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)

    return df


In [13]:
abnb_t=generar_combinaciones(abnb_t)
abnb_v=generar_combinaciones(abnb_v)

  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].sum(axis=1).apply(lambda x: 1 if x == len(combo) else 0)
  df[nombre_columna] = df[list(combo)].s

# Clasificación de los clientes:

In [29]:
tar = 'price'
drop=['amenities',
 'description',
 'host_since',
 #'latitude',
 #'longitude',
 'last_review',
 'first_review',
 'name'#,
 #'thumbnail_url',
 #'zipcode'
 ]

aux=abnb_t.drop(columns=drop)
aux_v=abnb_v.drop(columns=drop)


X = aux.drop(columns=[tar])
X_v = aux_v.copy()

y = aux[tar].copy()
y_log = np.log(y)

# Tienen url

In [30]:
X['thumbnail_url'] = X['thumbnail_url'].notnull().astype(int)
X_v['thumbnail_url'] = X_v['thumbnail_url'].notnull().astype(int)

# Zipcode

In [31]:
X['zipcode'] = pd.to_numeric(X['zipcode'], errors='coerce')
X_v['zipcode'] = pd.to_numeric(X_v['zipcode'], errors='coerce')

In [32]:
dict_property_type={'House': 1.0,
  'Townhouse': 0.75,
  'Apartment': 0.5,
  'Otros': 0.25,
  'Condominium': 0.0}

dict_room_type={'Shared room': 0.0, 'Private room': 0.5, 'Entire home/apt': 1.0}

dict_bed_type= {'Otros': 0.0, 'Real Bed': 1.0}

dict_cancellation_policy={'flexible': 1.0,
'moderate': 0.6666666666666666,
'strict': 0.3333333333333333,
'Otros': 0.0}

dict_cleaning_fee={False: 0.0, True: 1.0}

dict_city={'Chicago': 0.0, 'LA': 0.2, 'NYC': 0.4, 'DC': 0.6, 'Boston': 0.8, 'SF': 1.0}

dict_host_identity_verified={'f': 0.0, 'Otros': 0.5, 't': 1.0}

dict_host_response_rate={'100%': 1.0, '90%': 0.5, 'Otros': 0.0}

instant_bookable={'t': 1.0, 'f': 0.0}

neighbourhood={'Bushwick': 1.0,
'Bedford-Stuyvesant': 0.6666666666666666,
'Williamsburg': 0.3333333333333333,
'Otros': 0.0}


dict_review_scores_rating= {80.0: 0.0,
  90.0: 0.08333333333333333,
  91.0: 0.16666666666666666,
  'Otros': 0.25,
  95.0: 0.3333333333333333,
  92.0: 0.4166666666666667,
  96.0: 0.5,
  93.0: 0.5833333333333334,
  94.0: 0.6666666666666666,
  97.0: 0.75,
  100.0: 0.8333333333333334,
  98.0: 0.9166666666666666,
  99.0: 1.0}

In [33]:
diccionarios={'property_type':dict_property_type,
 'room_type':dict_room_type,
 'bed_type':dict_bed_type,
 'cancellation_policy':dict_cancellation_policy,
 'cleaning_fee':dict_cleaning_fee,
 'city':dict_city,
 'host_identity_verified':dict_host_identity_verified,
 'host_response_rate':dict_host_response_rate,
 'instant_bookable':instant_bookable,
 'neighbourhood':neighbourhood,
 'review_scores_rating':dict_review_scores_rating}

def mapeo_diccionairios(df,diccionarios):

    for i, j in diccionarios.items():

        df[i] = df[i].map(j)
    
    return df

X.columns

In [34]:
X=mapeo_diccionairios(X,diccionarios)
X_v=mapeo_diccionairios(X_v,diccionarios)

# Host profile

In [35]:
X['host_identity_profile'] = 0  # Establece un valor predeterminado

# Verifica si hay al menos un dormitorio o una cama
X.loc[(X['host_identity_verified'] > 0) & (X['host_has_profile_pic'] > 0 ) , 'host_identity_profile'] = 1

  X['host_identity_profile'] = 0  # Establece un valor predeterminado


In [36]:
X_v['host_identity_profile'] = 0  # Establece un valor predeterminado

# Verifica si hay al menos un dormitorio o una cama
X_v.loc[(X_v['host_identity_verified'] > 0) & (X_v['host_has_profile_pic'] > 0 ) , 'host_identity_profile'] = 1

# Fillna

In [37]:
columnas_con_na = X.columns[X.isna().any()].tolist()

#Imputacion por mediana para las variables continuas
imputador_media = SimpleImputer(strategy='median')
X[columnas_con_na]=imputador_media.fit_transform(X[columnas_con_na])
X_v[columnas_con_na]=imputador_media.fit_transform(X_v[columnas_con_na])

In [38]:
X = X.fillna(0)
X_v=X_v.fillna(0)

# Características polinomiales

scaler = MinMaxScaler()

df_numeric =X[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bed_type',
       'cancellation_policy', 'cleaning_fee', 'city', 'host_identity_verified','instant_bookable', 'neighbourhood', 'number_of_reviews', 'bedrooms', 'beds',
       'years_host_since',	'years_first_review',	'years_last_review',	
       'accommodates_bathrooms',	'accommodates_bedrooms',	'accommodates_beds'	,
       'total_amenities',	'mean_amenities'
       ]]

# Ajusta y transforma el DataFrame
X = pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)

scaler = MinMaxScaler()

df_numeric =X_v[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bed_type',
       'cancellation_policy', 'cleaning_fee', 'city', 'host_identity_verified','instant_bookable', 'neighbourhood', 'number_of_reviews', 'bedrooms', 'beds',
       'years_host_since',	'years_first_review',	'years_last_review',	
       'accommodates_bathrooms',	'accommodates_bedrooms',	'accommodates_beds'	,
       'total_amenities',	'mean_amenities'
       ]]

# Ajusta y transforma el DataFrame
X_v = pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)

In [39]:
df_numeric =X[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bed_type',
       'cancellation_policy', 'cleaning_fee', 'city', 'host_identity_verified','instant_bookable', 'neighbourhood', 'number_of_reviews', 'bedrooms', 'beds',
       'years_host_since',	'years_first_review',	'years_last_review',	
       'accommodates_bathrooms',	'accommodates_bedrooms',	'accommodates_beds'	,
       'total_amenities',	'mean_amenities'
       ]]

poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df_numeric)

poly_df = pd.DataFrame(poly_features)

poly_feature_names = [f'poly_{i}' for i in range(poly_df.shape[1])]
poly_df.columns = poly_feature_names

X = pd.concat([X, poly_df], axis=1)

In [40]:
df_numeric = X_v[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bed_type',
       'cancellation_policy', 'cleaning_fee', 'city', 'host_identity_verified','instant_bookable', 'neighbourhood', 'number_of_reviews', 'bedrooms', 'beds',
       'years_host_since',	'years_first_review',	'years_last_review',	
       'accommodates_bathrooms',	'accommodates_bedrooms',	'accommodates_beds'	,
       'total_amenities',	'mean_amenities'
       ]]


poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df_numeric)

poly_df = pd.DataFrame(poly_features)

poly_feature_names = [f'poly_{i}' for i in range(poly_df.shape[1])]
poly_df.columns = poly_feature_names

X_v = pd.concat([X_v, poly_df], axis=1)

# Pueba de modelos

In [41]:
#Lineal
lr = LinearRegression(fit_intercept=True)
lr.fit(X, y_log)
y_p=lr.predict(X)

In [43]:
# Métricas:
print('Validation:')
print(f'MAE: {mean_absolute_error(y, np.exp(y_p))}')
print(f'R2: {r2_score(y , np.exp(y_p))}')

Validation:
MAE: 54.801916995026076
R2: 0.5149126616803267


In [44]:
y_p_v=lr.predict(X_v)

In [45]:
Equipo3_Airbnb = pd.DataFrame(np.exp(y_p_v), columns=['price'], index=aux_v.id).reset_index()
Equipo3_Airbnb.head()


Unnamed: 0,id,price
0,7702,85.296827
1,56643,349.644405
2,51825,85.660684
3,51470,116.46361
4,47904,92.230441


In [47]:
Equipo3_Airbnb.to_csv('Equipo3_Airbnb.csv')

54.801916995026076