In [1]:
# importar paquetes importantes

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler



In [2]:
df_train = pd.read_parquet("data/train.parquet")
df_test = pd.read_parquet("data/test.parquet")

In [3]:
#cantidad de datos 
df_train.shape
df_test.shape

(38498, 21)

In [4]:
df_train.head(1)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state
0,7048013474,https://boise.craigslist.org/apa/d/very-nice-b...,boise,https://boise.craigslist.org,1350,house,1200,2,2.0,1,...,0,0,0,w/d in unit,detached garage,https://images.craigslist.org/00B0B_cPiJMEheZe...,Super cute row house in the Boise bench area. ...,43.5851,-116.225,id


In [5]:
# Crear la columna 'category_price'
def create_category(price):
    if price <= 999:
        return 1
    else:
        return 0
    
df_train['category_price'] = df_train['price'].apply(create_category)
df_test['category_price'] = None

In [6]:
df_train.head(1)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state,category_price
0,7048013474,https://boise.craigslist.org/apa/d/very-nice-b...,boise,https://boise.craigslist.org,1350,house,1200,2,2.0,1,...,0,0,w/d in unit,detached garage,https://images.craigslist.org/00B0B_cPiJMEheZe...,Super cute row house in the Boise bench area. ...,43.5851,-116.225,id,0


In [7]:
#eliminar columnas innecesarias
df_train =df_train.drop(['id','url','state','region_url', 'laundry_options', 'parking_options', 'image_url','lat', 'long', 'description' ], axis= 1)

df_test =df_test.drop  (['id','url','state','region_url', 'laundry_options', 'parking_options', 'image_url','lat', 'long', 'description' ], axis= 1)

In [8]:
#categorizar

df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

In [9]:
df_train.head(1)

Unnamed: 0,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,...,type_condo,type_cottage/cabin,type_duplex,type_flat,type_house,type_in-law,type_land,type_loft,type_manufactured,type_townhouse
0,1350,1200,2,2.0,1,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [10]:
columns_to_use = set(df_train.columns).intersection(set(df_test.columns))
columns_to_use

{'baths',
 'beds',
 'cats_allowed',
 'comes_furnished',
 'dogs_allowed',
 'electric_vehicle_charge',
 'region_SF bay area',
 'region_abilene',
 'region_akron / canton',
 'region_albany',
 'region_albuquerque',
 'region_altoona-johnstown',
 'region_amarillo',
 'region_ames',
 'region_anchorage / mat-su',
 'region_ann arbor',
 'region_annapolis',
 'region_appleton-oshkosh-FDL',
 'region_asheville',
 'region_ashtabula',
 'region_athens',
 'region_atlanta',
 'region_auburn',
 'region_augusta',
 'region_austin',
 'region_bakersfield',
 'region_baltimore',
 'region_baton rouge',
 'region_battle creek',
 'region_beaumont / port arthur',
 'region_bellingham',
 'region_bemidji',
 'region_bend',
 'region_billings',
 'region_binghamton',
 'region_birmingham',
 'region_bismarck',
 'region_bloomington',
 'region_bloomington-normal',
 'region_boise',
 'region_boone',
 'region_boston',
 'region_boulder',
 'region_bowling green',
 'region_bozeman',
 'region_brainerd',
 'region_brownsville',
 'region_b

In [11]:
#Separar las características y la variable objetivo en conjunto de entrenamiento y prueba
X = df_train[columns_to_use]
y = df_train.category_price

  X = df_train[columns_to_use]


In [12]:
#Escalar los datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
#Crear el modelo
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)


In [14]:
#Entrenar el modelo
clf.fit(X_train, y_train)

In [15]:
RandomForestClassifier(n_jobs=-1, random_state=42)


In [16]:
#Hacer predicciones con el conjunto de prueba
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_train, y_train_pred))
print(metrics.accuracy_score(y_test, y_test_pred))

0.9801070051193616
0.9338345647656431


In [20]:
#Calcular la precisión
accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.9338345647656431


In [21]:
#Calcular el recall
conf_matrix = confusion_matrix(y_test, y_test_pred)
TP = conf_matrix[0, 0]
FN = conf_matrix[0, 1]
recall_low = TP / (TP + FN)
print("Recall for low: {:.2f}%".format(recall_low * 100))

Recall for low: 94.29%


In [17]:
#c
X_new = df_test[list(columns_to_use)]
y_pred = clf.predict(X_new)

In [18]:
##Exportar el DataFrame a un archivo csv
df_y_pred = pd.DataFrame(y_pred,columns=["category_price"])
df_y_pred.to_csv("data/Aljose30.csv", index=False)