In [None]:
pip install pandas scikit-learn imbalanced-learn

SyntaxError: invalid syntax (747503510.py, line 1)

In [1]:
import pandas as pd


df = pd.read_csv("/Users/eduardaferreira/Desktop/AB_NYC_2019.csv")

# Ver informações básicas
print("Tamanho do dataset (linhas, colunas):", df.shape)
print("\nColunas disponíveis:")
print(df.columns)
print("\nPrimeiras 5 linhas:")
print(df.head())

Tamanho do dataset (linhas, colunas): (48895, 16)

Colunas disponíveis:
Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

Primeiras 5 linhas:
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Man

In [2]:

print("\nValores faltantes por coluna:")
print(df.isnull().sum())


Valores faltantes por coluna:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [3]:
# Estatísticas descritivas
print("\nEstatísticas de colunas numéricas:")
print(df[['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365']].describe())


Estatísticas de colunas numéricas:
              price  minimum_nights  number_of_reviews  reviews_per_month  \
count  48895.000000    48895.000000       48895.000000       38843.000000   
mean     152.720687        7.029962          23.274466           1.373221   
std      240.154170       20.510550          44.550582           1.680442   
min        0.000000        1.000000           0.000000           0.010000   
25%       69.000000        1.000000           1.000000           0.190000   
50%      106.000000        3.000000           5.000000           0.720000   
75%      175.000000        5.000000          24.000000           2.020000   
max    10000.000000     1250.000000         629.000000          58.500000   

       availability_365  
count      48895.000000  
mean         112.781327  
std          131.622289  
min            0.000000  
25%            0.000000  
50%           45.000000  
75%          227.000000  
max          365.000000  


In [7]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

df = pd.read_csv("/Users/eduardaferreira/Desktop/AB_NYC_2019.csv")
print("Tamanho inicial (linhas, colunas):", df.shape)

# Passo 2: valores omissos
# Descartar colunas irrelevantes
df = df.drop(['name', 'host_name', 'last_review'], axis=1)

df.loc[df['number_of_reviews'] == 0, 'reviews_per_month'] = 0
imputer = SimpleImputer(strategy='median')
df[['reviews_per_month']] = imputer.fit_transform(df[['reviews_per_month']])

print("Valores faltantes após tratamento:")
print(df.isnull().sum())

# Passo 3: Remover outliers
def replace_outliers(df, column, min_value=None):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
   
    if min_value is not None and lower_bound < min_value:
        lower_bound = min_value
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df

df = replace_outliers(df, 'price', min_value=10)  # Definir mínimo de $10 para price
df = replace_outliers(df, 'minimum_nights')

print("Estatísticas de 'price' e 'minimum_nights' após substituição de outliers (ajustado):")
print(df[['price', 'minimum_nights']].describe())

# Passo 4: Normalizar valores numéricos

numeric_cols = ['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365']

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("Valores normalizados (primeiras 5 linhas):")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df[numeric_cols].head())
    
# Passo 5: Codificar variáveis categóricas (ajustado para não dropar categorias)
encoder = OneHotEncoder(sparse_output=False)  # Removido o drop='first'
encoded_cats = encoder.fit_transform(df[['neighbourhood_group', 'room_type']])
encoded_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out())
df = pd.concat([df.drop(['neighbourhood_group', 'room_type'], axis=1), encoded_df], axis=1)

# Simplificar neighbourhood com PCP (top 20)
top_neighbourhoods = df['neighbourhood'].value_counts().head(20).index
df['neighbourhood'] = df['neighbourhood'].apply(lambda x: x if x in top_neighbourhoods else 'Outros')
encoded_neigh = encoder.fit_transform(df[['neighbourhood']])
encoded_neigh_df = pd.DataFrame(encoded_neigh, columns=encoder.get_feature_names_out())
df = pd.concat([df.drop('neighbourhood', axis=1), encoded_neigh_df], axis=1)
print("Colunas após codificação:")
print(df.columns)

#Passo 6: Selecionar atributos relevantes 
cols_to_drop = ['id', 'host_id']
df_for_corr = df.drop(cols_to_drop, axis=1)
spearman_corr = df_for_corr.corr(method='spearman')['price'].abs()
relevant_features = spearman_corr[spearman_corr > 0.1].index
df = df[relevant_features.union(['price'])]
print("Colunas após seleção de atributos:")
print(df.columns)

df.to_csv("dataset_tratado.csv", index=False)

import os
os.getcwd()


Tamanho inicial (linhas, colunas): (48895, 16)
Valores faltantes após tratamento:
id                                0
host_id                           0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64
Estatísticas de 'price' e 'minimum_nights' após substituição de outliers (ajustado):
              price  minimum_nights
count  48895.000000     48895.00000
mean     132.982002         3.82532
std       83.527057         3.32072
min       10.000000         1.00000
25%       69.000000         1.00000
50%      106.000000         3.00000
75%      175.000000         5.00000
max      334.000000        11.00000
Valores normalizados (primei

'/Users/eduardaferreira/Desktop/projeto paradigmas'