In [21]:
import pandas as pd
import numpy as np
import re

import sidetable as stb

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import skew
from scipy.stats import kurtosistest
from scipy import stats

plt.rcParams['figure.figsize'] = (15, 10)
pd.options.display.max_columns = None
# Gestión de nulos
# ======================
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv("files/Hostel.csv", index_col= 0)

In [3]:
df.shape

(342, 15)

In [4]:
df.dtypes

hostel.name       object
City              object
price.from         int64
Distance          object
summary.score    float64
rating.band       object
atmosphere       float64
cleanliness      float64
facilities       float64
location.y       float64
security         float64
staff            float64
valueformoney    float64
lon              float64
lat              float64
dtype: object

In [5]:
df.isnull().sum()

hostel.name       0
City              0
price.from        0
Distance          0
summary.score    15
rating.band      15
atmosphere       15
cleanliness      15
facilities       15
location.y       15
security         15
staff            15
valueformoney    15
lon              44
lat              44
dtype: int64

In [6]:
df.isnull().sum() * 100 / df.shape[0]

hostel.name       0.000000
City              0.000000
price.from        0.000000
Distance          0.000000
summary.score     4.385965
rating.band       4.385965
atmosphere        4.385965
cleanliness       4.385965
facilities        4.385965
location.y        4.385965
security          4.385965
staff             4.385965
valueformoney     4.385965
lon              12.865497
lat              12.865497
dtype: float64

In [7]:
numericas= df.select_dtypes(include=np.number)

In [8]:
imputerKNN = KNNImputer(n_neighbors=5)
imputerKNN.fit(numericas)

In [9]:
numericas_knn= imputerKNN.transform(numericas)

In [10]:
df_knn_imputer = pd.DataFrame(numericas_knn, columns = numericas.columns)

In [11]:
df_knn_imputer.head()

Unnamed: 0,price.from,summary.score,atmosphere,cleanliness,facilities,location.y,security,staff,valueformoney,lon,lat
0,3300.0,9.2,8.9,9.4,9.3,8.9,9.0,9.4,9.4,135.513767,34.682678
1,2600.0,9.5,9.4,9.7,9.5,9.7,9.2,9.7,9.5,135.806311,34.88932
2,3600.0,8.7,8.0,7.0,9.0,8.0,10.0,10.0,9.0,139.777472,35.697447
3,2600.0,7.4,8.0,7.5,7.5,7.5,7.0,8.0,6.5,139.783667,35.712716
4,1500.0,9.4,9.5,9.5,9.0,9.0,9.5,10.0,9.5,139.798371,35.727898


In [12]:
columnas_knn = df_knn_imputer.columns

In [13]:
df.drop(columnas_knn, axis = 1, inplace = True)

In [14]:
df[columnas_knn] = numericas_knn

In [15]:
df.isnull().sum()

hostel.name       0
City              0
Distance          0
rating.band      15
price.from        0
summary.score     0
atmosphere        0
cleanliness       0
facilities        0
location.y        0
security          0
staff             0
valueformoney     0
lon               0
lat               0
dtype: int64

In [16]:
df["rating.band"].mode()

0    Superb
dtype: object

In [17]:
df.dropna(inplace=True, axis= 0, subset=["rating.band"])

In [18]:
df.isnull().sum()

hostel.name      0
City             0
Distance         0
rating.band      0
price.from       0
summary.score    0
atmosphere       0
cleanliness      0
facilities       0
location.y       0
security         0
staff            0
valueformoney    0
lon              0
lat              0
dtype: int64

In [19]:
df.duplicated().sum()

0

In [20]:
df.head(3)

Unnamed: 0,hostel.name,City,Distance,rating.band,price.from,summary.score,atmosphere,cleanliness,facilities,location.y,security,staff,valueformoney,lon,lat
1,"""Bike & Bed"" CharinCo Hostel",Osaka,2.9km from city centre,Superb,3300.0,9.2,8.9,9.4,9.3,8.9,9.0,9.4,9.4,135.513767,34.682678
2,& And Hostel,Fukuoka-City,0.7km from city centre,Superb,2600.0,9.5,9.4,9.7,9.5,9.7,9.2,9.7,9.5,135.806311,34.88932
3,&And Hostel Akihabara,Tokyo,7.8km from city centre,Fabulous,3600.0,8.7,8.0,7.0,9.0,8.0,10.0,10.0,9.0,139.777472,35.697447


In [24]:
df['Distance']= df['Distance'].apply(lambda x: re.search('^[0-9].[0-9]', x))

1      <re.Match object; span=(0, 3), match='2.9'>
2      <re.Match object; span=(0, 3), match='0.7'>
3      <re.Match object; span=(0, 3), match='7.8'>
4      <re.Match object; span=(0, 3), match='8.7'>
5                                             None
                          ...                     
338    <re.Match object; span=(0, 3), match='2.6'>
339    <re.Match object; span=(0, 3), match='2.9'>
340                                           None
341    <re.Match object; span=(0, 3), match='2.4'>
342    <re.Match object; span=(0, 3), match='5.9'>
Name: Distance, Length: 327, dtype: object

In [26]:
def extract_and_convert(x):
    match = re.search(r'^[0-9].[0-9]', x)
    if match:
        return float(match.group())
    else:
        return None  # En caso de no encontrar una coincidencia

In [27]:
df['DistanceNum'] = df['Distance'].apply(extract_and_convert)

In [28]:
df.head(2)

Unnamed: 0,hostel.name,City,Distance,rating.band,price.from,summary.score,atmosphere,cleanliness,facilities,location.y,security,staff,valueformoney,lon,lat,DistanceNum
1,"""Bike & Bed"" CharinCo Hostel",Osaka,2.9km from city centre,Superb,3300.0,9.2,8.9,9.4,9.3,8.9,9.0,9.4,9.4,135.513767,34.682678,2.9
2,& And Hostel,Fukuoka-City,0.7km from city centre,Superb,2600.0,9.5,9.4,9.7,9.5,9.7,9.2,9.7,9.5,135.806311,34.88932,0.7


In [29]:
df.drop(columns=['Distance'], axis = 1, inplace=True)

In [30]:
df.head(2)

Unnamed: 0,hostel.name,City,rating.band,price.from,summary.score,atmosphere,cleanliness,facilities,location.y,security,staff,valueformoney,lon,lat,DistanceNum
1,"""Bike & Bed"" CharinCo Hostel",Osaka,Superb,3300.0,9.2,8.9,9.4,9.3,8.9,9.0,9.4,9.4,135.513767,34.682678,2.9
2,& And Hostel,Fukuoka-City,Superb,2600.0,9.5,9.4,9.7,9.5,9.7,9.2,9.7,9.5,135.806311,34.88932,0.7


In [32]:
df.to_csv("files/japon_limpio2.csv")