# Imports

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, NumericType, IntegerType

# Utility functions

In [0]:


def get_missing_stats_df(df):
    """
    Summarizes missing values for every column of a Spark DataFrame.

    A value counts as missing when it is null; for string columns an empty
    string ("") counts as missing as well.

    Parameters:
    - df: Input Spark DataFrame

    Returns:
    A pandas DataFrame with one row per input column and the columns
    "column", "missing_count", "missing_percent" and "data_type", sorted by
    "missing_count" in descending order. For an input without rows every
    count is 0 and every percentage is 0.0.
    """
    total_rows = df.count()
    # Build the name -> type lookup once instead of once per column.
    dtype_by_column = dict(df.dtypes)

    missing_counts = []

    for col_name, col_type in df.dtypes:
        is_missing = F.col(col_name).isNull()
        # For string columns, empty strings count as missing too
        if col_type == 'string':
            is_missing = is_missing | (F.col(col_name) == "")

        # when() without otherwise() yields null for non-matching rows and
        # count() skips nulls, so this counts the matching rows. Unlike sum(),
        # count() returns 0 rather than null when the DataFrame has no rows.
        missing_counts.append(F.count(F.when(is_missing, 1)).alias(col_name))

    # Collect results: a single aggregated row, transposed to one row per column
    missing = (
        df.select(missing_counts)
        .toPandas()
        .T
        .reset_index()
        .rename(columns={"index": "column", 0: "missing_count"})
    )

    if total_rows:
        missing["missing_percent"] = (missing["missing_count"] / total_rows) * 100
    else:
        # No rows at all: report 0% instead of dividing by zero.
        missing["missing_percent"] = 0.0
    missing["data_type"] = missing["column"].map(dtype_by_column)

    # Sort by missing count descending for better visibility
    missing = missing.sort_values("missing_count", ascending=False)

    return missing

In [0]:
def get_rows_with_missing_values(df, include_missing_flags=True):
    """
    Returns a DataFrame containing only rows that have at least one missing value.

    A value counts as missing when it is null; for string columns an empty
    string ("") counts as missing as well.

    Parameters:
    - df: Input DataFrame
    - include_missing_flags: If True, adds one boolean column per input column,
      named "missing_<column>", showing which fields are missing

    Returns:
    The filtered DataFrame, with the flag columns appended when requested.
    """
    # Start from a constant False so that a DataFrame without columns yields
    # an empty result instead of raising IndexError.
    combined_condition = F.lit(False)
    flag_columns = []

    for col_name, col_type in df.dtypes:
        condition = F.col(col_name).isNull()
        if col_type == 'string':
            condition = condition | (F.col(col_name) == "")

        # A row qualifies as soon as any single column is missing.
        combined_condition = combined_condition | condition

        # Only build the flag expressions when they will actually be selected.
        if include_missing_flags:
            flag_columns.append(
                F.when(condition, True).otherwise(False).alias(f"missing_{col_name}")
            )

    # Filter rows with missing values
    missing_rows_df = df.filter(combined_condition)

    # Add missing flags if requested
    if include_missing_flags:
        missing_rows_df = missing_rows_df.select("*", *flag_columns)

    return missing_rows_df


# Analysis

In [0]:
# Listing attributes used throughout this analysis.
columns_to_keep = [
    'host_id',
    'host_since',
    'host_is_superhost',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'amenities',
    'price',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
    'review_scores_rating',
    'license',
    'instant_bookable',
    'reviews_per_month',
]

# Read the raw listings table, keep only those attributes, and preview the result.
df = (
    spark.read
    .table('airbnb.raw.listings')
    .select(columns_to_keep)
)
display(df)

In [0]:
# Print the (column name, type) pairs of the selected columns.
print(df.dtypes)


In [0]:
# Render the output of Spark's describe() for the selected columns.
display(df.describe())

In [0]:
# Render the output of Spark's summary() for the selected columns.
display(df.summary())


In [0]:
# Per-column missing counts and percentages (nulls, plus empty strings for
# string columns), sorted with the most incomplete columns first.
df_missing_stats = get_missing_stats_df(df)
display(df_missing_stats)

Because the _bathrooms_ column contains only null values, we drop it to get a cleaner dataset.


In [0]:
# Same columns as columns_to_keep, without 'bathrooms'.
columns = [
    'host_id','host_since','host_is_superhost','latitude','longitude','property_type','room_type','accommodates', 'bathrooms_text','bedrooms','beds','amenities','price','minimum_nights','maximum_nights', 'number_of_reviews','review_scores_rating','license','instant_bookable','reviews_per_month'
]

df = df.select(columns)
df_missing_rows = get_rows_with_missing_values(df, include_missing_flags=True)

# Count each DataFrame exactly once: every count() call runs a Spark job.
missing_row_count = df_missing_rows.count()
# The total must come from the full dataset, not from the filtered rows,
# otherwise the percentage below is always 100%.
total_row_count = df.count()
# Guard against an empty dataset to avoid a ZeroDivisionError.
missing_row_percent = (missing_row_count / total_row_count) * 100 if total_row_count else 0.0

print(f"Rows with missing values: {missing_row_count}")
print(f"Total rows in dataset: {total_row_count}")
print(f"Missing rows percentage: {missing_row_percent:.2f}%")

# Show sample of missing rows
display(df_missing_rows.limit(20))

In [0]:
# Show the (column name, type) pairs of df as it currently stands.
df.dtypes

# Correlation of missingness between columns

In [0]:
def _missing_condition(col_name, col_type):
    """Null check for a column, extended to empty strings for string columns."""
    condition = F.col(col_name).isNull()
    if col_type == 'string':
        condition = condition | (F.col(col_name) == "")
    return condition


# One indicator column per input column: 0 = value missing, 1 = value present.
# "Missing" follows the same rule as get_missing_stats_df and
# get_rows_with_missing_values, so the correlations below describe the same
# missingness that the statistics above report.
null_indicator_df = df.select([
    F.when(_missing_condition(col_name, col_type), 0).otherwise(1).alias(col_name)
    for col_name, col_type in df.dtypes
])

In [0]:
# Convert to vector column for ML correlation: Correlation.corr works on a
# single vector column, so all indicator columns are assembled into "features".
assembler = VectorAssembler(
    inputCols=null_indicator_df.columns,
    outputCol="features"
)

vector_df = assembler.transform(null_indicator_df)

# Calculate correlation matrix; the result is a one-row DataFrame whose first
# field holds the matrix.
# NOTE(review): a column whose indicator never changes (e.g. no missing values
# at all) has no defined correlation and is expected to show up as NaN.
corr_matrix = Correlation.corr(vector_df, "features").head()[0]

# Convert to pandas for better visualization, labelling rows and columns with
# the original column names.
corr_array = corr_matrix.toArray()
corr_pd = pd.DataFrame(corr_array,
                      columns=null_indicator_df.columns,
                      index=null_indicator_df.columns)

print(corr_pd)

In [0]:
# Blank out every cell whose absolute correlation does not exceed `threshold`.
# With threshold = 0 only exact zeros are hidden; raise it to keep just the
# stronger correlations.
threshold = 0
filtered_corr = corr_pd.where(corr_pd.abs().gt(threshold))

plt.figure(figsize=(14, 10))
sns.heatmap(filtered_corr, annot=True, cmap='RdBu_r', center=0, fmt='.2f', square=True)
plt.title(f'Missing Data Correlation (|r| > {threshold})')
plt.tight_layout()
plt.show()

# Price column analysis

In [0]:
# Strip letters, digits and whitespace from each price, split what is left
# into single characters, and count how often each character occurs.
symbol_counts = (
    df
    .select(
        F.explode(
            F.split(
                F.regexp_replace(F.col("price").cast("string"), "[a-zA-Z0-9\\s]", ""),
                ""
            )
        ).alias("symbol")
    )
    # Drop empty strings so that only real characters are counted.
    .filter(F.col("symbol") != "")
    .groupBy("symbol")
    .count()
    .orderBy(F.desc("count"))
)

# Convert to Pandas for visualization
symbol_counts_pd = symbol_counts.toPandas()

# Create bar chart
plt.figure(figsize=(12, 6))
plt.bar(symbol_counts_pd['symbol'], symbol_counts_pd['count'])
plt.title('Symbol Frequency in Price Column')
plt.xlabel('Symbols')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print unique symbols, reusing the already collected pandas result instead of
# running the Spark aggregation a second time.
symbols_set = set(symbol_counts_pd['symbol'])
print("Unique symbols in price column:", symbols_set)

As we can see, there are no currencies other than dollars. The dataset contains 44 rows that use "," instead of ".", which can easily be replaced.