In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import data_cleaning_functions as dcf

# Render floats with two decimals in every pandas display that follows.
pd.options.display.float_format = "{:.2f}".format

# Load the apartment-for-rent CSV that the rest of the notebook works on.
df = pd.read_csv("data/istanbulApartmentForRent.csv")

In [None]:
# Show the frame loaded from the CSV as this cell's output.
df

In [None]:
# Print the frame's column overview (dtypes and non-null counts).
df.info()

In [None]:
# Descriptive statistics for the frame's columns.
df.describe()

In [None]:
# List the column labels.
df.columns

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Price summary per neighborhood: total, minimum, maximum and mean.
price_by_neighborhood = df.groupby("neighborhood")["price"]
price_by_neighborhood.agg(["sum", "min", "max", "mean"])

In [None]:
# Labels of the columns stored as float64 / int64; later cells reuse this selection.
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns

# Print the basic statistics of every numeric column, one block per column.
summary_stats = ["mean", "median", "std", "max", "min"]
for col in numeric_cols:
    print(col)
    print(df[col].agg(summary_stats))
    print(" ")

In [None]:
# One boxplot per numeric column, five panels per row.
# The number of rows follows the number of columns, so indexing the axes can
# never overflow (the fixed 2x5 grid raised IndexError beyond 10 columns) and
# panels that receive no column are hidden instead of being drawn empty.
PANELS_PER_ROW = 5
n_panels = len(numeric_cols)
n_rows = max(1, (n_panels + PANELS_PER_ROW - 1) // PANELS_PER_ROW)  # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_rows, PANELS_PER_ROW, figsize=(20, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    sns.boxplot(data=df, y=col, ax=ax)
    ax.set_title(col)

# Hide the panels that were not used.
for ax in axes[n_panels:]:
    ax.set_visible(False)

In [None]:
# Every numeric column except "price" is passed through the outlier cleaner.
changing_cols = [name for name in numeric_cols if name != "price"]

# Overwrite each selected column with the result of dcf.clean_outliers.
# NOTE(review): the cleaning rule lives in data_cleaning_functions and is not
# visible here - confirm what it does to the values.
for name in changing_cols:
    cleaned = dcf.clean_outliers(df[name])
    df[name] = cleaned

In [None]:
# Boxplots of the numeric columns again, this time after the outlier cleaning.
# The grid is sized from the number of columns (five panels per row) so that
# indexing the axes cannot overflow and unused panels are hidden rather than
# drawn empty.
PANELS_PER_ROW = 5
n_panels = len(numeric_cols)
n_rows = max(1, (n_panels + PANELS_PER_ROW - 1) // PANELS_PER_ROW)  # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_rows, PANELS_PER_ROW, figsize=(20, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    sns.boxplot(data=df, y=col, ax=ax)
    ax.set_title(col)

# Hide the panels that were not used.
for ax in axes[n_panels:]:
    ax.set_visible(False)

In [None]:
# Per-column statistics of the numeric columns, each under an upper-cased heading.
for col in numeric_cols:
    heading = col.upper()
    print(heading)
    print(df[col].agg(["mean", "median", "std", "max", "min"]))
    print(" ")

In [None]:
# Show the price column on its own.
df["price"]

In [None]:
# Distribution of price, followed by its 99th and 99.9th percentiles.
price = df["price"]
print(price.describe())
print(f"99. percentile: {price.quantile(0.99)}")
print(f"99.9. percentile: {price.quantile(0.999)}")

In [None]:
# Price per square metre: price divided by area, row by row.
df["m2 price"] = df["price"].div(df["area (m2)"])
df

In [None]:
# Composite score "house_value" = room * m2 price / age.
# An age of 0 is replaced by 1 so the division does not hit zero.
age_divisor = df["age"].replace(0, 1)
df["house_value"] = df["room"].mul(df["m2 price"]).div(age_divisor)
df

In [None]:
# Mean, median and non-null count of house_value per (district, neighborhood).
house_value_by_area = df.groupby(["district", "neighborhood"])["house_value"]
grouped = house_value_by_area.agg(["mean", "median", "count"])
grouped

In [None]:
# Boxplots of the numeric columns, including the derived "m2 price" and
# "house_value". `numeric_cols` was captured before those two columns were
# added, so plotting it would only repeat the previous figure; the columns are
# therefore re-selected here (numeric_cols itself is left untouched).
plot_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Size the grid from the number of columns (five panels per row) so that
# indexing the axes cannot overflow, and hide whatever panels stay unused.
PANELS_PER_ROW = 5
n_panels = len(plot_cols)
n_rows = max(1, (n_panels + PANELS_PER_ROW - 1) // PANELS_PER_ROW)  # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_rows, PANELS_PER_ROW, figsize=(20, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, plot_cols):
    sns.boxplot(data=df, y=col, ax=ax)
    ax.set_title(col)

for ax in axes[n_panels:]:
    ax.set_visible(False)

In [None]:
def remove_outliers_aggressive(df, n_std=1.5, min_group_size=4):
    """Replace outlying ``house_value`` entries with the mean of their group.

    Rows are grouped by (district, neighborhood). A neighborhood holding at
    least ``min_group_size`` rows is judged against its own mean and sample
    standard deviation; a smaller neighborhood falls back to the mean and
    standard deviation of its whole district. Values lying more than
    ``n_std`` standard deviations away from the reference mean are
    overwritten with that mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'district', 'neighborhood' and 'house_value'.
    n_std : float, default 1.5
        Half-width of the accepted band, in standard deviations.
    min_group_size : int, default 4
        Minimum number of rows for a neighborhood to use its own statistics.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the outliers replaced; ``df`` is not modified.
    """
    df_clean = df.copy()

    # District-level mean / std, used as the fallback for small neighborhoods.
    district_stats = df.groupby('district')['house_value'].agg(['mean', 'std'])

    for (district, _neighborhood), group in df.groupby(['district', 'neighborhood']):
        values = group['house_value']

        if len(group) >= min_group_size:
            # Enough rows: use the neighborhood's own statistics.
            center = values.mean()
            spread = values.std()
        else:
            # Too few rows: use the statistics of the whole district.
            center = district_stats.at[district, 'mean']
            spread = district_stats.at[district, 'std']

        lower_bound = center - n_std * spread
        upper_bound = center + n_std * spread

        # A NaN spread (e.g. a district with a single row) makes both
        # comparisons False, so nothing is replaced in that case.
        outliers = (values < lower_bound) | (values > upper_bound)

        if outliers.any():
            df_clean.loc[group.index[outliers], 'house_value'] = center

    return df_clean

# First cleaning pass; the function works on a copy, so `df` keeps its original house_value.
df_clean = remove_outliers_aggressive(df)

In [None]:
# house_value statistics per (district, neighborhood) on `df`, i.e. before outlier replacement.
value_stats = ["mean", "median", "count", "min", "max"]
grouped = df.groupby(["district", "neighborhood"])["house_value"].agg(value_stats)
grouped

In [None]:
# The same house_value statistics, computed on the frame returned by the outlier replacement.
value_stats = ["mean", "median", "count", "min", "max"]
grouped = df_clean.groupby(["district", "neighborhood"])["house_value"].agg(value_stats)
grouped

In [None]:
# Horizontal boxplot of house_value after the first cleaning pass.
sns.boxplot(data=df_clean, x="house_value")

In [None]:
# Clip the top 5% and the bottom 5% of house_value.
# The percentile bounds are taken from the raw frame `df`, not from df_clean.
lower_cap = df["house_value"].quantile(0.05)
upper_cap = df["house_value"].quantile(0.95)
df_clean["house_value"] = df_clean["house_value"].clip(lower=lower_cap, upper=upper_cap)

In [None]:
# Horizontal boxplot of df_clean's house_value, for comparison with the earlier plot.
sns.boxplot(data=df_clean, x="house_value")

In [None]:
# NOTE(review): this cell re-defines remove_outliers_aggressive, which an
# earlier cell already defines; consider deleting this copy and keeping only
# the call below.
def remove_outliers_aggressive(df, n_std=1.5, min_group_size=4):
    """Replace outlying ``house_value`` entries with the mean of their group.

    Rows are grouped by (district, neighborhood). A neighborhood holding at
    least ``min_group_size`` rows is judged against its own mean and sample
    standard deviation; a smaller neighborhood falls back to the mean and
    standard deviation of its whole district. Values lying more than
    ``n_std`` standard deviations away from the reference mean are
    overwritten with that mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'district', 'neighborhood' and 'house_value'.
    n_std : float, default 1.5
        Half-width of the accepted band, in standard deviations.
    min_group_size : int, default 4
        Minimum number of rows for a neighborhood to use its own statistics.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the outliers replaced; ``df`` is not modified.
    """
    df_clean = df.copy()

    # District-level mean / std, used as the fallback for small neighborhoods.
    district_stats = df.groupby('district')['house_value'].agg(['mean', 'std'])

    for (district, _neighborhood), group in df.groupby(['district', 'neighborhood']):
        values = group['house_value']

        if len(group) >= min_group_size:
            # Enough rows: use the neighborhood's own statistics.
            center = values.mean()
            spread = values.std()
        else:
            # Too few rows: use the statistics of the whole district.
            center = district_stats.at[district, 'mean']
            spread = district_stats.at[district, 'std']

        lower_bound = center - n_std * spread
        upper_bound = center + n_std * spread

        # A NaN spread (e.g. a district with a single row) makes both
        # comparisons False, so nothing is replaced in that case.
        outliers = (values < lower_bound) | (values > upper_bound)

        if outliers.any():
            df_clean.loc[group.index[outliers], 'house_value'] = center

    return df_clean

# Second cleaning pass, applied to the frame that was already cleaned and clipped above.
df_clean2 = remove_outliers_aggressive(df_clean)

In [None]:
# Horizontal boxplot of house_value after the second cleaning pass.
# `data` now names the same frame the plotted column comes from (df_clean2);
# previously df_clean was passed alongside a df_clean2 series.
sns.boxplot(data=df_clean2, x="house_value")

In [None]:
# Clip the top 5% and the bottom 5% of house_value,
# using df_clean2's own 5th / 95th percentiles as the bounds.
house_value = df_clean2["house_value"]
lower_cap = house_value.quantile(0.05)
upper_cap = house_value.quantile(0.95)
df_clean2["house_value"] = house_value.clip(lower=lower_cap, upper=upper_cap)

In [None]:
# Horizontal boxplot of df_clean2's house_value after clipping.
sns.boxplot(data=df_clean2, x="house_value")

In [None]:
# Show the twice-cleaned frame as this cell's output.
df_clean2

In [None]:
# Invert house_value = room * m2 price / age to get a price per m2 back from
# the cleaned house_value (an age of 0 is again treated as 1).
age_factor = df_clean2["age"].replace(0, 1)
df_clean2["m2 price"] = df_clean2["house_value"].mul(age_factor).div(df_clean2["room"])
df_clean2

In [None]:
# Rebuild price from the cleaned price per m2.
# This must read df_clean2's "m2 price": the raw df["m2 price"] is price / area,
# so multiplying it by the area would only reproduce the original price and
# discard the outlier cleaning.
df_clean2["price"] = df_clean2["m2 price"] * df_clean2["area (m2)"]
df_clean2