In [1]:
import pandas as pd
import numpy as np
import os, re

In [2]:
data = pd.read_csv('./House_Rent_Dataset.csv')
data.head(5)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [3]:
data.groupby("BHK")\
    .agg(Count=("BHK", "count"))\
    .assign(Proportion=lambda x: x["Count"] / x["Count"].sum() * 100)\
    .sort_values(by=['Count'], ascending=False)\
    .reset_index()

Unnamed: 0,BHK,Count,Proportion
0,2,2265,47.724399
1,1,1167,24.589128
2,3,1098,23.135272
3,4,189,3.982301
4,5,19,0.400337
5,6,8,0.168563


In [4]:
data.groupby("BHK").describe()['Rent']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
BHK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1167.0,14139.22365,13514.982134,1500.0,6500.0,9000.0,17000.0,200000.0
2,2265.0,22113.864018,25803.382742,2000.0,10000.0,15000.0,22000.0,600000.0
3,1098.0,55863.062842,117555.074963,1200.0,20000.0,32000.0,65000.0,3500000.0
4,189.0,168864.555556,165788.401565,10000.0,60000.0,130000.0,250000.0,1200000.0
5,19.0,297500.0,207944.637397,22500.0,75000.0,310000.0,425000.0,650000.0
6,8.0,73125.0,86310.796378,20000.0,27500.0,47500.0,65000.0,280000.0


In [5]:
def categorize_floor(val):
    if "Upper Basement" in val:
        return "upper_basement"
    elif "Lower Basement" in val:
        return "lower_basement"
    elif "Ground" in val:
        return "ground"
    elif "out of" in val:
        return "normal"
    else:
        return val

def building_floor(val):
    nums = re.findall(r"\d+", val)
    if "Upper Basement" in val:
        return nums[0] if nums else np.nan
    elif "Lower Basement" in val:
        return nums[0] if nums else np.nan
    elif "Ground" in val:
        return nums[0] if nums else np.nan
    elif "out of" in val:
        return nums[1] if nums else np.nan
    else:
        return val

def flat_floor(val):
    nums = re.findall(r"\d+", val)
    if "Upper Basement" in val:
        return -1
    elif "Lower Basement" in val:
        return -1
    elif "Ground" in val:
        return 0
    elif "out of" in val:
        return int(nums[0]) if nums else np.nan
    else:
        return val

In [6]:
data['Categorized Floor'] = data["Floor"].apply(categorize_floor)
data['Building Floor'] = data["Floor"].apply(building_floor)
data['Flat Floor'] = data["Floor"].apply(flat_floor)

In [7]:
import re
from fuzzywuzzy import process

city_address_classes = [
    "metropol",
    "banliyo",
    "ghetto",
    "sanayi",
    "kirsal",
    "turistik",
    "unknown"
]

# Daha agresif keyword listesi (örnek kısaltılmış, istediğin kadar uzatabilirsin)
keyword_mapping = {
    "metropol": [
        "ballygunge","dum dum","jadavpur","behala","tollygunge","dhakuria","santoshpur",
        "howrah","ultadanga","mahim","parel","worli","prabhadevi","bandra","andheri",
        "juhu","powai","goregaon","koramangala","indiranagar","malleshwaram","egmore",
        "mylapore","tnagar","t nagar","velachery","adyar"
    ],
    "banliyo": [
        "rajarhat","new town","barasat","baruipur","sonarpur","narendrapur","joka","boral",
        "mira road","bhayandar","virar","vasai","nalasopara","borivali","malad","mulund",
        "whitefield","marathahalli","sarjapur","hsr layout","banashankari","btm","hebbal",
        "kukatpally","miyapur","uppal","nizampet","dwarka","rohini","mayur vihar","tambaram",
        "pallikaranai","medavakkam","sholinganallur","perungudi","siruseri","avadi"
    ],
    "sanayi": [
        "salt lake sector 5","sector v","electronic city","whitefield itpl","outer ring road","orr",
        "hitech city","gachibowli","madhapur","mindspace","sez","bypass road","gst road",
        "old mahabalipuram road","omr","sriperumbudur","bhel","industrial area","industrial estate"
    ],
    "turistik": [
        "marine lines","marine drive","worli sea face","dakshineswar","rabindra sarobar",
        "botanical garden","powai lake","juhu beach","marina beach","mahabalipuram","qutub minar"
    ],
    "kirsal": [
        "village","rural","gram","pally","countryside","farm","amtala","ponmar","mannivakkam","tiruvallur"
    ],
    "ghetto": [
        "slum","ghetto","chawl","basti"
    ]
}

def classify_address(address: str, threshold: int = 75) -> str:
    if not address or not isinstance(address, str):
        return "unknown"

    s = address.lower().strip()

    # 1) Keyword mapping
    for cls, kws in keyword_mapping.items():
        for kw in kws:
            if re.search(r"\b" + re.escape(kw) + r"\b", s):
                return cls

    # 2) Fuzzy fallback
    match, score = process.extractOne(address, city_address_classes)
    return match if score >= threshold else "unknown"


In [8]:
data["area_locality_class_last"] = data["Area Locality"].apply(classify_address)
data.head(5)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Categorized Floor,Building Floor,Flat Floor,area_locality_class_last
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,ground,2,0,unknown
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,normal,3,1,unknown
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,normal,3,1,unknown
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,normal,2,1,unknown
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,normal,2,1,metropol


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Posted On                 4746 non-null   object
 1   BHK                       4746 non-null   int64 
 2   Rent                      4746 non-null   int64 
 3   Size                      4746 non-null   int64 
 4   Floor                     4746 non-null   object
 5   Area Type                 4746 non-null   object
 6   Area Locality             4746 non-null   object
 7   City                      4746 non-null   object
 8   Furnishing Status         4746 non-null   object
 9   Tenant Preferred          4746 non-null   object
 10  Bathroom                  4746 non-null   int64 
 11  Point of Contact          4746 non-null   object
 12  Categorized Floor         4746 non-null   object
 13  Building Floor            4745 non-null   object
 14  Flat Floor              

In [15]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# =============================================================================
# 1. DATA TYPE CONVERSION METHODS
# =============================================================================

def convert_data_types(df):
    """
    Dataset için optimal data type dönüşümleri yapar
    """
    df_converted = df.copy()
    
    # Int64 değerleri kategorik yapalım (daha iyi resampling için)
    df_converted['BHK'] = df_converted['BHK'].astype('object')
    df_converted['Bathroom'] = df_converted['Bathroom'].astype('object')
    
    # Size'ı kategorilere bölelim
    df_converted['Size_Category'] = pd.cut(
        df_converted['Size'], 
        bins=5, 
        labels=['Very_Small', 'Small', 'Medium', 'Large', 'Very_Large']
    )
    
    # Rent'i kategorilere bölelim (target için stratified sampling)
    df_converted['Rent_Range'] = pd.qcut(
        df_converted['Rent'], 
        q=5, 
        labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High']
    )
    
    return df_converted

# =============================================================================
# 2. IMBALANCE ANALYSIS
# =============================================================================

def analyze_imbalance(df, categorical_columns=None):
    """
    Dataset'teki imbalance durumunu analiz eder
    """
    if categorical_columns is None:
        categorical_columns = df.select_dtypes(include=['object']).columns
    
    imbalance_info = {}
    
    print("=== IMBALANCE ANALYSIS ===\n")
    
    for col in categorical_columns:
        if col in df.columns:
            value_counts = df[col].value_counts()
            total_samples = len(df)
            
            # İmbalance ratio hesaplama
            max_class = value_counts.max()
            min_class = value_counts.min()
            imbalance_ratio = max_class / min_class if min_class > 0 else float('inf')
            
            print(f"{col}:")
            print(f"  Classes: {len(value_counts)}")
            print(f"  Most frequent: {value_counts.index[0]} ({max_class} samples)")
            print(f"  Least frequent: {value_counts.index[-1]} ({min_class} samples)")
            print(f"  Imbalance Ratio: {imbalance_ratio:.2f}")
            print(f"  Distribution: {dict(value_counts)}\n")
            
            imbalance_info[col] = {
                'classes': len(value_counts),
                'max_samples': max_class,
                'min_samples': min_class,
                'imbalance_ratio': imbalance_ratio,
                'distribution': dict(value_counts)
            }
    
    return imbalance_info

# =============================================================================
# 3. RESAMPLING METHODS
# =============================================================================

def random_resampling(df, target_column, strategy='balance', target_samples=None):
    """
    Random Over/Under Sampling
    strategy: 'balance', 'oversample', 'undersample'
    """
    balanced_dfs = []
    
    for class_value in df[target_column].unique():
        class_data = df[df[target_column] == class_value]
        current_samples = len(class_data)
        
        if target_samples is None:
            if strategy == 'balance':
                target_samples = int(df[target_column].value_counts().median())
            elif strategy == 'oversample':
                target_samples = df[target_column].value_counts().max()
            elif strategy == 'undersample':
                target_samples = df[target_column].value_counts().min()
        
        if current_samples < target_samples:
            # Oversample
            resampled = resample(
                class_data,
                n_samples=target_samples,
                replace=True,
                random_state=42
            )
        else:
            # Undersample
            resampled = resample(
                class_data,
                n_samples=target_samples,
                replace=False,
                random_state=42
            )
        
        balanced_dfs.append(resampled)
    
    return pd.concat(balanced_dfs, ignore_index=True)

def smote_for_regression(df, target_col='Rent', categorical_features=None):
    """
    Regression için SMOTE benzeri yaklaşım
    """
    from sklearn.neighbors import NearestNeighbors
    
    if categorical_features is None:
        categorical_features = df.select_dtypes(include=['object']).columns.tolist()
        categorical_features = [col for col in categorical_features if col != target_col]
    
    # Label encoding
    df_encoded = df.copy()
    encoders = {}
    
    for col in categorical_features:
        if col in df_encoded.columns:
            le = LabelEncoder()
            df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
            encoders[col] = le
    
    # Features ve target ayır
    X = df_encoded.drop(target_col, axis=1)
    y = df_encoded[target_col]
    
    # Rent kategorilerine göre SMOTE
    rent_categories = pd.qcut(y, q=5, labels=['VL', 'L', 'M', 'H', 'VH'])
    
    # Her kategori için oversampling
    balanced_data = []
    target_count = rent_categories.value_counts().max()
    
    for category in rent_categories.unique():
        if pd.isna(category):
            continue
            
        mask = rent_categories == category
        X_cat = X[mask]
        y_cat = y[mask]
        
        if len(X_cat) < target_count:
            # KNN ile sentetik örnekler
            knn = NearestNeighbors(n_neighbors=3)
            knn.fit(X_cat)
            
            # Eksik örnekler kadar sentetik oluştur
            n_synthetic = target_count - len(X_cat)
            
            for _ in range(n_synthetic):
                # Rastgele bir örnek seç
                idx = np.random.choice(len(X_cat))
                sample = X_cat.iloc[idx]
                
                # Komşuları bul
                neighbors = knn.kneighbors([sample], return_distance=False)[0]
                
                # Komşu seç ve interpolate et
                neighbor_idx = np.random.choice(neighbors[1:])  # İlkini atla (kendisi)
                neighbor = X_cat.iloc[neighbor_idx]
                
                # Sentetik örnek oluştur
                alpha = np.random.random()
                synthetic = sample * alpha + neighbor * (1 - alpha)
                
                # Y değeri de interpolate
                synthetic_y = y_cat.iloc[idx] * alpha + y_cat.iloc[neighbor_idx] * (1 - alpha)
                
                # Listeye ekle
                balanced_data.append(np.append(synthetic, synthetic_y))
        
        # Orijinal verileri ekle
        for i in range(len(X_cat)):
            balanced_data.append(np.append(X_cat.iloc[i], y_cat.iloc[i]))
    
    # DataFrame'e çevir
    columns = list(X.columns) + [target_col]
    balanced_df = pd.DataFrame(balanced_data, columns=columns)
    
    # Categorical verileri decode et
    for col, encoder in encoders.items():
        balanced_df[col] = encoder.inverse_transform(balanced_df[col].astype(int))
    
    return balanced_df

def multi_feature_resampling(df, features_config, random_state=42):
    """
    Birden fazla özellik için sıralı resampling
    
    features_config örnek:
    {
        'City': {'strategy': 'balance'},
        'BHK': {'target_samples': 500},
        'Area Type': {'strategy': 'undersample'}
    }
    """
    result_df = df.copy()
    
    for feature, config in features_config.items():
        if feature not in result_df.columns:
            continue
            
        print(f"\n=== {feature} Resampling ===")
        print(f"Before: {dict(result_df[feature].value_counts())}")
        
        strategy = config.get('strategy', 'balance')
        target_samples = config.get('target_samples', None)
        
        result_df = random_resampling(
            result_df, 
            feature, 
            strategy=strategy, 
            target_samples=target_samples
        )
        
        print(f"After: {dict(result_df[feature].value_counts())}")
        print(f"Total samples: {len(result_df)}")
    
    return result_df

def weighted_resampling(df, weight_column='City', target_col='Rent'):
    """
    Ağırlıklı resampling - önemli özelliklere daha fazla ağırlık verir
    """
    from sklearn.utils.class_weight import compute_sample_weight
    
    # Sample weights hesapla
    sample_weights = compute_sample_weight(
        class_weight='balanced',
        y=df[weight_column]
    )
    
    # Weighted sampling
    n_samples = len(df)
    sampled_indices = np.random.choice(
        df.index, 
        size=n_samples,
        p=sample_weights/sample_weights.sum(),
        replace=True
    )
    
    resampled_df = df.loc[sampled_indices].reset_index(drop=True)
    return resampled_df

# =============================================================================
# 4. COMPREHENSIVE RESAMPLING PIPELINE
# =============================================================================

def comprehensive_resampling_pipeline(df, target_col='Rent'):
    """
    Kapsamlı resampling pipeline
    """
    print("=== COMPREHENSIVE RESAMPLING PIPELINE ===\n")
    
    # 1. Data type conversion
    print("1. Converting data types...")
    df_converted = convert_data_types(df)
    
    # 2. Imbalance analysis
    print("\n2. Analyzing imbalance...")
    imbalance_info = analyze_imbalance(df_converted)
    
    # 3. Identify problematic features (imbalance ratio > 5)
    problematic_features = [
        feature for feature, info in imbalance_info.items() 
        if info['imbalance_ratio'] > 5
    ]
    
    print(f"\n3. Problematic features (imbalance ratio > 5): {problematic_features}")
    
    # 4. Configure resampling
    features_config = {}
    for feature in problematic_features[:5]:  # En fazla 5 özellik
        if feature != target_col:
            features_config[feature] = {'strategy': 'balance'}
    
    print(f"\n4. Resampling configuration: {features_config}")
    
    # 5. Apply resampling
    print("\n5. Applying resampling...")
    resampled_df = multi_feature_resampling(df_converted, features_config)
    
    # 6. Final analysis
    print("\n6. Final analysis...")
    final_imbalance = analyze_imbalance(resampled_df, features_config.keys())
    
    return resampled_df, final_imbalance

# =============================================================================
# 5. USAGE EXAMPLES
# =============================================================================

def run_resampling_examples(df):
    """
    Farklı resampling yöntemlerini test eder
    """
    print("=== RESAMPLING EXAMPLES ===\n")
    
    # Convert data types first
    df_converted = convert_data_types(df)
    
    # Example 1: Simple random resampling for City
    print("Example 1: Random resampling for City")
    city_balanced = random_resampling(df_converted, 'City', strategy='balance')
    print(f"Original City distribution: {dict(df_converted['City'].value_counts())}")
    print(f"Balanced City distribution: {dict(city_balanced['City'].value_counts())}\n")
    
    # Example 2: Multi-feature resampling
    print("Example 2: Multi-feature resampling")
    config = {
        'City': {'strategy': 'balance'},
        'BHK': {'target_samples': 400},
        'Area Type': {'strategy': 'undersample'}
    }
    multi_balanced = multi_feature_resampling(df_converted, config)
    print(f"Final dataset size: {len(multi_balanced)}\n")
    
    # Example 3: Comprehensive pipeline
    print("Example 3: Comprehensive pipeline")
    comprehensive_result, final_stats = comprehensive_resampling_pipeline(df)
    
    return {
        'city_balanced': city_balanced,
        'multi_balanced': multi_balanced,
        'comprehensive_result': comprehensive_result
    }

# =============================================================================
# KULLANIM
# =============================================================================

"""
# Ana kullanım:
df = pd.read_csv('your_house_rent_data.csv')

# 1. Hızlı başlangıç
results = run_resampling_examples(df)

# 2. Özelleştirilmiş resampling
df_converted = convert_data_types(df)
imbalance_info = analyze_imbalance(df_converted)

config = {
    'City': {'strategy': 'balance'},
    'BHK': {'target_samples': 500},
    'Furnishing Status': {'strategy': 'oversample'}
}

balanced_df = multi_feature_resampling(df_converted, config)

# 3. Kapsamlı pipeline
final_df, stats = comprehensive_resampling_pipeline(df)
"""

"\n# Ana kullanım:\ndf = pd.read_csv('your_house_rent_data.csv')\n\n# 1. Hızlı başlangıç\nresults = run_resampling_examples(df)\n\n# 2. Özelleştirilmiş resampling\ndf_converted = convert_data_types(df)\nimbalance_info = analyze_imbalance(df_converted)\n\nconfig = {\n    'City': {'strategy': 'balance'},\n    'BHK': {'target_samples': 500},\n    'Furnishing Status': {'strategy': 'oversample'}\n}\n\nbalanced_df = multi_feature_resampling(df_converted, config)\n\n# 3. Kapsamlı pipeline\nfinal_df, stats = comprehensive_resampling_pipeline(df)\n"

In [16]:
results = run_resampling_examples(df)


NameError: name 'df' is not defined