In [7]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# --- Load data ---
train = pd.read_csv(r'E:/IITK/P5 data/Property_train.csv')
test = pd.read_csv(r'E:/IITK/P5 data/Property_test_share.csv')

# Display column names
print("Train columns:")
print(train.columns.tolist())

print("\nTest columns:")
print(test.columns.tolist())

# Optionally check shapes
print(f"\nTrain shape: {train.shape}")
print(f"Test shape: {test.shape}")


Train columns:
['Junk', 'InteriorsStyle', 'PriceIndex8', 'ListDate', 'Material', 'PriceIndex9', 'Agency', 'AreaIncomeType', 'EnvRating', 'PriceIndex7', 'ExpeditedListing', 'PriceIndex4', 'PriceIndex1', 'PriceIndex6', 'PRIMEUNIT', 'Channel', 'Zip', 'InsurancePremiumIndex', 'PlotType', 'Architecture', 'PriceIndex3', 'Region', 'PriceIndex5', 'SubModel', 'Facade', 'State', 'NormalisedPopulation', 'BuildYear', 'RegionType', 'PropertyAge', 'PriceIndex2']

Test columns:
['InteriorsStyle', 'PriceIndex8', 'ListDate', 'Material', 'PriceIndex9', 'Agency', 'AreaIncomeType', 'EnvRating', 'PriceIndex7', 'ExpeditedListing', 'PriceIndex4', 'PriceIndex1', 'PriceIndex6', 'PRIMEUNIT', 'Channel', 'Zip', 'InsurancePremiumIndex', 'PlotType', 'Architecture', 'PriceIndex3', 'Region', 'PriceIndex5', 'SubModel', 'Facade', 'State', 'NormalisedPopulation', 'BuildYear', 'RegionType', 'PropertyAge', 'PriceIndex2']

Train shape: (62035, 31)
Test shape: (10948, 30)


In [15]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --- ------------------------------------ 0.1/1.5 MB 1.8 MB/s eta 0:00:01
   ----------------------- ---------------- 0.8/1.5 MB 7.5 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 10.2 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [25]:
import lightgbm as lgb
# --- Save categorical columns ---
cat_cols = train.select_dtypes(['object']).columns.tolist()

# --- Mark and combine ---
train['data'] = 'train'
test['data'] = 'test'
test['Junk'] = np.nan

all_data = pd.concat([train, test], axis=0, sort=False)

# --- Split back ---
x_train = all_data[all_data['data'] == 'train'].drop(['Junk', 'data'], axis=1)
y_train = all_data[all_data['data'] == 'train']['Junk']
x_test = all_data[all_data['data'] == 'test'].drop(['Junk', 'data'], axis=1)

# --- Convert object columns to category ---
for col in cat_cols:
    if col in x_train.columns:
        x_train[col] = x_train[col].astype('category')
        x_test[col] = x_test[col].astype('category')

# --- Validation split ---
X_tr, X_val, y_tr, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# --- LightGBM dataset ---
lgb_train = lgb.Dataset(X_tr, y_tr)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

# --- LightGBM parameters ---
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# --- Train model ---
print("\nTraining LightGBM...")
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# --- Validate ---
val_pred = model.predict(X_val, num_iteration=model.best_iteration)
val_auc = roc_auc_score(y_val, val_pred)
print(f"\nValidation ROC-AUC: {val_auc:.4f}")

# --- Train final model on full data ---
full_train = lgb.Dataset(x_train, y_train)
final_model = lgb.train(
    params,
    full_train,
    num_boost_round=model.best_iteration
)

# --- Predict on test ---
test_pred = final_model.predict(x_test, num_iteration=final_model.best_iteration)

# --- Create synthetic Id ---
test_id = np.arange(len(test))

# --- Save submission ---
submission = pd.DataFrame({
    'Id': test_id,
    'Junk': test_pred
})


Training LightGBM...
[LightGBM] [Info] Number of positive: 6082, number of negative: 43546
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005612 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 54766
[LightGBM] [Info] Number of data points in the train set: 49628, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122552 -> initscore=-1.968484
[LightGBM] [Info] Start training from score -1.968484
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.888638	valid_1's auc: 0.755756
Early stopping, best iteration is:
[75]	training's auc: 0.876997	valid_1's auc: 0.756431

Validation ROC-AUC: 0.7564
[LightGBM] [Info] Number of positive: 7602, number of negative: 54433
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

In [27]:
submission.to_csv(r'E:/IITK/P5 data/submissio.csv', index=False)
print("Saved:submission.csv")

In [31]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

# --- Load data ---
train = pd.read_csv(r'E:/IITK/P5 data/Property_train.csv')
test = pd.read_csv(r'E:/IITK/P5 data/Property_test_share.csv')
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# --- Save categorical columns ---
cat_cols = train.select_dtypes(['object']).columns.tolist()

# --- Mark and combine ---
train['data'] = 'train'
test['data'] = 'test'
test['Junk'] = np.nan

all_data = pd.concat([train, test], axis=0, sort=False)

# --- Group rare categories ---
for col in cat_cols:
    freq = all_data[col].value_counts()
    rare = freq[freq < 50].index
    all_data[col] = all_data[col].replace(rare, 'Rare')

# --- Convert object columns to category ---
for col in cat_cols:
    all_data[col] = all_data[col].astype('category')

# --- Split back ---
x_train = all_data[all_data['data'] == 'train'].drop(['Junk', 'data'], axis=1)
y_train = all_data[all_data['data'] == 'train']['Junk']
x_test = all_data[all_data['data'] == 'test'].drop(['Junk', 'data'], axis=1)

# --- Validation split ---
X_tr, X_val, y_tr, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# --- LightGBM dataset ---
lgb_train = lgb.Dataset(X_tr, y_tr)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

# --- LightGBM parameters ---
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_data_in_leaf': 20,
    'min_gain_to_split': 0.01,
    'max_bin': 512,
    'random_state': 42
}

# --- Train ---
print("\nTraining LightGBM...")
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# --- Validate ---
val_pred = model.predict(X_val, num_iteration=model.best_iteration)
val_auc = roc_auc_score(y_val, val_pred)
print(f"\nValidation ROC-AUC: {val_auc:.4f}")

# --- Final model on full data ---
full_train = lgb.Dataset(x_train, y_train)
final_model = lgb.train(
    params,
    full_train,
    num_boost_round=model.best_iteration
)

# --- Predict on test ---
test_pred = final_model.predict(x_test, num_iteration=final_model.best_iteration)

# --- Create submission: only Junk column, order matches test data ---
submission = pd.DataFrame({
    'Junk': test_pred
})

submission.to_csv(r'E:/IITK/P5 data/submission2.csv', index=False)
print("Saved:submission.csv")

Train shape: (62035, 31)
Test shape: (10948, 30)

Training LightGBM...
[LightGBM] [Info] Number of positive: 6082, number of negative: 43546
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2780
[LightGBM] [Info] Number of data points in the train set: 49628, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122552 -> initscore=-1.968484
[LightGBM] [Info] Start training from score -1.968484
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.885809	valid_1's auc: 0.760611
Early stopping, best iteration is:
[51]	training's auc: 0.85191	valid_1's auc: 0.763372

Validation ROC-AUC: 0.7634
[LightGBM] [Info] Number of positive: 7602, number of negative: 54433
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead

In [33]:
import pandas as pd

# Load data
train =  pd.read_csv(r'E:/IITK/P5 data/Property_train.csv')

# Compute missing percentage
missing_count = train['EnvRating'].isnull().sum()
total_count = len(train)

missing_percentage = (missing_count / total_count) * 100
print(round(missing_percentage))


0


In [35]:
import pandas as pd

# Load data
train =  pd.read_csv(r'E:/IITK/P5 data/Property_train.csv')
# Convert ListDate to datetime
train['ListDate'] = pd.to_datetime(train['ListDate'], errors='coerce')

# Extract month and compute frequency
month_counts = train['ListDate'].dt.month.value_counts()

# Get the month with the highest count
highest_month = month_counts.idxmax()
print(highest_month)

10


In [37]:
# Filter for AreaIncomeType 'A' and count Agency occurrences
agency_counts = train[train['AreaIncomeType'] == 'A']['Agency'].value_counts()

# Get the agency with the highest count
top_agency = agency_counts.idxmax()
print(top_agency)

CAT1


In [39]:
# Group by State and compute median PropertyAge
median_age_by_state = train.groupby('State')['PropertyAge'].median()

# Get the State with the highest median PropertyAge
top_state = median_age_by_state.idxmax()
print(top_state)

AR


In [43]:
# Convert PriceIndex1 to numeric, forcing errors to NaN
train['PriceIndex1'] = pd.to_numeric(train['PriceIndex1'], errors='coerce')

# Compute variance and round
variance = round(train['PriceIndex1'].var())
print(variance)

6066959


In [45]:
# Convert NormalisedPopulation to numeric if needed
train['NormalisedPopulation'] = pd.to_numeric(train['NormalisedPopulation'], errors='coerce')

# Compute mean for Architecture 'YIK5'
avg_pop = round(train.loc[train['Architecture'] == 'YIK5', 'NormalisedPopulation'].mean())
print(avg_pop)

73381


In [47]:
# Convert PriceIndex8 to numeric if necessary
train['PriceIndex8'] = pd.to_numeric(train['PriceIndex8'], errors='coerce')

# Calculate range = max - min
price_range = train['PriceIndex8'].max() - train['PriceIndex8'].min()
print(price_range)

41062.0
