In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import random
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.datasets import load_diabetes
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from evolutionary_forest.forest import EvolutionaryForestRegressor
from evolutionary_forest.utils import get_feature_importance, plot_feature_importance, feature_append

In [2]:
pd.set_option('display.max_rows', 10)         # Default: 60
pd.set_option('display.max_columns', 20)      # Default: 20
pd.set_option('display.max_colwidth', 50)     # Default: 50
# Check current settings
print(pd.get_option('display.max_rows'))
print(pd.get_option('display.max_columns'))
print(pd.get_option('display.max_colwidth'))

10
20
50


In [3]:
df = pd.read_csv("D:\\Ameri\\Kaggle\\HousingPricesCompetition\\Judging\\train.csv")
test_df = pd.read_csv("D:\\Ameri\\Kaggle\\HousingPricesCompetition\\Judging\\test.csv")

In [4]:
df.info()
row, col = df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
for name in df.columns:
    if (not name in df.columns): continue
    if (not name in test_df.columns): continue

    if (df[name].isnull().sum() >= row / 2 or test_df[name].isnull().sum() >= row / 2):
        print(name)
        df = df.drop([name], axis = 1)
        test_df = test_df.drop([name], axis = 1)
        continue

    if (isinstance(df[name][0], str)):
        df[name] = df[name].fillna(df[name].mode()[0])
    else: 
        df[name] = df[name].fillna(df[name].mean())

    if (isinstance(test_df[name][0], str)):
        test_df[name] = test_df[name].fillna(test_df[name].mode()[0])
    else: 
        test_df[name] = test_df[name].fillna(test_df[name].mean())

Alley
MasVnrType
FireplaceQu
PoolQC
Fence
MiscFeature


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [7]:
def category_onehot_multcols():
    df_final=final_df
    i=0
    for fields in df.columns:
        if (not isinstance(df[fields][0], str)): continue
        
        df1=pd.get_dummies(final_df[fields],drop_first=True)
        
        final_df.drop([fields],axis=1,inplace=True)
        if i==0:
            df_final=df1.copy()
        else:
            
            df_final=pd.concat([df_final,df1],axis=1)
        i=i+1
       
        
    df_final=pd.concat([final_df,df_final],axis=1)
        
    return df_final

In [8]:
final_df=pd.concat([df,test_df],axis=0)
final_df=category_onehot_multcols()

In [9]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

# Suppress specific warnings
# warnings.filterwarnings('ignore', category=UserWarning)

In [10]:
final_df =final_df.loc[:,~final_df.columns.duplicated()]
df_train=final_df.iloc[:1460,:]
df_test=final_df.iloc[1460:,:]

df_test.drop(['SalePrice'], axis = 1, inplace = True)
X_train = df_train.drop(['SalePrice'], axis = 1)
y_train = df_train['SalePrice']

In [11]:
final_df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,ConLI,ConLw,New,Oth,WD,AdjLand,Alloca,Family,Normal,Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706.0,...,False,False,False,False,True,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978.0,...,False,False,False,False,True,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486.0,...,False,False,False,False,True,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216.0,...,False,False,False,False,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655.0,...,False,False,False,False,True,False,False,False,True,False


In [12]:
result = []
for i in range(df_test.shape[0]): result.append(1000000000000)

In [None]:
model = EvolutionaryForestRegressor(normalize=True, select='AutomaticLexicase',
                                gene_num=60, boost_size=100, n_gen=300, n_pop=200, cross_pb=1,
                                base_learner='Random-DT', verbose=False, n_process=1)
model.fit(X_train, y_train)
    
for i in range(1):
    for j in range(df_test.shape[0]):
        row = df_test.iloc[j].tolist()
        result[j] = min(result[j], model.predict(np.array(row).reshape(1, -1))[0])

print(result[0])

output = []
output.append(['Id', 'SalePrice'])
for i in range(df_test.shape[0]):
    value = [str(df_test['Id'][i]), str(result[i])]
    output.append(value)

filename = 'D:\\Ameri\\Kaggle\\HousingPricesCompetition\\submission.csv'
with open(filename, 'w', newline = '') as file:
    writer = csv.writer(file)
    writer.writerows(output)