In [71]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
# tryout models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Lift pandas' display limits so printed frames show every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
  pd.set_option(display_option, None)

# Load the raw training and test tables.
train_data = pd.read_csv("/content/sample_data/train.csv")
test_data = pd.read_csv("/content/sample_data/test.csv")

# Remove these five columns from the training frame in one call.
# NOTE(review): presumably dropped because they are mostly empty - confirm.
columns_to_drop = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
train_data.drop(columns=columns_to_drop, inplace=True)

# One encoder object is reused for every categorical column: it is refit on
# each column in turn, so after the loop it only remembers the mapping of the
# last column processed.
label_encoder = LabelEncoder()

# Text-valued columns that are replaced by integer codes.
catg_cols_to_transform = [
  'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
  'GarageType', 'Utilities', 'GarageFinish', 'GarageQual', 'GarageCond',
  'SaleType', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig',
  'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
  'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
  'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'Heating',
  'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
  'PavedDrive', 'SaleCondition',
]

for catg in catg_cols_to_transform:
  # fit() returns the encoder itself, so tr_label is the freshly fitted
  # label_encoder for this column.
  tr_label = label_encoder.fit(train_data[catg])
  train_data[catg] = tr_label.transform(train_data[catg])

# Fill the gaps in three numeric training columns with each column's median.
#
# The filled column is assigned back to the frame instead of calling
# `train_data[col].fillna(..., inplace=True)`: that form mutates an
# intermediate Series, which pandas warns about (chained assignment) and which
# leaves the frame unchanged once Copy-on-Write is active.
median_value = train_data['LotFrontage'].median()
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(median_value)

median_year = train_data['GarageYrBlt'].median()
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(median_year)

# Despite its name, this holds the median of 'MasVnrArea' (the numeric area
# column), not anything derived from 'MasVnrType'; the name is kept as-is.
median_MasVnrType = train_data['MasVnrArea'].median()
train_data['MasVnrArea'] = train_data['MasVnrArea'].fillna(median_MasVnrType)

# Separate the target from the features: pop() removes 'SalePrice' from the
# frame and returns it, so every remaining column of train_data is a feature.
y = train_data.pop('SalePrice')
X = train_data

#print(train_data.isnull().sum())
# print(train_data['LotFrontage'].head(20))
# print(train_data['GarageYrBlt'].head(20))

# prediction
# Candidate regressors to compare; random_state is pinned on both.
models = [
  RandomForestRegressor(random_state=3),
  GradientBoostingRegressor(random_state=3, n_estimators=150),
]


def compare_model_cross_validation():
  """Print 5-fold cross-validation results for every estimator in `models`.

  Reads the module-level `models`, `X` and `y`. For each estimator it prints
  the per-fold scores, then their mean expressed as a percentage rounded to
  two decimals, then a separator line. Returns nothing.

  No `scoring` argument is passed, so cross_val_score uses each estimator's
  default scorer; for these regressors that is R^2, which is what the printed
  "accuracy" actually is.
  """
  for estimator in models:
    fold_scores = cross_val_score(estimator, X, y, cv=5)
    # Mean of the fold scores, as a percentage with two decimals.
    mean_pct = round(sum(fold_scores) / len(fold_scores) * 100, 2)
    print('Cross Validation accuracy for ', estimator, '= ', fold_scores)
    print('Accuracy % of the ', estimator, mean_pct)
    print('------------------------------------------------')


compare_model_cross_validation()

# Hold out a validation subset (train_test_split's default proportions), fit
# the gradient-boosting model on the remainder and report the mean absolute
# error on the held-out rows.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=3)

# fit() returns the estimator, so construction and training can be chained.
grad_model = GradientBoostingRegressor(
  n_estimators=150,
  random_state=3,
).fit(train_X, train_y)

grad_predict = grad_model.predict(val_X)
print(mean_absolute_error(val_y, grad_predict))


Cross Validation accuracy for  RandomForestRegressor(random_state=3) =  [0.87427024 0.84112851 0.88041448 0.88707394 0.83799396]
Accuracy % of the  RandomForestRegressor(random_state=3) 86.42
------------------------------------------------
Cross Validation accuracy for  GradientBoostingRegressor(n_estimators=150, random_state=3) =  [0.90572222 0.85567915 0.89814529 0.90913897 0.90470653]
Accuracy % of the  GradientBoostingRegressor(n_estimators=150, random_state=3) 89.47
------------------------------------------------
15058.907677415978


In [72]:
# Load the test table and create the encoder used for its categorical columns.
test_data = pd.read_csv("/content/sample_data/test.csv")
label_encode = LabelEncoder()

# Remove the same five columns that were removed from the training frame, so
# both frames keep the same feature columns.
test_col_to_drop = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
test_data.drop(columns=test_col_to_drop, inplace=True)

test_catg_cols_to_transform = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'Utilities',
                          'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType', 'MSZoning', 'Street', 'LotShape',
                          'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
                          'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
                          'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
                          'Functional', 'PavedDrive', 'SaleCondition']

# grad_model was trained on integer codes obtained by fitting a LabelEncoder on
# each raw *training* column. Fitting a fresh encoder on the test column would
# number the test set's own labels instead, so the same category can receive a
# different integer whenever the two files do not contain exactly the same set
# of labels. To keep the codes aligned with what the model learned, the
# mapping is re-derived from the raw training file and applied to the test
# rows. The training encoders are not kept by the training cell, hence the
# re-read of the CSV here.
raw_train_catg = pd.read_csv("/content/sample_data/train.csv",
                             usecols=test_catg_cols_to_transform)

for catg in test_catg_cols_to_transform:
  train_col = raw_train_catg[catg]
  label_encode.fit(train_col)
  train_classes = list(label_encode.classes_)

  # label -> integer code, exactly as the encoder numbered the training labels.
  # Missing values are handled separately because NaN cannot be looked up
  # reliably as a dict key.
  code_of = {label: code for code, label in enumerate(train_classes)
             if not pd.isna(label)}

  # Labels that never occurred in training have no code; use the code of the
  # most frequent training category for them rather than raising.
  fallback_code = code_of[train_col.mode().iloc[0]]

  # A missing test value gets the code training assigned to missing values,
  # or the fallback when the training column had none.
  nan_code = next((code for code, label in enumerate(train_classes)
                   if pd.isna(label)), fallback_code)

  raw_col = test_data[catg]
  codes = raw_col.map(code_of)                    # unseen / missing -> NaN
  codes = codes.where(raw_col.notna(), nan_code)  # missing -> training NaN code
  test_data[catg] = codes.fillna(fallback_code).astype(int)

# print(test_data.head())

# Impute the numeric gaps in the test frame:
#   * columns in `fewer_missing_val_col` are filled by linear interpolation
#     between neighbouring rows;
#   * columns in `missing_val_col` are filled with the column's median.
#
# Each result is assigned back to the frame. The previous
# `test_data[col].<method>(..., inplace=True)` form mutates an intermediate
# Series, which pandas warns about (chained assignment) and which leaves the
# frame unchanged once Copy-on-Write is active - NaNs would then reach
# `predict`.
#
# NOTE(review): the medians here come from the test set, whereas the model was
# trained on data filled with training-set medians - confirm this is intended.
missing_val_col = ['LotFrontage','GarageYrBlt','MasVnrArea']
fewer_missing_val_col = ['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath',
                        'BsmtHalfBath','GarageCars','GarageArea']

for val in fewer_missing_val_col:
  test_data[val] = test_data[val].interpolate(method='linear')

for val in missing_val_col:
  median_value = test_data[val].median()
  test_data[val] = test_data[val].fillna(median_value)

# Score the prepared test rows with the gradient-boosting model fitted earlier.
predict = grad_model.predict(test_data)

# Two-column submission table: the row's Id next to its predicted SalePrice.
submission_df = test_data[['Id']].assign(SalePrice=predict)

# Write without the index so the file contains only the two columns.
submission_df.to_csv('submission2.csv', index=False)
