In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
from sklearn import set_config
set_config(display="diagram")

In [2]:
X = pd.read_csv('housing-classification-iter6.csv')
y = X.pop('Expensive')
X.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [28]:
X.shape

(1460, 80)

In [5]:
len(X.columns)

80

In [9]:
X_testing.head(3)

Unnamed: 0,Id,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,1461,11622,80.0,882.0,2,0,0,1.0,140,120,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,1462,14267,81.0,1329.0,3,0,0,1.0,393,0,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,1463,13830,74.0,928.0,3,1,0,2.0,212,0,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal


##### Data engineering to make sure the training data set and the prediction dataset have the same shape

In [4]:
X_testing = pd.read_csv('test.csv')
X_testing.shape

(1459, 81)

In [8]:
X_testing.head(3)

Unnamed: 0,Id,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,1461,11622,80.0,882.0,2,0,0,1.0,140,120,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,1462,14267,81.0,1329.0,3,0,0,1.0,393,0,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,1463,13830,74.0,928.0,3,1,0,2.0,212,0,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal


In [6]:
len(X_testing.columns)

80

In [12]:
#X_testing.columns

In [13]:
#X_testing['Id.1'] 

X_testing has 2 different columns with the same data, hence we will be dropping one so that it can have the same shape  with the training data set 

In [14]:
X_testing = X_testing.drop(columns=['Id.1'], axis=1)

In [15]:
len(X_testing.columns)

80

We will then make the column 'Id' the first columns for better clarity

In [39]:
first_column = X.pop('Id')

In [40]:
X.insert(0, 'Id', first_column)

In [41]:
X.head()

Unnamed: 0,Id,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,1,8450,65.0,856,3,0,0,2,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,2,9600,80.0,1262,3,1,0,2,298,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,3,11250,68.0,920,3,1,0,2,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,4,9550,60.0,756,3,1,0,3,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,5,14260,84.0,1145,4,1,0,3,192,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [16]:
# SPLIT DATA
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [17]:
X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
318,9900,90.0,1347,4,1,0,3,340,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
580,14585,,1144,3,2,0,2,216,0,RL,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal
961,12227,,1330,4,1,0,2,550,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
78,10778,72.0,1768,4,0,0,0,0,0,RL,...,,,,,Y,,,,WD,Normal
5,14115,85.0,796,1,0,0,2,40,0,RL,...,Attchd,Unf,TA,TA,Y,,MnPrv,Shed,WD,Normal


In [45]:
#y_train

##### PREPROCESSING PIPELINE

In [18]:
# BUILD PIPELINE
X_cat = X.select_dtypes(exclude='number').copy()
X_num = X.select_dtypes(include='number').copy()

numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean'))

categoric_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N_A'),
                              OneHotEncoder(handle_unknown='ignore')
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipe', numeric_pipe, X_num.columns),
        ('cat_pipe', categoric_pipe, X_cat.columns),
    ]
 )                          

In [19]:
categoric_pipe


##### USING DECISION TREE CLASSIFIER

In [21]:
from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor, 
                              DecisionTreeClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2),
    "decisiontreeclassifier__min_samples_split": range(2, 10, 2)
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)
 
scores = {"dtree" : search.best_score_}

scores

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


{'dtree': 0.9297898096181358}

In [22]:
full_pipeline

In [26]:
# predicting using a new file
X_testing_array = search.predict(X_testing) 
X_testing_array

array([0, 0, 0, ..., 1, 0, 0])

In [27]:
# converting X_testing array into dataframe
X_testing_df = pd.DataFrame(X_testing_array, columns=['Expensive'])
X_testing_df.head(3) 

Unnamed: 0,Expensive
0,0
1,0
2,0


We will use the Id column the index column in the new dataframe by first extracting the index column from the initial dataframe and then inserting it into the new dataframe

In [32]:
extracted_col = X_testing['Id'] # extracting the Id column from the new file
X_testing_df.insert(0,'Id', extracted_col)#inserting the column in the new df

X_testing_df = X_testing_df.set_index('Id') #setting Id as the new index
X_testing_df.head(3)

Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
1461,0
1462,0
1463,0


In [27]:
#converting it to a new csv file 
X_testing_df.to_csv('X_testing_df.csv') 

##### USING KNN

In [34]:
# DEFINE MODELS AND PARAMETERS
from sklearn.neighbors import KNeighborsClassifier

knn_full_pipeline = make_pipeline(preprocessor, KNeighborsClassifier())

from sklearn.model_selection import GridSearchCV

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(2, 20),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

knn_search = GridSearchCV(knn_full_pipeline,
                          param_grid,
                          cv=8,
                          verbose=1)
knn_search.fit(X_train, y_train)


scores = {"dtree" : search.best_score_}
scores["knn"] = knn_search.best_score_

scores


Fitting 8 folds for each of 72 candidates, totalling 576 fits


{'dtree': 0.9297898096181358, 'knn': 0.9143835616438356}

In [35]:
# predicting using KNN search in a new file
X_testing_Knn_array = knn_search.predict(X_testing) 
X_testing_knn_df = pd.DataFrame(X_testing_Knn_array, columns=['Expensive'])
X_testing_knn_df.head(3) 

Unnamed: 0,Expensive
0,0
1,0
2,0


In [36]:
#inserting the column in the new df
X_testing_knn_df.insert(0,'Id', extracted_col)

X_testing_knn_df = X_testing_knn_df.set_index('Id')#setting Id as the new index
X_testing_knn_df.head(3)

Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
1461,0
1462,0
1463,0


In [32]:
#converting it to a new csv file 
X_testing_knn_df.to_csv('X_testing_knn_df.csv')

##### USING LOGIT REGRESSION

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
import warnings
warnings.filterwarnings('ignore')

# define models and parameters
logit_full_pipeline = make_pipeline(preprocessor, 
                                    LogisticRegression()
                                    )
from sklearn.model_selection import GridSearchCV

param_grid = {"columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
              'logisticregression__penalty':['l1', 'l2', 'elasticnet', 'none'],
              'logisticregression__C': [1.0]}

logit_search = GridSearchCV(logit_full_pipeline,
                            param_grid,
                            cv=8,
                            verbose=1) 
logit_search.fit(X_train, y_train)

scores["logit"] = logit_search.best_score_

scores

Fitting 8 folds for each of 8 candidates, totalling 64 fits


{'dtree': 0.9297898096181358,
 'knn': 0.9143835616438356,
 'logit': 0.9212328767123288}

In [38]:
logit_full_pipeline

In [34]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [40]:
# predicting using logit search in a new file
X_testing_logit_array = logit_search.predict(X_testing) 
X_testing_logit_df = pd.DataFrame(X_testing_logit_array, columns=['Expensive'])
X_testing_logit_df.head(3)


Unnamed: 0,Expensive
0,0
1,0
2,0


In [41]:
#inserting the column in the new df
X_testing_logit_df.insert(0,'Id', extracted_col)

#setting Id as the new index
X_testing_logit_df = X_testing_logit_df.set_index('Id')
X_testing_logit_df.head(3)


Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
1461,0
1462,0
1463,0


In [42]:
#converting it to a new csv file 
X_testing_logit_df.to_csv('X_testing_logit_df.csv')

#### USING RANDOM FOREST

In [43]:
# using random forest
from sklearn.ensemble import RandomForestClassifier


#define models and parameters
randomforest_full_pipeline = make_pipeline(preprocessor,
                                  # StandardScaler(),
                                  RandomForestClassifier()
                                 )
from sklearn.model_selection import GridSearchCV


param_grid = {"columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
              'randomforestclassifier__max_depth':range(2, 10),
              'randomforestclassifier__criterion':['gini', 'entropy', 'log_loss'],
              'randomforestclassifier__min_samples_split':[5]
}
randomforest_search = GridSearchCV(randomforest_full_pipeline, 
                                   param_grid,
                                   cv=8,
                                   verbose=1)
randomforest_search.fit(X_train, y_train)
scores['randomforest'] = randomforest_search.best_score_

scores

Fitting 8 folds for each of 48 candidates, totalling 384 fits


{'dtree': 0.9297898096181358,
 'knn': 0.9143835616438356,
 'logit': 0.9212328767123288,
 'randomforest': 0.946917808219178}

In [49]:
randomforest_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
 'randomforestclassifier__criterion': 'gini',
 'randomforestclassifier__max_depth': 9,
 'randomforestclassifier__min_samples_split': 5}

In [44]:
randomforest_full_pipeline


In [45]:
# predicting using random forest search in a new file
X_testing_random_forest_array = randomforest_search.predict(X_testing) 
X_testing_random_forest_df = pd.DataFrame(X_testing_random_forest_array, columns=['Expensive'])
X_testing_random_forest_df.head(3)


Unnamed: 0,Expensive
0,0
1,0
2,0


In [46]:
#inserting the column in the new df
X_testing_random_forest_df.insert(0,'Id', extracted_col)

#setting Id as the new index
X_testing_random_forest_df = X_testing_random_forest_df.set_index('Id')
X_testing_random_forest_df.head(3)

Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
1461,0
1462,0
1463,0


In [48]:
#converting it to a new csv file 
X_testing_random_forest_df.to_csv('X_testing_random_forest_df.csv')

In [51]:
# using random forest
from sklearn.ensemble import RandomForestClassifier


#define models and parameters
randomforest_full_pipeline_2 = make_pipeline(preprocessor,
                                  # StandardScaler(),
                                  RandomForestClassifier()
                                 )
from sklearn.model_selection import GridSearchCV


param_grid = {"columntransformer__num_pipe__simpleimputer__strategy":["mean"],
              'randomforestclassifier__max_depth': [9],
              'randomforestclassifier__criterion':['gini'],
              'randomforestclassifier__min_samples_split':[5],
              'randomforestclassifier__min_samples_leaf' : [5]
                                               
                                                    
}

randomforest_search_2 = GridSearchCV(randomforest_full_pipeline_2, 
                                   param_grid,
                                   cv=10,
                                   verbose=1)
randomforest_search_2.fit(X_train, y_train)
scores['randomforest'] = randomforest_search_2.best_score_

scores

Fitting 10 folds for each of 1 candidates, totalling 10 fits


{'dtree': 0.9297898096181358,
 'knn': 0.9143835616438356,
 'logit': 0.9212328767123288,
 'randomforest': 0.9383289124668435}

In [52]:
# predicting using logit search in a new file
X_testing_random_forest2_array = randomforest_search.predict(X_testing) 
X_testing_random_forest2_df = pd.DataFrame(X_testing_random_forest2_array, columns=['Expensive'])
X_testing_random_forest2_df.head(3)


Unnamed: 0,Expensive
0,0
1,0
2,0


In [53]:
#inserting the column in the new df
X_testing_random_forest2_df.insert(0,'Id', extracted_col)

#setting Id as the new index
X_testing_random_forest2_df = X_testing_random_forest2_df.set_index('Id')
X_testing_random_forest2_df.head(3)

Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
1461,0
1462,0
1463,0


In [46]:
#converting it to a new csv file 
X_testing_random_forest2_df.to_csv('X_testing_random_forest2_df.csv')

##### USING SVC

In [54]:
#using support vetor machine
from sklearn.svm import SVC


#define models and parameters
SVC_full_pipeline = make_pipeline(preprocessor,
                                  # StandardScaler(),
                                  SVC()
                                 )
from sklearn.model_selection import GridSearchCV


param_grid = {"columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
              'svc__kernel':['rbf'],
              'svc__C':[1.0],
              'svc__gamma':['scale', 'auto']
}
SVC_search = GridSearchCV(SVC_full_pipeline, 
                                   param_grid,
                                   cv=8,
                                   verbose=1)
SVC_search.fit(X_train, y_train)
scores['SVC'] = SVC_search.best_score_

scores

Fitting 8 folds for each of 4 candidates, totalling 32 fits


{'dtree': 0.9297898096181358,
 'knn': 0.9143835616438356,
 'logit': 0.9212328767123288,
 'randomforest': 0.9383289124668435,
 'SVC': 0.8570205479452055}

In [55]:
# predicting using logit search in a new file
X_testing_SVC_array = SVC_search.predict(X_testing) 
X_testing_SVC_df = pd.DataFrame(X_testing_SVC_array, columns=['Expensive'])
X_testing_SVC_df.head(3)

Unnamed: 0,Expensive
0,0
1,0
2,0


In [56]:
#inserting the column in the new df
X_testing_SVC_df.insert(0,'Id', extracted_col)

#setting Id as the new index
X_testing_SVC_df = X_testing_SVC_df.set_index('Id')
X_testing_SVC_df.head(3)

Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
1461,0
1462,0
1463,0


In [57]:
#converting it to a new csv file 
X_testing_SVC_df.to_csv('X_testing_SVC_df.csv')