In [180]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

pd.set_option('display.max_columns', None)
pd.set_option(display.max_rows', 80)

In [181]:
def gd_path(file_id):
    """Generate a shareable link from Google Drive file id."""
    return f"https://drive.google.com/uc?export=download&id={file_id}"

files_id = {
    'housing_data':"1VEpP7kLJjlLR9MyTgu2FyFnOJArh6U2b", #iteration6 data
    'test_housing_data':"1CMsAWhWKWBjI6DDEHtcYmRVZRazfE9bo", #test data for housing
    'ids_com':"10gwiL49calkj-xbx-3rQEK4H2zoJcU11" #ID for the commiting of the project

}


housing_data = pd.read_csv(gd_path(files_id['housing_data']), sep=",")
test_housing_data = pd.read_csv(gd_path(files_id['test_housing_data']), sep=",")
ids_com = pd.read_csv(gd_path(files_id['ids_com']), sep=",")
df = housing_data

# Pre-Processing Pipe

## Ordinary columns

This part was done by hands. Many Bothans died to bring us this information.

In [182]:
from sklearn.model_selection import train_test_split

# X and y creation
y = df.pop("Expensive")

# Feature Engineering
X = df

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [191]:
# building the pipeline
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

#  numerical pipeline
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))


# categorical pipeline
# defining ordinal & onehot columns

ordinal_cols = X_cat.columns.get_indexer(['LandContour', 'LandSlope',
                     'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                     'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                     'HeatingQC', 'KitchenQual', 'Functional',
                     'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'PoolQC']) 

onehot_cols = X_cat.columns.get_indexer(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir', 
                                         'Foundation', 'Alley', 'LotShape', 'Utilities', 'LotConfig', 
                                         'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 
                                         'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                                         'Electrical', 'GarageType', 'PavedDrive', 'Fence', 'MiscFeature',
                                         'SaleType', 'SaleCondition'])



# defining the categorical encoder
# we manually establish the order of the categories for our ordinal feature (Cabin), including "N_A"
LandContour = ['Lvl', 'Bnk', 'HLS', 'Low']
LandSlope = ['Gtl', 'Mod', 'Sev']
ExterQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
ExterCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
BsmtQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
BsmtCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
BsmtExposure = ['Gd', 'Av', 'Mn', 'No', 'N_A']
BsmtFinType1 = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A']
BsmtFinType2 = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A']
HeatingQC = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
KitchenQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
Functional = ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']
FireplaceQu = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
GarageFinish = ['Fin', 'RFn', 'Unf', 'N_A']
GarageQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
GarageCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
PoolQC = ['Ex', 'Gd', 'TA', 'Fa', 'N_A']


categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=[LandContour,LandSlope,
                                                    ExterQual,ExterCond, BsmtQual, BsmtCond,
                                                    BsmtExposure, BsmtFinType1, BsmtFinType2,
                                                    HeatingQC, KitchenQual, Functional,
                                                    FireplaceQu, GarageFinish, GarageQual, GarageCond,
                                                    PoolQC]), ordinal_cols),

        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols)
    ]
)


categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    categorical_encoder 
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

# Modelling Pipe - 1

## Decisiontree - Creation

In [192]:
from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2)
}

'''param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    'decisiontreeclassifier__max_depth': range(2, 20),
    'decisiontreeclassifier__min_samples_leaf': range(3, 15, 2),
    'decisiontreeclassifier__min_samples_split': range(3, 40, 5),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
    }'''

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

scores = {"dtree" : search.best_score_}

scores

Fitting 5 folds for each of 60 candidates, totalling 300 fits


{'dtree': 0.9289387770074464}

In [193]:
search.predict(X_test)

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [67]:
X_sumbmition = pd.read_csv(r'/Users/Smirnov/Downloads/test-housing-classification.csv')


## Decisiontree - Analysis

### Confusion Matrix

In [194]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Get predictions for the training and testing datasets
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

# Create confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", accuracy_train)
print("Testing Accuracy:", accuracy_test)

print("Confusion Matrix (Training):\n", confusion_matrix_train)
print("Confusion Matrix (Testing):\n", confusion_matrix_test)

Training Accuracy: 0.9657534246575342
Testing Accuracy: 0.9383561643835616
Confusion Matrix (Training):
 [[977  16]
 [ 24 151]]
Confusion Matrix (Testing):
 [[240  10]
 [  8  34]]


### F-Score

In [195]:
from sklearn.metrics import f1_score
print("Training Accuracy:", f1_score(y_train,y_train_pred))
print("Testing Accuracy:", f1_score(y_test,y_test_pred))

Training Accuracy: 0.8830409356725146
Testing Accuracy: 0.7906976744186046


### Kappa

In [196]:
from sklearn.metrics import cohen_kappa_score
print("Training Accuracy:", cohen_kappa_score(y_train,y_train_pred))
print("Testing Accuracy:", cohen_kappa_score(y_test,y_test_pred))

Training Accuracy: 0.8629936130156067
Testing Accuracy: 0.7545760179305192


# Modeling Pipe - 2


In [197]:
# Modeling Pipe - 2

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier


full_pipeline = make_pipeline(preprocessor,
                              KNeighborsClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(3, 20, 2)
}

search2 = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=10,
                      verbose=1)

search2.fit(X_train, y_train)

scores2 = {"knn" : search.best_score_}

scores2


Fitting 10 folds for each of 18 candidates, totalling 180 fits


{'knn': 0.9289387770074464}

## Decisiontree - Analysis KNN

### Confusion Matrix

In [198]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Get predictions for the training and testing datasets
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

# Create confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", accuracy_train)
print("Testing Accuracy:", accuracy_test)

print("Confusion Matrix (Training):\n", confusion_matrix_train)
print("Confusion Matrix (Testing):\n", confusion_matrix_test)

Training Accuracy: 0.9657534246575342
Testing Accuracy: 0.9383561643835616
Confusion Matrix (Training):
 [[977  16]
 [ 24 151]]
Confusion Matrix (Testing):
 [[240  10]
 [  8  34]]


### F-Score

In [199]:
from sklearn.metrics import f1_score
print("Training Accuracy:", f1_score(y_train,y_train_pred))
print("Testing Accuracy:", f1_score(y_test,y_test_pred))

Training Accuracy: 0.8830409356725146
Testing Accuracy: 0.7906976744186046


### Kappa

In [200]:
from sklearn.metrics import cohen_kappa_score
print("Training Accuracy:", cohen_kappa_score(y_train,y_train_pred))
print("Testing Accuracy:", cohen_kappa_score(y_test,y_test_pred))

Training Accuracy: 0.8629936130156067
Testing Accuracy: 0.7545760179305192


+ Logistic Regression
+ Random Forest


# Modelling Pipe - 3

## Support Vector Machine - Creation

In [26]:
# Modelling Pipe - 3
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Assuming you have already defined 'preprocessor', 'X_train', and 'y_train'

full_pipeline = make_pipeline(preprocessor, SVC())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "svc__C": [0.1, 1, 10],  # You can adjust the values of C as needed
    "svc__kernel": ["linear", "rbf"],  # You can also try different kernel functions
}

search_svc = GridSearchCV(full_pipeline, param_grid, cv=10, verbose=1)
search_svc.fit(X_train, y_train)

scores_svc = {"svc": search_svc.best_score_}

print(scores_svc)



Fitting 10 folds for each of 12 candidates, totalling 120 fits


KeyboardInterrupt: 

## Support Vector Machine - Analysis

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Get predictions for the training and testing datasets
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

# Create confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", accuracy_train)
print("Testing Accuracy:", accuracy_test)

print("Confusion Matrix (Training):\n", confusion_matrix_train)
print("Confusion Matrix (Testing):\n", confusion_matrix_test)

Training Accuracy: 0.9537671232876712
Testing Accuracy: 0.9383561643835616
Confusion Matrix (Training):
 [[973  20]
 [ 34 141]]
Confusion Matrix (Testing):
 [[243   7]
 [ 11  31]]


### F-Score

In [None]:
from sklearn.metrics import f1_score
print("Training Accuracy:", f1_score(y_train,y_train_pred))
print("Testing Accuracy:", f1_score(y_test,y_test_pred))

Training Accuracy: 0.8392857142857142
Testing Accuracy: 0.775


### Kappa

In [None]:
from sklearn.metrics import cohen_kappa_score
print("Training Accuracy:", cohen_kappa_score(y_train,y_train_pred))
print("Testing Accuracy:", cohen_kappa_score(y_test,y_test_pred))

Training Accuracy: 0.8686247591800007
Testing Accuracy: 0.7659373821199547
