In [2]:
import pandas as pd

In [6]:
# Read the housing table, then split off the 'Expensive' column as the label.
X = pd.read_csv('housing-classification-iter6.csv') # features (still includes the label until the pop below)
y = X.pop('Expensive') # pop() removes the column from X and returns it as the target

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31436,
)

#### We'll create a simple working baseline by predicting that all the houses are cheap

In [7]:
# Class counts of the training labels (how many rows carry each label value).
y_train.value_counts()

0    989
1    179
Name: Expensive, dtype: int64

In [8]:
# Number of rows in the training labels.
len(y_train) 

1168

#### Predictions for the train set will be 1168 zeros

In [10]:
# Baseline: predict 0 for every training row. The Series is sized from y_train
# itself so it always matches the split, instead of hard-coding the row count.
pred_pessimistic_train = pd.Series(0, index=y_train.index)

###### To find out how good those predictions are, we will compare them with the true values and compute the percentage of correctly predicted labels (the accuracy)

In [14]:
from sklearn.metrics import accuracy_score

# Fraction of training rows where the all-zeros baseline matches the true label.
train_accuracy = accuracy_score(y_true=y_train, y_pred=pred_pessimistic_train)

round(train_accuracy, 2)  # displayed rounded to 2 decimal places

0.85

#### finding out accuracy on test dataset

In [16]:
# Number of rows in the test labels.
len(y_test)

292

In [17]:
# Same all-zeros baseline for the test rows, sized from y_test rather than a
# hard-coded row count so it stays correct if the data or split changes.
pred_pessimistic_test = pd.Series(0, index=y_test.index)

In [18]:
# Fraction of test rows where the all-zeros baseline matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=pred_pessimistic_test)

round(test_accuracy, 2)  # displayed rounded to 2 decimal places

0.87

#### Creating another intuition based model using another dataset

In [57]:
import pandas as pd

# Build a direct-download link from the Google Drive share link: the file id is
# the second-to-last '/'-separated segment of the share URL.
url = "https://drive.google.com/file/d/1g3uhw_y3tboRm2eYDPfUzXXsw8IOYDCy/view?usp=sharing"
path = f"https://drive.google.com/uc?export=download&id={url.split('/')[-2]}"

# Read the CSV from that link and list its columns.
data = pd.read_csv(path)
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

#### Creating a new column based on sex, assigning '1' to female and '0' to male

In [58]:
def new_column(row):
    """Return the integer code for a row's 'Sex' value.

    'female' maps to 1 and 'male' maps to 0; any other value yields None.
    `row` only needs to support `row['Sex']` lookup.
    """
    for label, code in (('female', 1), ('male', 0)):
        if row['Sex'] == label:
            return code
    return None


In [59]:
# Encode 'Sex' as an integer column (female -> 1, male -> 0) with a vectorized
# lookup instead of a row-by-row apply; values outside the mapping become NaN.
data['Sex_int'] = data['Sex'].map({'female': 1, 'male': 0})

In [60]:
# Preview the first rows to check the new 'Sex_int' column against 'Sex'.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_int
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [61]:
# 'Sex_int' is the target and is derived directly from 'Sex', so 'Sex' is
# dropped too; leaving it in X would leak the label into the features.
X = data.drop(columns=['PassengerId', 'Ticket', 'Name', 'Sex'])
y = X.pop('Sex_int')  # pop() removes the target column from X and returns it

In [62]:
#X.info()

##### Data splitting

In [67]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3736,
)

In [68]:
# Class counts of the training labels (0 = male, 1 = female per the encoding above).
y_train.value_counts()

0    462
1    250
Name: Sex_int, dtype: int64

In [69]:
# Number of rows in the training labels.
len(y_train)

712

#### Predicting all the passengers are male and checking accuracy

In [70]:
# Baseline: predict 0 (male) for every training row. The Series is sized from
# y_train itself so it always matches the split, instead of hard-coding 712.
pred_male_train = pd.Series(0, index=y_train.index)

In [71]:

# Accuracy of the all-male (all-zeros) baseline on the training labels.
# The predictions live in pred_male_train, the only baseline built for this split.
train_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=pred_male_train,
)

round(train_accuracy, 2)  # displayed rounded to 2 decimal places

0.65

In [72]:
# Number of rows in the test labels.
len(y_test)

179

In [73]:
# Same all-male (all-zeros) baseline for the test rows, sized from y_test
# rather than a hard-coded row count.
pred_male_test = pd.Series(0, index=y_test.index)

In [74]:
# Accuracy of the all-male (all-zeros) baseline on the test labels.
# The predictions live in pred_male_test, the only baseline built for the test rows.
test_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=pred_male_test,
)

round(test_accuracy, 2)  # displayed rounded to 2 decimal places

0.64

#### Conclusion:
The model performs consistently on the train and test sets, which indicates low variance. However, its overall performance is low (high bias), which means it is missing important connections between the features and the target (underfitting).