In [67]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [68]:
train = pd.read_csv("train.csv")

In [69]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72220 entries, 0 to 72219
Data columns (total 34 columns):
Unnamed: 0                      72220 non-null int64
parcelid                        72220 non-null int64
logerror                        72220 non-null float64
transactiondate                 72220 non-null object
transaction_month               72220 non-null int64
airconditioningtypeid           72220 non-null float64
bathroomcnt                     72220 non-null float64
bedroomcnt                      72220 non-null float64
buildingqualitytypeid           72220 non-null float64
calculatedbathnbr               72220 non-null float64
calculatedfinishedsquarefeet    72220 non-null float64
finishedsquarefeet12            72220 non-null float64
fips                            72220 non-null float64
fireplacecnt                    72220 non-null float64
fullbathcnt                     72220 non-null float64
garagecarcnt                    72220 non-null float64
garagetotalsqft   

In [72]:
train_y = list(train["logerror"])
train_z = [1 if y > 0.1647 or y < -0.1278 else 0 for y in train_y]
train_X = train.drop(["logerror", "transactiondate", "Unnamed: 0"], axis = 1)

In [73]:
len(train_X.columns)

31

## Classification

### Baseline: Log Reg with all features

In [74]:
clf = LogisticRegressionCV(cv=10, random_state=0).fit(train_X, train_z)

In [75]:
pred = clf.predict(train_X)

In [76]:
print(classification_report(train_z, pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     65019
           1       0.00      0.00      0.00      7201

   micro avg       0.90      0.90      0.90     72220
   macro avg       0.45      0.50      0.47     72220
weighted avg       0.81      0.90      0.85     72220



  'precision', 'predicted', average, warn_for)


In [77]:
fpr, tpr, thresholds = metrics.roc_curve(train_z, pred)
print(metrics.auc(fpr, tpr))

0.5


### Log Reg with Standardization

In [78]:
scaler = StandardScaler()
scaler.fit(train_X)
train_X1 = scaler.transform(train_X)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [82]:
clf = LogisticRegressionCV(cv=10, random_state=0).fit(train_X1, train_z)



In [83]:
pred1 = clf.predict(train_X1)

In [84]:
print(classification_report(train_z, pred1))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     65019
           1       0.65      0.00      0.01      7201

   micro avg       0.90      0.90      0.90     72220
   macro avg       0.77      0.50      0.48     72220
weighted avg       0.88      0.90      0.85     72220



In [85]:
fpr, tpr, thresholds = metrics.roc_curve(train_z, pred1)
print(metrics.auc(fpr, tpr))

0.5014352848979426


### Log Reg with L1 Penalty

In [None]:
clf = LogisticRegressionCV(cv=10, random_state=0, solver = "liblinear", penalty='l1').fit(train_X1, train_z)

In [88]:
pred2 = clf.predict(train_X1)

array([ 1.29762887e+07,  5.85252008e+00,  1.25801717e+00,  2.27729853e+00,
        3.03075325e+00,  3.90199391e+00,  2.30689152e+00,  1.76398377e+03,
        1.73676790e+03,  6.04883786e+03,  1.18882215e+00,  2.23916886e+00,
        1.81165788e+00,  3.45446230e+02,  2.81896981e+00,  3.40057052e+07,
       -1.18198556e+08,  2.95176407e+04,  2.61823456e+02,  6.04914729e+07,
        2.52658917e+03,  1.47153143e+00,  1.00888285e+00,  1.10968198e+00,
        1.96854500e+03,  1.44039331e+00,  1.79502130e+05,  4.56373534e+05,
        2.77641917e+05,  5.84779611e+03,  6.04911707e+13])

In [None]:
print(classification_report(train_z, pred2))

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(train_z, pred2)
print(metrics.auc(fpr, tpr))

### Log Reg with L2 Penalty

## Regression

### Baseline Lin Reg with all features

### Log Reg with Standardization

### Ridge 

### Lasso